jimnoneill committed
Commit 9bd985f · verified · 1 Parent(s): e14e4bc

Upload bsg_training_data_gen.py with huggingface_hub

Files changed (1)
  1. bsg_training_data_gen.py +663 -0
bsg_training_data_gen.py ADDED
@@ -0,0 +1,663 @@
#!/usr/bin/env python3
"""
Enhanced DeepSeek Training Data Generator for Scientific Summarization
Generates high-quality training data with integrated cleanup and row slicing
"""

import requests
import json
import pandas as pd
import time
import csv
import os
import re
import hashlib
from pathlib import Path
from typing import List, Tuple, Dict, Optional
from datetime import datetime, timedelta

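# Expected I/O schema (as read and written by the code below):
#   Input TSV columns:  Index, ConcatenatedAbstracts, TopKeywords
#   Output TSV columns: OriginalIndex, SourceRow, AbstractSummary, ShortSummary,
#                       Title, OriginalKeywords, OriginalText
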
class EnhancedDeepSeekTrainingDataGenerator:
    """Generate training data using DeepSeek API with integrated cleanup and row slicing"""

    def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com/v1"):
        """
        Initialize DeepSeek API client

        Args:
            api_key: Your DeepSeek API key
            base_url: DeepSeek API base URL
        """
        self.api_key = api_key
        self.base_url = base_url
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        self.start_time = None
        self.processed_count = 0

    def clean_deepseek_output(self, text: str) -> str:
        """
        Clean up DeepSeek output to remove formatting artifacts

        Args:
            text: Raw text from DeepSeek API

        Returns:
            Cleaned text without formatting artifacts
        """
        if not text or pd.isna(text):
            return text

        text = str(text).strip()

        # Remove numbered prefixes (1., 2., 3.)
        text = re.sub(r'^\d+\.\s*', '', text)

        # Remove component labels
        text = re.sub(r'^(ABSTRACT[_\s]*SUMMARY:?|SHORT[_\s]*SUMMARY:?|TITLE:?)', '', text, flags=re.IGNORECASE)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove trailing colons or dashes
        text = re.sub(r'[:\-]+$', '', text)

        # Remove markdown formatting
        text = re.sub(r'\*+', '', text)

        # Remove quotes that sometimes wrap the entire response
        text = re.sub(r'^["\']+|["\']+$', '', text)

        return text.strip()

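    # Example (illustrative): '2. TITLE: **Proteasome Function** -' is cleaned to
    # 'Proteasome Function' — the numbered prefix, component label, markdown
    # asterisks, and trailing dash are all stripped by the substitutions above.
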
    def create_few_shot_prompt(self, concatenated_abstracts: str, keywords: str) -> str:
        """
        Create optimized few-shot prompt for DeepSeek with clean output formatting
        """
        prompt = (
            "You are an expert scientific summarization assistant. Generate exactly three components separated by '|||':\n"
            "1. ABSTRACT_SUMMARY: A detailed 4-6 sentence summary highlighting key findings, methods, and implications\n"
            "2. SHORT_SUMMARY: A concise 2-3 sentence summary capturing the core essence\n"
            "3. TITLE: A sophisticated, detailed title reflecting the research scope and methods\n\n"
            "CRITICAL: Respond ONLY with the three components separated by '|||'. Do not include conversational text, explanations, or markdown formatting.\n\n"
            "Format: ABSTRACT_SUMMARY|||SHORT_SUMMARY|||TITLE\n\n"
            "Focus on:\n"
            "- Specific computational methods, techniques, and approaches\n"
            "- Key biological processes and mechanisms\n"
            "- Research methodologies and experimental designs\n"
            "- Clinical or therapeutic implications\n"
            "- Be specific and detailed; avoid generic terms\n\n"
        )

        # Few-shot Example 1 - Immunology/Antimicrobial Research
        example1_text = (
            "Studies investigated mammary gland candidiasis models using immunocompetent and immunodeficient mice "
            "treated with amphotericin B. Complement activation analysis revealed tissue inflammation patterns. "
            "Research on antigen processing examined proteasome mutants lacking specific protease activities for "
            "peptide generation. Novel ankyrin-repeat family member MAIL was identified with nuclear localization "
            "potentiating IL-6 expression. Antimicrobial peptides pseudins 1-4 were isolated from frog skin showing "
            "activity against various pathogens."
        )
        example1_keywords = "MAIL; proteasome; antimicrobial peptides; complement activation; mammary glands"

        prompt += (
            f"INPUT: {example1_text}\n"
            f"KEYWORDS: {example1_keywords}\n"
            "OUTPUT: "
            "Comprehensive investigation of innate immune responses utilizing murine mammary gland candidiasis models "
            "with complement activation analysis and proteasome-mediated antigen processing pathways, complemented by "
            "characterization of novel antimicrobial peptides and nuclear transcription modulators. Research demonstrates "
            "the critical role of specific protease activities in MHC class I-restricted peptide generation while identifying "
            "MAIL as a nuclear factor potentiating cytokine expression and pseudins as promising therapeutic antimicrobials. "
            "These findings advance understanding of immunopathological mechanisms and provide validated experimental models "
            "for antifungal compound evaluation.|||"
            "Studies utilized murine models to investigate immune responses in candidiasis while characterizing novel "
            "antimicrobial compounds and antigen processing mechanisms. Research identified critical protease activities "
            "and nuclear factors regulating immune responses.|||"
            "Integrated Immunological Modeling and Antimicrobial Peptide Discovery: Proteasome-Mediated Antigen Processing "
            "and Complement-Dependent Host Defense Mechanisms\n\n"
        )

        # Few-shot Example 2 - Biotechnology/Tissue Engineering
        example2_text = (
            "Biotechnology development focused on hematopoietic stem cell expansion using cytokine combinations. "
            "Temperature-responsive polymers enabled designed cell sheet engineering for tissue applications. "
            "Vascular anastomosis techniques using titanium clips reduced neointimal hyperplasia. Endothelial cell "
            "seeding protocols for vascular grafts were optimized. Gene transfer therapies for therapeutic angiogenesis "
            "showed clinical promise in cardiovascular applications."
        )
        example2_keywords = "biotechnology; tissue engineering; vascular grafts; stem cells; angiogenesis"

        prompt += (
            f"INPUT: {example2_text}\n"
            f"KEYWORDS: {example2_keywords}\n"
            "OUTPUT: "
            "Advanced biotechnology approaches combining cytokine-mediated hematopoietic stem cell expansion protocols "
            "with temperature-responsive polymer systems for precision cell sheet engineering and vascular reconstruction. "
            "Integration of titanium clip anastomosis techniques and optimized endothelial cell seeding methodologies "
            "demonstrates significant reduction in neointimal hyperplasia while enhancing graft patency. Gene transfer "
            "strategies for therapeutic angiogenesis represent promising clinical interventions for cardiovascular disease "
            "treatment, establishing proof-of-concept for growth factor-mediated collateral vessel development.|||"
            "Research combines stem cell expansion technologies with polymer-based cell engineering and vascular "
            "reconstruction techniques. Gene therapy approaches show clinical promise for treating cardiovascular disease "
            "through enhanced angiogenesis.|||"
            "Multiscale Biotechnology Integration: Cytokine-Mediated Stem Cell Engineering and Polymer-Assisted "
            "Vascular Reconstruction with Gene Transfer-Enhanced Therapeutic Angiogenesis\n\n"
        )

        # User query
        prompt += (
            f"INPUT: {concatenated_abstracts}\n"
            f"KEYWORDS: {keywords}\n"
            "OUTPUT:"
        )

        return prompt

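    # The prompt above instructs the model to answer with a single line of the form
    #   <abstract summary>|||<short summary>|||<title>
    # parse_response() further below handles that format first and falls back to
    # labelled or numbered sections when the '|||' separator is missing.
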
    def call_deepseek_api(self, prompt: str, max_retries: int = 3) -> str:
        """
        Call DeepSeek API with enhanced retry logic and timeout handling
        """
        for attempt in range(max_retries):
            try:
                payload = {
                    "model": "deepseek-chat",  # DeepSeek-V3 instruct model
                    "messages": [
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "max_tokens": 800,
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "stream": False
                }

                # Enhanced timeout handling
                response = requests.post(
                    f"{self.base_url}/chat/completions",
                    headers=self.headers,
                    json=payload,
                    timeout=(10, 60)  # (connection timeout, read timeout)
                )

                if response.status_code == 200:
                    result = response.json()
                    return result['choices'][0]['message']['content'].strip()
                elif response.status_code == 429:  # Rate limit
                    wait_time = min(60, 2 ** attempt * 30)
                    print(f"Rate limit hit. Waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                elif response.status_code >= 500:  # Server errors
                    wait_time = min(30, 2 ** attempt * 5)
                    print(f"Server error {response.status_code}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"API Error {response.status_code}: {response.text}")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)
                        continue
                    else:
                        return ""

            except requests.exceptions.Timeout as e:
                print(f"Timeout error on attempt {attempt + 1}: {e}")
                if attempt < max_retries - 1:
                    wait_time = min(30, 2 ** attempt * 10)
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print("Max retries exceeded due to timeout")
                    return ""
            except requests.exceptions.ConnectionError as e:
                print(f"Connection error on attempt {attempt + 1}: {e}")
                if attempt < max_retries - 1:
                    wait_time = min(30, 2 ** attempt * 10)
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print("Max retries exceeded due to connection error")
                    return ""
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                else:
                    return ""

        return ""

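    # Retry/backoff behaviour of the method above (for reference, with the default
    # max_retries=3): rate-limit (429) waits are 30s, 60s, 60s; server-error (5xx)
    # waits are 5s, 10s, 20s; timeouts and connection errors wait 10s and 20s
    # before the final attempt. An empty string is returned once retries are
    # exhausted.
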
    def parse_response(self, response: str) -> Tuple[str, str, str]:
        """
        Enhanced parsing for DeepSeek responses with integrated cleanup
        """
        if not response:
            return "Failed to generate", "Failed to generate", "Failed to generate"

        # Clean the response first
        response = response.strip()

        # Remove common DeepSeek conversational elements
        conversational_starters = [
            "Here are the structured outputs",
            "Here's the structured output",
            "Based on the provided keywords",
            "Let me know if you'd like",
            "Would you like me to",
            "I can help you",
            "Here's my analysis"
        ]

        for starter in conversational_starters:
            if response.startswith(starter):
                # Find the actual content after conversational part
                lines = response.split('\n')
                content_lines = []
                found_content = False
                for line in lines:
                    if any(marker in line.upper() for marker in ['ABSTRACT_SUMMARY:', 'ABSTRACT:', '1.', '**1.']):
                        found_content = True
                    if found_content:
                        content_lines.append(line)
                if content_lines:
                    response = '\n'.join(content_lines)
                break

        # Remove markdown formatting
        response = re.sub(r'\*\*(\d+\.)\*\*', r'\1', response)  # **1.** -> 1.
        response = re.sub(r'\*\*(.*?)\*\*', r'\1', response)  # **text** -> text
        response = re.sub(r'^\s*---\s*$', '', response, flags=re.MULTILINE)  # Remove --- lines

        abstract_summary = ""
        short_summary = ""
        title = ""

        try:
            # Method 1: Look for standard ||| separator
            if '|||' in response:
                parts = [part.strip() for part in response.split('|||')]
                if len(parts) >= 3:
                    abstract_summary = parts[0]
                    short_summary = parts[1]
                    title = parts[2]
                elif len(parts) == 2:
                    abstract_summary = parts[0]
                    title = parts[1]
                    # Generate short summary from abstract
                    sentences = re.split(r'[.!?]+', abstract_summary)
                    short_summary = '. '.join(sentences[:2]).strip() + '.'

            # Method 2: Look for numbered sections (DeepSeek's preferred format)
            elif "1. ABSTRACT_SUMMARY:" in response or "1.ABSTRACT_SUMMARY:" in response:
                # Extract by numbered sections
                abstract_match = re.search(r'1\.?\s*ABSTRACT_SUMMARY:\s*(.*?)(?=2\.|3\.|$)', response, re.DOTALL | re.IGNORECASE)
                short_match = re.search(r'2\.?\s*SHORT_SUMMARY:\s*(.*?)(?=3\.|$)', response, re.DOTALL | re.IGNORECASE)
                title_match = re.search(r'3\.?\s*TITLE:\s*(.*?)(?=\n\n|$)', response, re.DOTALL | re.IGNORECASE)

                if abstract_match:
                    abstract_summary = abstract_match.group(1).strip()
                if short_match:
                    short_summary = short_match.group(1).strip()
                if title_match:
                    title = title_match.group(1).strip()

            # Method 3: Look for any mention of the three components
            else:
                # Try to find ABSTRACT_SUMMARY, SHORT_SUMMARY, TITLE anywhere
                abstract_match = re.search(r'ABSTRACT[_\s]*SUMMARY:?\s*(.*?)(?=SHORT|TITLE|$)', response, re.DOTALL | re.IGNORECASE)
                short_match = re.search(r'SHORT[_\s]*SUMMARY:?\s*(.*?)(?=TITLE|$)', response, re.DOTALL | re.IGNORECASE)
                title_match = re.search(r'TITLE:?\s*(.*?)(?=\n|$)', response, re.DOTALL | re.IGNORECASE)

                if abstract_match:
                    abstract_summary = abstract_match.group(1).strip()
                if short_match:
                    short_summary = short_match.group(1).strip()
                if title_match:
                    title = title_match.group(1).strip()

        except Exception as e:
            print(f"Error in enhanced parsing: {e}")

        # Fallback: if still no content, try to extract from the full response
        if not abstract_summary and not short_summary and not title:
            # Split response into sentences and distribute intelligently
            sentences = re.split(r'[.!?]+', response)
            sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]

            if len(sentences) >= 6:
                abstract_summary = '. '.join(sentences[:4]) + '.'
                short_summary = '. '.join(sentences[4:6]) + '.'
                title = sentences[6] if len(sentences) > 6 else "Advanced Scientific Research Analysis"
            elif len(sentences) >= 3:
                abstract_summary = '. '.join(sentences[:2]) + '.'
                short_summary = sentences[2] + '.'
                title = sentences[-1] if len(sentences) > 3 else "Scientific Research Study"
            elif len(sentences) >= 1:
                abstract_summary = sentences[0]
                short_summary = sentences[0][:100] + "..." if len(sentences[0]) > 100 else sentences[0]
                title = "Scientific Analysis"
            else:
                abstract_summary = response[:200] + "..." if len(response) > 200 else response
                short_summary = response[:100] + "..." if len(response) > 100 else response
                title = "Research Summary"

        # Apply integrated cleanup to all components
        abstract_summary = self.clean_deepseek_output(abstract_summary)
        short_summary = self.clean_deepseek_output(short_summary)
        title = self.clean_deepseek_output(title)

        # Ensure reasonable lengths after cleanup
        if len(abstract_summary.split()) > 150:
            abstract_summary = ' '.join(abstract_summary.split()[:150]) + "..."

        if len(short_summary.split()) > 75:
            short_summary = ' '.join(short_summary.split()[:75]) + "..."

        if len(title.split()) > 25:
            title = ' '.join(title.split()[:25]) + "..."

        # Final validation - ensure we have actual content
        if not abstract_summary or abstract_summary in ["", "Content not extracted", "Content not properly extracted"]:
            abstract_summary = "Content generation failed"
        if not short_summary or short_summary in ["", "Content not extracted", "Content not properly extracted"]:
            short_summary = "Content generation failed"
        if not title or title in ["", "Content not extracted", "Content not properly extracted"]:
            title = "Content generation failed"

        return abstract_summary, short_summary, title

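    # Parsing order used above (for reference): '|||'-separated output, then
    # numbered '1. ABSTRACT_SUMMARY:' sections, then bare labels, and finally a
    # sentence-splitting fallback; every component is passed through
    # clean_deepseek_output() and length-capped at 150/75/25 words.
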
    def load_checkpoint(self, checkpoint_file: str) -> Tuple[List[Dict], set]:
        """
        Load existing checkpoint data and return processed data + processed indices
        """
        if os.path.exists(checkpoint_file):
            try:
                df = pd.read_csv(checkpoint_file, sep='\t')
                processed_data = df.to_dict('records')
                processed_indices = set(df['OriginalIndex'].astype(str))
                print(f"✓ Loaded checkpoint with {len(processed_data)} processed entries")
                return processed_data, processed_indices
            except Exception as e:
                print(f"Error loading checkpoint: {e}")
                return [], set()
        return [], set()

    def save_checkpoint(self, output_data: List[Dict], checkpoint_file: str):
        """
        Save current progress to checkpoint file
        """
        try:
            df = pd.DataFrame(output_data)
            df.to_csv(checkpoint_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)
            print(f"💾 Checkpoint saved: {len(output_data)} entries")
        except Exception as e:
            print(f"Error saving checkpoint: {e}")

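    # Resume behaviour: process_data_file() writes <output>_checkpoint.tsv every
    # `save_every` rows; rerunning the script with the same output file reloads
    # that checkpoint and skips any OriginalIndex already recorded in it.
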
    def estimate_time_remaining(self, current_progress: int, total_rows: int) -> str:
        """
        Estimate time remaining based on current progress
        """
        if self.start_time is None or current_progress == 0:
            return "Calculating..."

        elapsed = datetime.now() - self.start_time
        elapsed_seconds = elapsed.total_seconds()

        if current_progress > 0:
            avg_time_per_row = elapsed_seconds / current_progress
            remaining_rows = total_rows - current_progress
            remaining_seconds = remaining_rows * avg_time_per_row
            remaining_time = timedelta(seconds=int(remaining_seconds))
            return str(remaining_time)

        return "Calculating..."

    def process_data_file(self, input_file: str, output_file: str, delay: float = 1.0,
                          save_every: int = 50, debug_first_n: int = 3,
                          start_row: int = 0, end_row: Optional[int] = None):
        """
        Process the input TSV file and generate training data with checkpointing and row slicing

        Args:
            input_file: Path to input TSV file
            output_file: Path to output TSV file
            delay: Delay between API calls to respect rate limits
            save_every: Save checkpoint every N processed rows
            debug_first_n: Print full input/output for first N generations for QC
            start_row: Starting row index (0-based)
            end_row: Ending row index (0-based, None for all remaining rows)
        """
        self.start_time = datetime.now()

        # Setup checkpoint file
        checkpoint_file = output_file.replace('.tsv', '_checkpoint.tsv')

        # Load existing checkpoint
        output_data, processed_indices = self.load_checkpoint(checkpoint_file)

        # Read input data
        try:
            df = pd.read_csv(input_file, sep='\t')
        except Exception as e:
            print(f"Error reading input file: {e}")
            return

        # Apply row slicing
        original_length = len(df)
        if end_row is None:
            end_row = original_length
        else:
            end_row = min(end_row, original_length)

        if start_row >= original_length:
            print(f"❌ Error: start_row {start_row} is >= total rows {original_length}")
            return

        df_slice = df.iloc[start_row:end_row].copy()
        total_rows = len(df_slice)

        initial_processed = len(output_data)

        print(f"📊 Processing Overview:")
        print(f"   Input file total rows: {original_length}")
        print(f"   Processing slice: rows {start_row} to {end_row-1}")
        print(f"   Rows in slice: {total_rows}")
        print(f"   Already processed: {initial_processed}")
        print(f"   Remaining: {total_rows - initial_processed}")
        print(f"   Checkpoint saves every {save_every} rows")
        print(f"   Estimated cost: ~${total_rows * 0.0014:.2f}")
        print(f"   Estimated time: ~{total_rows * 1.5 / 3600:.1f} hours")
        print(f"   Debug mode: First {debug_first_n} generations will show detailed output")
        print("-" * 80)

        successful_processed = 0
        failed_processed = 0
        generations_count = 0
        processed_this_run = 0

        for index, row in df_slice.iterrows():
            original_index = str(row.get('Index', index))

            # Skip if already processed
            if original_index in processed_indices:
                continue

            concatenated_abstracts = str(row.get('ConcatenatedAbstracts', ''))
            keywords = str(row.get('TopKeywords', ''))

            # Skip if no content
            if not concatenated_abstracts or concatenated_abstracts == 'nan':
                print(f"[{processed_this_run + 1}/{total_rows}] Skipping empty cluster {original_index}")
                continue

            # Use the DataFrame label as the source row number; with the default
            # RangeIndex this stays correct even when earlier rows are skipped,
            # whereas a counter offset by start_row drifts after skips.
            actual_row_num = index
            print(f"[{processed_this_run + 1}/{total_rows}] Processing row {actual_row_num} (cluster {original_index})...")

            # Create prompt
            prompt = self.create_few_shot_prompt(concatenated_abstracts, keywords)

            # DEBUG: Print detailed input/output for first few generations
            if generations_count < debug_first_n:
                print("\n" + "="*80)
                print(f"🔍 DEBUG OUTPUT FOR GENERATION #{generations_count + 1}")
                print("="*80)
                print(f"📋 CLUSTER INDEX: {original_index} (Row {actual_row_num})")
                print(f"🔑 KEYWORDS: {keywords}")
                print(f"📄 ABSTRACTS (first 500 chars): {concatenated_abstracts[:500]}...")
                print("\n" + "-"*60)
                print("📤 FULL PROMPT BEING SENT TO API:")
                print("-"*60)
                print(prompt)
                print("-"*60)

            # Call API
            response = self.call_deepseek_api(prompt)

            # Continue debug printing
            if generations_count < debug_first_n:
                print("📥 RAW API RESPONSE:")
                print("-"*60)
                print(response if response else "❌ NO RESPONSE / ERROR")
                print("-"*60)

            if response:
                # Parse response (now includes integrated cleanup)
                abstract_summary, short_summary, title = self.parse_response(response)

                # Continue debug printing
                if generations_count < debug_first_n:
                    print("🔧 PARSED & CLEANED COMPONENTS:")
                    print("-"*60)
                    print(f"📝 ABSTRACT SUMMARY:\n{abstract_summary}\n")
                    print(f"⚡ SHORT SUMMARY:\n{short_summary}\n")
                    print(f"🏷️ TITLE:\n{title}\n")
                    print("="*80 + "\n")

                # Add to output data
                output_data.append({
                    'OriginalIndex': original_index,
                    'SourceRow': actual_row_num,  # Track original row number
                    'AbstractSummary': abstract_summary,
                    'ShortSummary': short_summary,
                    'Title': title,
                    'OriginalKeywords': keywords,
                    'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
                })

                successful_processed += 1
                print(f"✓ Success! ({successful_processed} total successes)")
            else:
                if generations_count < debug_first_n:
                    print("❌ FAILED TO PARSE OR GET RESPONSE")
                    print("="*80 + "\n")

                print(f"✗ Failed to process cluster {original_index}")
                # Add empty entry to maintain tracking
                output_data.append({
                    'OriginalIndex': original_index,
                    'SourceRow': actual_row_num,
                    'AbstractSummary': 'Failed to generate',
                    'ShortSummary': 'Failed to generate',
                    'Title': 'Failed to generate',
                    'OriginalKeywords': keywords,
                    'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
                })
                failed_processed += 1

            generations_count += 1
            processed_this_run += 1

            # Update processed set
            processed_indices.add(original_index)

            # Save checkpoint periodically
            if len(output_data) % save_every == 0:
                self.save_checkpoint(output_data, checkpoint_file)
                time_remaining = self.estimate_time_remaining(processed_this_run, total_rows)
                print(f"📁 Checkpoint saved! Progress: {processed_this_run}/{total_rows} | ETA: {time_remaining}")

            # Rate limiting
            time.sleep(delay)

        # Final save
        try:
            output_df = pd.DataFrame(output_data)
            output_df.to_csv(output_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)

            print(f"\n🎉 GENERATION COMPLETED!")
            print(f"✓ Successfully processed: {successful_processed}")
            print(f"✗ Failed: {failed_processed}")
            print(f"📄 Total entries saved: {len(output_data)}")
            print(f"💾 Final output saved to: {output_file}")
            print(f"💰 Estimated cost: ~${successful_processed * 0.0014:.2f}")
            print(f"📊 Processed rows {start_row} to {end_row-1} from source file")

            # Clean up checkpoint file
            if os.path.exists(checkpoint_file):
                os.remove(checkpoint_file)
                print(f"🗑️ Checkpoint file cleaned up")

        except Exception as e:
            print(f"Error saving final output file: {e}")
            print(f"Your data is still safe in checkpoint: {checkpoint_file}")

def main():
    """
    Main function to run the training data generation with row slicing
    """
    # Configuration for processing all 30,000 examples
    API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")  # Set your DeepSeek API key in the environment; do not hard-code secrets
    INPUT_FILE = "/home/joneill/pubmed_clustered_data_sciner.tsv"  # Your input TSV file

    # Row slicing configuration - MODIFY THESE FOR YOUR BATCHES
    START_ROW = 0        # Starting row (0-based)
    END_ROW = 30000      # Ending row (None for all rows, or specify number)
    BATCH_NAME = "full"  # Used in output filename

    # You can also run in batches, e.g.:
    # Batch 1: START_ROW = 0, END_ROW = 5000, BATCH_NAME = "batch1"
    # Batch 2: START_ROW = 5000, END_ROW = 10000, BATCH_NAME = "batch2"
    # Batch 3: START_ROW = 10000, END_ROW = 15000, BATCH_NAME = "batch3"
    # etc.

    OUTPUT_FILE = f"bsg_training_data_{BATCH_NAME}.tsv"  # Output file for training data
    DELAY_BETWEEN_CALLS = 1.0  # Seconds between API calls
    SAVE_EVERY = 50            # Save checkpoint every N rows
    DEBUG_FIRST_N = 3          # Print full input/output for first N generations for QC

    # Initialize generator
    generator = EnhancedDeepSeekTrainingDataGenerator(API_KEY)

    # Calculate batch info
    total_rows_to_process = END_ROW - START_ROW if END_ROW else "all remaining"

    # Process data
    print("🚀 Starting Enhanced DeepSeek Training Data Generation")
    print("="*80)
    print(f"🎯 Processing: {total_rows_to_process} rows (from row {START_ROW} to {END_ROW-1 if END_ROW else 'end'})")
    print(f"💰 Estimated cost: ~${(END_ROW - START_ROW if END_ROW else 30000) * 0.0014:.2f}")
    print(f"⏱️ Estimated time: ~{(END_ROW - START_ROW if END_ROW else 30000) * 1.5 / 3600:.1f} hours")
    print(f"🔍 Debug mode: Will show detailed input/output for first {DEBUG_FIRST_N} generations")
    print(f"💾 Automatic checkpointing every {SAVE_EVERY} rows")
    print(f"🔄 Auto-resume: Restart script to continue from checkpoint")
    print(f"🧹 Integrated cleanup: All outputs automatically cleaned of formatting artifacts")
    print("="*80)

    generator.process_data_file(
        INPUT_FILE, OUTPUT_FILE, DELAY_BETWEEN_CALLS, SAVE_EVERY, DEBUG_FIRST_N,
        START_ROW, END_ROW
    )

    print("\n🎉 Training data generation completed!")
    print(f"📁 Output file: {OUTPUT_FILE}")
    print("✨ Data is automatically cleaned and ready for training! 🧪")

if __name__ == "__main__":
    main()