#!/usr/bin/env python3
"""
Enhanced DeepSeek Training Data Generator for Scientific Summarization
Generates high-quality training data with integrated cleanup and row slicing
"""

import requests
import json
import pandas as pd
import time
import csv
import os
import re
from typing import List, Tuple, Dict, Optional
from datetime import datetime, timedelta

class EnhancedDeepSeekTrainingDataGenerator:
    """Generate training data using DeepSeek API with integrated cleanup and row slicing"""

    def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com/v1"):
        """
        Initialize DeepSeek API client

        Args:
            api_key: Your DeepSeek API key
            base_url: DeepSeek API base URL
        """
        self.api_key = api_key
        self.base_url = base_url
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        self.start_time = None
        self.processed_count = 0

    def clean_deepseek_output(self, text: str) -> str:
        """
        Clean up DeepSeek output to remove formatting artifacts

        Args:
            text: Raw text from DeepSeek API

        Returns:
            Cleaned text without formatting artifacts
        """
        if not text or pd.isna(text):
            return text

        text = str(text).strip()

        # Remove numbered prefixes (1., 2., 3.)
        text = re.sub(r'^\d+\.\s*', '', text)

        # Remove component labels
        text = re.sub(r'^(ABSTRACT[_\s]*SUMMARY:?|SHORT[_\s]*SUMMARY:?|TITLE:?)', '', text, flags=re.IGNORECASE)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove trailing colons or dashes
        text = re.sub(r'[:\-]+$', '', text)

        # Remove markdown formatting
        text = re.sub(r'\*+', '', text)

        # Remove quotes that sometimes wrap the entire response
        text = re.sub(r'^["\']+|["\']+$', '', text)

        return text.strip()
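
    # Illustrative example of the cleanup above (a sketch; not executed at runtime):
    #   clean_deepseek_output("1. ABSTRACT_SUMMARY: **Result**")  ->  "Result"
    #   (numbered prefix, component label, and markdown asterisks are all stripped)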

    def create_few_shot_prompt(self, concatenated_abstracts: str, keywords: str) -> str:
        """
        Create optimized few-shot prompt for DeepSeek with clean output formatting
        """
        prompt = (
            "You are an expert scientific summarization assistant. Generate exactly three components separated by '|||':\n"
            "1. ABSTRACT_SUMMARY: A detailed 4-6 sentence summary highlighting key findings, methods, and implications\n"
            "2. SHORT_SUMMARY: A concise 2-3 sentence summary capturing the core essence\n"
            "3. TITLE: A sophisticated, detailed title reflecting the research scope and methods\n\n"
            "CRITICAL: Respond ONLY with the three components separated by '|||'. Do not include conversational text, explanations, or markdown formatting.\n\n"
            "Format: ABSTRACT_SUMMARY|||SHORT_SUMMARY|||TITLE\n\n"
            "Focus on:\n"
            "- Specific computational methods, techniques, and approaches\n"
            "- Key biological processes and mechanisms\n"
            "- Research methodologies and experimental designs\n"
            "- Clinical or therapeutic implications\n"
            "- Be specific and detailed; avoid generic terms\n\n"
        )

        # Few-shot Example 1 - Immunology/Antimicrobial Research
        example1_text = (
            "Studies investigated mammary gland candidiasis models using immunocompetent and immunodeficient mice "
            "treated with amphotericin B. Complement activation analysis revealed tissue inflammation patterns. "
            "Research on antigen processing examined proteasome mutants lacking specific protease activities for "
            "peptide generation. Novel ankyrin-repeat family member MAIL was identified with nuclear localization "
            "potentiating IL-6 expression. Antimicrobial peptides pseudins 1-4 were isolated from frog skin showing "
            "activity against various pathogens."
        )
        example1_keywords = "MAIL; proteasome; antimicrobial peptides; complement activation; mammary glands"

        prompt += (
            f"INPUT: {example1_text}\n"
            f"KEYWORDS: {example1_keywords}\n"
            "OUTPUT: "
            "Comprehensive investigation of innate immune responses utilizing murine mammary gland candidiasis models "
            "with complement activation analysis and proteasome-mediated antigen processing pathways, complemented by "
            "characterization of novel antimicrobial peptides and nuclear transcription modulators. Research demonstrates "
            "the critical role of specific protease activities in MHC class I-restricted peptide generation while identifying "
            "MAIL as a nuclear factor potentiating cytokine expression and pseudins as promising therapeutic antimicrobials. "
            "These findings advance understanding of immunopathological mechanisms and provide validated experimental models "
            "for antifungal compound evaluation.|||"
            "Studies utilized murine models to investigate immune responses in candidiasis while characterizing novel "
            "antimicrobial compounds and antigen processing mechanisms. Research identified critical protease activities "
            "and nuclear factors regulating immune responses.|||"
            "Integrated Immunological Modeling and Antimicrobial Peptide Discovery: Proteasome-Mediated Antigen Processing "
            "and Complement-Dependent Host Defense Mechanisms\n\n"
        )

        # Few-shot Example 2 - Biotechnology/Tissue Engineering
        example2_text = (
            "Biotechnology development focused on hematopoietic stem cell expansion using cytokine combinations. "
            "Temperature-responsive polymers enabled designed cell sheet engineering for tissue applications. "
            "Vascular anastomosis techniques using titanium clips reduced neointimal hyperplasia. Endothelial cell "
            "seeding protocols for vascular grafts were optimized. Gene transfer therapies for therapeutic angiogenesis "
            "showed clinical promise in cardiovascular applications."
        )
        example2_keywords = "biotechnology; tissue engineering; vascular grafts; stem cells; angiogenesis"

        prompt += (
            f"INPUT: {example2_text}\n"
            f"KEYWORDS: {example2_keywords}\n"
            "OUTPUT: "
            "Advanced biotechnology approaches combining cytokine-mediated hematopoietic stem cell expansion protocols "
            "with temperature-responsive polymer systems for precision cell sheet engineering and vascular reconstruction. "
            "Integration of titanium clip anastomosis techniques and optimized endothelial cell seeding methodologies "
            "demonstrates significant reduction in neointimal hyperplasia while enhancing graft patency. Gene transfer "
            "strategies for therapeutic angiogenesis represent promising clinical interventions for cardiovascular disease "
            "treatment, establishing proof-of-concept for growth factor-mediated collateral vessel development.|||"
            "Research combines stem cell expansion technologies with polymer-based cell engineering and vascular "
            "reconstruction techniques. Gene therapy approaches show clinical promise for treating cardiovascular disease "
            "through enhanced angiogenesis.|||"
            "Multiscale Biotechnology Integration: Cytokine-Mediated Stem Cell Engineering and Polymer-Assisted "
            "Vascular Reconstruction with Gene Transfer-Enhanced Therapeutic Angiogenesis\n\n"
        )

        # User query
        prompt += (
            f"INPUT: {concatenated_abstracts}\n"
            f"KEYWORDS: {keywords}\n"
            "OUTPUT:"
        )

        return prompt
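
    # The model is instructed to reply as a single line of the form
    #   "<ABSTRACT_SUMMARY>|||<SHORT_SUMMARY>|||<TITLE>"
    # parse_response() below handles this contract plus the common deviations
    # observed in practice (numbered sections, labelled sections, or free text).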

    def call_deepseek_api(self, prompt: str, max_retries: int = 3) -> str:
        """
        Call DeepSeek API with enhanced retry logic and timeout handling
        """
        for attempt in range(max_retries):
            try:
                payload = {
                    "model": "deepseek-chat",  # DeepSeek-V3 instruct model
                    "messages": [
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "max_tokens": 800,
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "stream": False
                }

                # Enhanced timeout handling
                response = requests.post(
                    f"{self.base_url}/chat/completions",
                    headers=self.headers,
                    json=payload,
                    timeout=(10, 60)  # (connection timeout, read timeout)
                )

                if response.status_code == 200:
                    result = response.json()
                    return result['choices'][0]['message']['content'].strip()
                elif response.status_code == 429:  # Rate limit
                    wait_time = min(60, 2 ** attempt * 30)
                    print(f"Rate limit hit. Waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                elif response.status_code >= 500:  # Server errors
                    wait_time = min(30, 2 ** attempt * 5)
                    print(f"Server error {response.status_code}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"API Error {response.status_code}: {response.text}")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)
                        continue
                    else:
                        return ""

            except requests.exceptions.Timeout as e:
                print(f"Timeout error on attempt {attempt + 1}: {e}")
                if attempt < max_retries - 1:
                    wait_time = min(30, 2 ** attempt * 10)
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"Max retries exceeded due to timeout")
                    return ""
            except requests.exceptions.ConnectionError as e:
                print(f"Connection error on attempt {attempt + 1}: {e}")
                if attempt < max_retries - 1:
                    wait_time = min(30, 2 ** attempt * 10)
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"Max retries exceeded due to connection error")
                    return ""
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                else:
                    return ""

        return ""

    def parse_response(self, response: str) -> Tuple[str, str, str]:
        """
        Enhanced parsing for DeepSeek responses with integrated cleanup
        """
        if not response:
            return "Failed to generate", "Failed to generate", "Failed to generate"

        # Clean the response first
        response = response.strip()

        # Remove common DeepSeek conversational elements
        conversational_starters = [
            "Here are the structured outputs",
            "Here's the structured output",
            "Based on the provided keywords",
            "Let me know if you'd like",
            "Would you like me to",
            "I can help you",
            "Here's my analysis"
        ]

        for starter in conversational_starters:
            if response.startswith(starter):
                # Find the actual content after conversational part
                lines = response.split('\n')
                content_lines = []
                found_content = False
                for line in lines:
                    if any(marker in line.upper() for marker in ['ABSTRACT_SUMMARY:', 'ABSTRACT:', '1.', '**1.']):
                        found_content = True
                    if found_content:
                        content_lines.append(line)
                if content_lines:
                    response = '\n'.join(content_lines)
                break

        # Remove markdown formatting
        response = re.sub(r'\*\*(\d+\.)\*\*', r'\1', response)  # **1.** -> 1.
        response = re.sub(r'\*\*(.*?)\*\*', r'\1', response)    # **text** -> text
        response = re.sub(r'^\s*---\s*$', '', response, flags=re.MULTILINE)  # Remove --- lines

        abstract_summary = ""
        short_summary = ""
        title = ""

        try:
            # Method 1: Look for standard ||| separator
            if '|||' in response:
                parts = [part.strip() for part in response.split('|||')]
                if len(parts) >= 3:
                    abstract_summary = parts[0]
                    short_summary = parts[1]
                    title = parts[2]
                elif len(parts) == 2:
                    abstract_summary = parts[0]
                    title = parts[1]
                    # Generate short summary from abstract
                    sentences = re.split(r'[.!?]+', abstract_summary)
                    short_summary = '. '.join(sentences[:2]).strip() + '.'

            # Method 2: Look for numbered sections (DeepSeek's preferred format)
            elif "1. ABSTRACT_SUMMARY:" in response or "1.ABSTRACT_SUMMARY:" in response:
                # Extract by numbered sections
                abstract_match = re.search(r'1\.?\s*ABSTRACT_SUMMARY:\s*(.*?)(?=2\.|3\.|$)', response, re.DOTALL | re.IGNORECASE)
                short_match = re.search(r'2\.?\s*SHORT_SUMMARY:\s*(.*?)(?=3\.|$)', response, re.DOTALL | re.IGNORECASE)
                title_match = re.search(r'3\.?\s*TITLE:\s*(.*?)(?=\n\n|$)', response, re.DOTALL | re.IGNORECASE)

                if abstract_match:
                    abstract_summary = abstract_match.group(1).strip()
                if short_match:
                    short_summary = short_match.group(1).strip()
                if title_match:
                    title = title_match.group(1).strip()

            # Method 3: Look for any mention of the three components
            else:
                # Try to find ABSTRACT_SUMMARY, SHORT_SUMMARY, TITLE anywhere
                abstract_match = re.search(r'ABSTRACT[_\s]*SUMMARY:?\s*(.*?)(?=SHORT|TITLE|$)', response, re.DOTALL | re.IGNORECASE)
                short_match = re.search(r'SHORT[_\s]*SUMMARY:?\s*(.*?)(?=TITLE|$)', response, re.DOTALL | re.IGNORECASE)
                title_match = re.search(r'TITLE:?\s*(.*?)(?=\n|$)', response, re.DOTALL | re.IGNORECASE)

                if abstract_match:
                    abstract_summary = abstract_match.group(1).strip()
                if short_match:
                    short_summary = short_match.group(1).strip()
                if title_match:
                    title = title_match.group(1).strip()

        except Exception as e:
            print(f"Error in enhanced parsing: {e}")

        # Fallback: if still no content, try to extract from the full response
        if not abstract_summary and not short_summary and not title:
            # Split response into sentences and distribute intelligently
            sentences = re.split(r'[.!?]+', response)
            sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]

            if len(sentences) >= 6:
                abstract_summary = '. '.join(sentences[:4]) + '.'
                short_summary = '. '.join(sentences[4:6]) + '.'
                title = sentences[6] if len(sentences) > 6 else "Advanced Scientific Research Analysis"
            elif len(sentences) >= 3:
                abstract_summary = '. '.join(sentences[:2]) + '.'
                short_summary = sentences[2] + '.'
                title = sentences[-1] if len(sentences) > 3 else "Scientific Research Study"
            elif len(sentences) >= 1:
                abstract_summary = sentences[0]
                short_summary = sentences[0][:100] + "..." if len(sentences[0]) > 100 else sentences[0]
                title = "Scientific Analysis"
            else:
                abstract_summary = response[:200] + "..." if len(response) > 200 else response
                short_summary = response[:100] + "..." if len(response) > 100 else response
                title = "Research Summary"

        # Apply integrated cleanup to all components
        abstract_summary = self.clean_deepseek_output(abstract_summary)
        short_summary = self.clean_deepseek_output(short_summary)
        title = self.clean_deepseek_output(title)

        # Ensure reasonable lengths after cleanup
        if len(abstract_summary.split()) > 150:
            abstract_summary = ' '.join(abstract_summary.split()[:150]) + "..."

        if len(short_summary.split()) > 75:
            short_summary = ' '.join(short_summary.split()[:75]) + "..."

        if len(title.split()) > 25:
            title = ' '.join(title.split()[:25]) + "..."

        # Final validation - ensure we have actual content
        if not abstract_summary or abstract_summary in ["", "Content not extracted", "Content not properly extracted"]:
            abstract_summary = "Content generation failed"
        if not short_summary or short_summary in ["", "Content not extracted", "Content not properly extracted"]:
            short_summary = "Content generation failed"
        if not title or title in ["", "Content not extracted", "Content not properly extracted"]:
            title = "Content generation failed"

        return abstract_summary, short_summary, title
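
    # Illustrative example of the primary '|||' parsing path (a sketch, assuming a
    # well-formed reply):
    #   parse_response("Long summary here.|||Short summary.|||A Detailed Title")
    #   -> ("Long summary here.", "Short summary.", "A Detailed Title")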

    def load_checkpoint(self, checkpoint_file: str) -> Tuple[List[Dict], set]:
        """
        Load existing checkpoint data and return processed data + processed indices
        """
        if os.path.exists(checkpoint_file):
            try:
                df = pd.read_csv(checkpoint_file, sep='\t')
                processed_data = df.to_dict('records')
                processed_indices = set(df['OriginalIndex'].astype(str))
                print(f"✓ Loaded checkpoint with {len(processed_data)} processed entries")
                return processed_data, processed_indices
            except Exception as e:
                print(f"Error loading checkpoint: {e}")
                return [], set()
        return [], set()

    def save_checkpoint(self, output_data: List[Dict], checkpoint_file: str):
        """
        Save current progress to checkpoint file
        """
        try:
            df = pd.DataFrame(output_data)
            df.to_csv(checkpoint_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)
            print(f"💾 Checkpoint saved: {len(output_data)} entries")
        except Exception as e:
            print(f"Error saving checkpoint: {e}")

    def estimate_time_remaining(self, current_progress: int, total_rows: int) -> str:
        """
        Estimate time remaining based on current progress
        """
        if self.start_time is None or current_progress == 0:
            return "Calculating..."

        elapsed = datetime.now() - self.start_time
        elapsed_seconds = elapsed.total_seconds()

        if current_progress > 0:
            avg_time_per_row = elapsed_seconds / current_progress
            remaining_rows = total_rows - current_progress
            remaining_seconds = remaining_rows * avg_time_per_row
            remaining_time = timedelta(seconds=int(remaining_seconds))
            return str(remaining_time)

        return "Calculating..."

    def process_data_file(self, input_file: str, output_file: str, delay: float = 1.0,
                         save_every: int = 50, debug_first_n: int = 3,
                         start_row: int = 0, end_row: Optional[int] = None):
        """
        Process the input TSV file and generate training data with checkpointing and row slicing

        Args:
            input_file: Path to input TSV file
            output_file: Path to output TSV file
            delay: Delay between API calls to respect rate limits
            save_every: Save checkpoint every N processed rows
            debug_first_n: Print full input/output for first N generations for QC
            start_row: Starting row index (0-based)
            end_row: Ending row index (0-based, None for all remaining rows)
        """
        self.start_time = datetime.now()

        # Setup checkpoint file
        checkpoint_file = output_file.replace('.tsv', '_checkpoint.tsv')

        # Load existing checkpoint
        output_data, processed_indices = self.load_checkpoint(checkpoint_file)

        # Read input data
        try:
            df = pd.read_csv(input_file, sep='\t')
        except Exception as e:
            print(f"Error reading input file: {e}")
            return

        # Apply row slicing
        original_length = len(df)
        if end_row is None:
            end_row = original_length
        else:
            end_row = min(end_row, original_length)

        if start_row >= original_length:
            print(f"❌ Error: start_row {start_row} is >= total rows {original_length}")
            return

        df_slice = df.iloc[start_row:end_row].copy()
        total_rows = len(df_slice)

        initial_processed = len(output_data)

        print(f"📊 Processing Overview:")
        print(f"   Input file total rows: {original_length}")
        print(f"   Processing slice: rows {start_row} to {end_row-1}")
        print(f"   Rows in slice: {total_rows}")
        print(f"   Already processed: {initial_processed}")
        print(f"   Remaining: {total_rows - initial_processed}")
        print(f"   Checkpoint saves every {save_every} rows")
        print(f"   Estimated cost: ~${total_rows * 0.0014:.2f}")
        print(f"   Estimated time: ~{total_rows * 1.5 / 3600:.1f} hours")
        print(f"   Debug mode: First {debug_first_n} generations will show detailed output")
        print("-" * 80)

        successful_processed = 0
        failed_processed = 0
        generations_count = 0
        processed_this_run = 0

        for index, row in df_slice.iterrows():
            original_index = str(row.get('Index', index))

            # Skip if already processed
            if original_index in processed_indices:
                continue

            concatenated_abstracts = str(row.get('ConcatenatedAbstracts', ''))
            keywords = str(row.get('TopKeywords', ''))

            # Skip if no content
            if not concatenated_abstracts or concatenated_abstracts == 'nan':
                print(f"[{processed_this_run + 1}/{total_rows}] Skipping empty cluster {original_index}")
                continue

            # Use the DataFrame index directly: iloc slicing preserves the original
            # RangeIndex, so this is the absolute row number even when rows are skipped.
            actual_row_num = index
            print(f"[{processed_this_run + 1}/{total_rows}] Processing row {actual_row_num} (cluster {original_index})...")

            # Create prompt
            prompt = self.create_few_shot_prompt(concatenated_abstracts, keywords)

            # DEBUG: Print detailed input/output for first few generations
            if generations_count < debug_first_n:
                print("\n" + "="*80)
                print(f"🔍 DEBUG OUTPUT FOR GENERATION #{generations_count + 1}")
                print("="*80)
                print(f"📋 CLUSTER INDEX: {original_index} (Row {actual_row_num})")
                print(f"🔑 KEYWORDS: {keywords}")
                print(f"📄 ABSTRACTS (first 500 chars): {concatenated_abstracts[:500]}...")
                print("\n" + "-"*60)
                print("📤 FULL PROMPT BEING SENT TO API:")
                print("-"*60)
                print(prompt)
                print("-"*60)

            # Call API
            response = self.call_deepseek_api(prompt)

            # Continue debug printing
            if generations_count < debug_first_n:
                print("📥 RAW API RESPONSE:")
                print("-"*60)
                print(response if response else "❌ NO RESPONSE / ERROR")
                print("-"*60)

            if response:
                # Parse response (now includes integrated cleanup)
                abstract_summary, short_summary, title = self.parse_response(response)

                # Continue debug printing
                if generations_count < debug_first_n:
                    print("🔧 PARSED & CLEANED COMPONENTS:")
                    print("-"*60)
                    print(f"📝 ABSTRACT SUMMARY:\n{abstract_summary}\n")
                    print(f"⚡ SHORT SUMMARY:\n{short_summary}\n")
                    print(f"🏷️  TITLE:\n{title}\n")
                    print("="*80 + "\n")

                # Add to output data
                output_data.append({
                    'OriginalIndex': original_index,
                    'SourceRow': actual_row_num,  # Track original row number
                    'AbstractSummary': abstract_summary,
                    'ShortSummary': short_summary,
                    'Title': title,
                    'OriginalKeywords': keywords,
                    'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
                })

                successful_processed += 1
                print(f"✓ Success! ({successful_processed} total successes)")
            else:
                if generations_count < debug_first_n:
                    print("❌ FAILED TO PARSE OR GET RESPONSE")
                    print("="*80 + "\n")

                print(f"✗ Failed to process cluster {original_index}")
                # Add empty entry to maintain tracking
                output_data.append({
                    'OriginalIndex': original_index,
                    'SourceRow': actual_row_num,
                    'AbstractSummary': 'Failed to generate',
                    'ShortSummary': 'Failed to generate',
                    'Title': 'Failed to generate',
                    'OriginalKeywords': keywords,
                    'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
                })
                failed_processed += 1

            generations_count += 1
            processed_this_run += 1

            # Update processed set
            processed_indices.add(original_index)

            # Save checkpoint periodically
            if len(output_data) % save_every == 0:
                self.save_checkpoint(output_data, checkpoint_file)
                time_remaining = self.estimate_time_remaining(processed_this_run, total_rows)
                print(f"📁 Checkpoint saved! Progress: {processed_this_run}/{total_rows} | ETA: {time_remaining}")

            # Rate limiting
            time.sleep(delay)

        # Final save
        try:
            output_df = pd.DataFrame(output_data)
            output_df.to_csv(output_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)

            print(f"\n🎉 GENERATION COMPLETED!")
            print(f"✓ Successfully processed: {successful_processed}")
            print(f"✗ Failed: {failed_processed}")
            print(f"📄 Total entries saved: {len(output_data)}")
            print(f"💾 Final output saved to: {output_file}")
            print(f"💰 Estimated cost: ~${successful_processed * 0.0014:.2f}")
            print(f"📊 Processed rows {start_row} to {end_row-1} from source file")

            # Clean up checkpoint file
            if os.path.exists(checkpoint_file):
                os.remove(checkpoint_file)
                print(f"🗑️  Checkpoint file cleaned up")

        except Exception as e:
            print(f"Error saving final output file: {e}")
            print(f"Your data is still safe in checkpoint: {checkpoint_file}")

def main():
    """
    Main function to run the training data generation with row slicing
    """
    # Configuration for processing all 30,000 examples
    API_KEY = "sk-6185ef64c68d473d984963356ab0378e"  # Replace with your actual API key
    INPUT_FILE = "/home/joneill/pubmed_clustered_data_sciner.tsv"  # Your input TSV file

    # Row slicing configuration - MODIFY THESE FOR YOUR BATCHES
    START_ROW = 0          # Starting row (0-based)
    END_ROW = 30000        # Ending row (None for all rows, or specify number)
    BATCH_NAME = "full"    # Used in output filename

    # You can also run in batches, e.g.:
    # Batch 1: START_ROW = 0, END_ROW = 5000, BATCH_NAME = "batch1"
    # Batch 2: START_ROW = 5000, END_ROW = 10000, BATCH_NAME = "batch2"
    # Batch 3: START_ROW = 10000, END_ROW = 15000, BATCH_NAME = "batch3"
    # etc.

    OUTPUT_FILE = f"bsg_training_data_{BATCH_NAME}.tsv"  # Output file for training data
    DELAY_BETWEEN_CALLS = 1.0  # Seconds between API calls
    SAVE_EVERY = 50  # Save checkpoint every N rows
    DEBUG_FIRST_N = 3  # Print full input/output for first N generations for QC

    # Initialize generator
    generator = EnhancedDeepSeekTrainingDataGenerator(API_KEY)

    # Calculate batch info
    total_rows_to_process = END_ROW - START_ROW if END_ROW else "all remaining"

    # Process data
    print("🚀 Starting Enhanced DeepSeek Training Data Generation")
    print("="*80)
    print(f"🎯 Processing: {total_rows_to_process} rows (from row {START_ROW} to {END_ROW-1 if END_ROW else 'end'})")
    print(f"💰 Estimated cost: ~${(END_ROW - START_ROW if END_ROW else 30000) * 0.0014:.2f}")
    print(f"⏱️  Estimated time: ~{(END_ROW - START_ROW if END_ROW else 30000) * 1.5 / 3600:.1f} hours")
    print(f"🔍 Debug mode: Will show detailed input/output for first {DEBUG_FIRST_N} generations")
    print(f"💾 Automatic checkpointing every {SAVE_EVERY} rows")
    print(f"🔄 Auto-resume: Restart script to continue from checkpoint")
    print(f"🧹 Integrated cleanup: All outputs automatically cleaned of formatting artifacts")
    print("="*80)

    generator.process_data_file(
        INPUT_FILE, OUTPUT_FILE, DELAY_BETWEEN_CALLS, SAVE_EVERY, DEBUG_FIRST_N,
        START_ROW, END_ROW
    )

    print("\n🎉 Training data generation completed!")
    print(f"📁 Output file: {OUTPUT_FILE}")
    print("✨ Data is automatically cleaned and ready for training! 🧪")

if __name__ == "__main__":
    main()
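
# Example invocation (a sketch; assumes this file is saved as
# deepseek_training_data_generator.py and that DEEPSEEK_API_KEY is exported):
#   export DEEPSEEK_API_KEY=sk-...
#   python3 deepseek_training_data_generator.py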