#!/usr/bin/env python3
"""
Enhanced DeepSeek Training Data Generator for Scientific Summarization
Generates high-quality training data with integrated cleanup and row slicing
"""

import requests
import json
import pandas as pd
import time
import csv
import os
import re
from typing import List, Tuple, Dict, Optional
from datetime import datetime, timedelta

class EnhancedDeepSeekTrainingDataGenerator:
    """Generate training data using DeepSeek API with integrated cleanup and row slicing"""

    def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com/v1"):
        """
        Initialize DeepSeek API client

        Args:
            api_key: Your DeepSeek API key
            base_url: DeepSeek API base URL
        """
        self.api_key = api_key
        self.base_url = base_url
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }
        self.start_time = None
        self.processed_count = 0

    def clean_deepseek_output(self, text: str) -> str:
        """
        Clean up DeepSeek output to remove formatting artifacts

        Args:
            text: Raw text from DeepSeek API

        Returns:
            Cleaned text without formatting artifacts
        """
        if not text or pd.isna(text):
            return text

        text = str(text).strip()

        # Remove numbered prefixes (1., 2., 3.)
        text = re.sub(r'^\d+\.\s*', '', text)

        # Remove component labels
        text = re.sub(r'^(ABSTRACT[_\s]*SUMMARY:?|SHORT[_\s]*SUMMARY:?|TITLE:?)', '', text, flags=re.IGNORECASE)

        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove trailing colons or dashes
        text = re.sub(r'[:\-]+$', '', text)

        # Remove markdown formatting
        text = re.sub(r'\*+', '', text)

        # Remove quotes that sometimes wrap the entire response
        text = re.sub(r'^["\']+|["\']+$', '', text)

        return text.strip()
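
    # Illustrative example of the cleanup above (a sketch; not executed at runtime):
    #   clean_deepseek_output("1. ABSTRACT_SUMMARY: **Result**")  ->  "Result"
    #   (numbered prefix, component label, and markdown asterisks are all stripped)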

    def create_few_shot_prompt(self, concatenated_abstracts: str, keywords: str) -> str:
        """
        Create optimized few-shot prompt for DeepSeek with clean output formatting
        """
        prompt = (
            "You are an expert scientific summarization assistant. Generate exactly three components separated by '|||':\n"
            "1. ABSTRACT_SUMMARY: A detailed 4-6 sentence summary highlighting key findings, methods, and implications\n"
            "2. SHORT_SUMMARY: A concise 2-3 sentence summary capturing the core essence\n"
            "3. TITLE: A sophisticated, detailed title reflecting the research scope and methods\n\n"
            "CRITICAL: Respond ONLY with the three components separated by '|||'. Do not include conversational text, explanations, or markdown formatting.\n\n"
            "Format: ABSTRACT_SUMMARY|||SHORT_SUMMARY|||TITLE\n\n"
            "Focus on:\n"
            "- Specific computational methods, techniques, and approaches\n"
            "- Key biological processes and mechanisms\n"
            "- Research methodologies and experimental designs\n"
            "- Clinical or therapeutic implications\n"
            "- Be specific and detailed; avoid generic terms\n\n"
        )

        # Few-shot Example 1 - Immunology/Antimicrobial Research
        example1_text = (
            "Studies investigated mammary gland candidiasis models using immunocompetent and immunodeficient mice "
            "treated with amphotericin B. Complement activation analysis revealed tissue inflammation patterns. "
            "Research on antigen processing examined proteasome mutants lacking specific protease activities for "
            "peptide generation. Novel ankyrin-repeat family member MAIL was identified with nuclear localization "
            "potentiating IL-6 expression. Antimicrobial peptides pseudins 1-4 were isolated from frog skin showing "
            "activity against various pathogens."
        )
        example1_keywords = "MAIL; proteasome; antimicrobial peptides; complement activation; mammary glands"

        prompt += (
            f"INPUT: {example1_text}\n"
            f"KEYWORDS: {example1_keywords}\n"
            "OUTPUT: "
            "Comprehensive investigation of innate immune responses utilizing murine mammary gland candidiasis models "
            "with complement activation analysis and proteasome-mediated antigen processing pathways, complemented by "
            "characterization of novel antimicrobial peptides and nuclear transcription modulators. Research demonstrates "
            "the critical role of specific protease activities in MHC class I-restricted peptide generation while identifying "
            "MAIL as a nuclear factor potentiating cytokine expression and pseudins as promising therapeutic antimicrobials. "
            "These findings advance understanding of immunopathological mechanisms and provide validated experimental models "
            "for antifungal compound evaluation.|||"
            "Studies utilized murine models to investigate immune responses in candidiasis while characterizing novel "
            "antimicrobial compounds and antigen processing mechanisms. Research identified critical protease activities "
            "and nuclear factors regulating immune responses.|||"
            "Integrated Immunological Modeling and Antimicrobial Peptide Discovery: Proteasome-Mediated Antigen Processing "
            "and Complement-Dependent Host Defense Mechanisms\n\n"
        )

        # Few-shot Example 2 - Biotechnology/Tissue Engineering
        example2_text = (
            "Biotechnology development focused on hematopoietic stem cell expansion using cytokine combinations. "
            "Temperature-responsive polymers enabled designed cell sheet engineering for tissue applications. "
            "Vascular anastomosis techniques using titanium clips reduced neointimal hyperplasia. Endothelial cell "
            "seeding protocols for vascular grafts were optimized. Gene transfer therapies for therapeutic angiogenesis "
            "showed clinical promise in cardiovascular applications."
        )
        example2_keywords = "biotechnology; tissue engineering; vascular grafts; stem cells; angiogenesis"

        prompt += (
            f"INPUT: {example2_text}\n"
            f"KEYWORDS: {example2_keywords}\n"
            "OUTPUT: "
            "Advanced biotechnology approaches combining cytokine-mediated hematopoietic stem cell expansion protocols "
            "with temperature-responsive polymer systems for precision cell sheet engineering and vascular reconstruction. "
            "Integration of titanium clip anastomosis techniques and optimized endothelial cell seeding methodologies "
            "demonstrates significant reduction in neointimal hyperplasia while enhancing graft patency. Gene transfer "
            "strategies for therapeutic angiogenesis represent promising clinical interventions for cardiovascular disease "
            "treatment, establishing proof-of-concept for growth factor-mediated collateral vessel development.|||"
            "Research combines stem cell expansion technologies with polymer-based cell engineering and vascular "
            "reconstruction techniques. Gene therapy approaches show clinical promise for treating cardiovascular disease "
            "through enhanced angiogenesis.|||"
            "Multiscale Biotechnology Integration: Cytokine-Mediated Stem Cell Engineering and Polymer-Assisted "
            "Vascular Reconstruction with Gene Transfer-Enhanced Therapeutic Angiogenesis\n\n"
        )

        # User query
        prompt += (
            f"INPUT: {concatenated_abstracts}\n"
            f"KEYWORDS: {keywords}\n"
            "OUTPUT:"
        )

        return prompt
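
    # The model is instructed to reply as a single line of the form
    #   "<ABSTRACT_SUMMARY>|||<SHORT_SUMMARY>|||<TITLE>"
    # parse_response() below handles this contract plus the common deviations
    # observed in practice (numbered sections, labelled sections, or free text).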

    def call_deepseek_api(self, prompt: str, max_retries: int = 3) -> str:
        """
        Call DeepSeek API with enhanced retry logic and timeout handling
        """
        for attempt in range(max_retries):
            try:
                payload = {
                    "model": "deepseek-chat",  # DeepSeek-V3 instruct model
                    "messages": [
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    "max_tokens": 800,
                    "temperature": 0.7,
                    "top_p": 0.9,
                    "stream": False
                }

                # Enhanced timeout handling
                response = requests.post(
                    f"{self.base_url}/chat/completions",
                    headers=self.headers,
                    json=payload,
                    timeout=(10, 60)  # (connection timeout, read timeout)
                )

                if response.status_code == 200:
                    result = response.json()
                    return result['choices'][0]['message']['content'].strip()
                elif response.status_code == 429:  # Rate limit
                    wait_time = min(60, 2 ** attempt * 30)
                    print(f"Rate limit hit. Waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                elif response.status_code >= 500:  # Server errors
                    wait_time = min(30, 2 ** attempt * 5)
                    print(f"Server error {response.status_code}. Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"API Error {response.status_code}: {response.text}")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)
                        continue
                    else:
                        return ""

            except requests.exceptions.Timeout as e:
                print(f"Timeout error on attempt {attempt + 1}: {e}")
                if attempt < max_retries - 1:
                    wait_time = min(30, 2 ** attempt * 10)
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"Max retries exceeded due to timeout")
                    return ""
            except requests.exceptions.ConnectionError as e:
                print(f"Connection error on attempt {attempt + 1}: {e}")
                if attempt < max_retries - 1:
                    wait_time = min(30, 2 ** attempt * 10)
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                else:
                    print(f"Max retries exceeded due to connection error")
                    return ""
            except Exception as e:
                print(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
                    continue
                else:
                    return ""

        return ""

    def parse_response(self, response: str) -> Tuple[str, str, str]:
        """
        Enhanced parsing for DeepSeek responses with integrated cleanup
        """
        if not response:
            return "Failed to generate", "Failed to generate", "Failed to generate"

        # Clean the response first
        response = response.strip()

        # Remove common DeepSeek conversational elements
        conversational_starters = [
            "Here are the structured outputs",
            "Here's the structured output",
            "Based on the provided keywords",
            "Let me know if you'd like",
            "Would you like me to",
            "I can help you",
            "Here's my analysis"
        ]

        for starter in conversational_starters:
            if response.startswith(starter):
                # Find the actual content after conversational part
                lines = response.split('\n')
                content_lines = []
                found_content = False
                for line in lines:
                    if any(marker in line.upper() for marker in ['ABSTRACT_SUMMARY:', 'ABSTRACT:', '1.', '**1.']):
                        found_content = True
                    if found_content:
                        content_lines.append(line)
                if content_lines:
                    response = '\n'.join(content_lines)
                break

        # Remove markdown formatting
        response = re.sub(r'\*\*(\d+\.)\*\*', r'\1', response)  # **1.** -> 1.
        response = re.sub(r'\*\*(.*?)\*\*', r'\1', response)    # **text** -> text
        response = re.sub(r'^\s*---\s*$', '', response, flags=re.MULTILINE)  # Remove --- lines

        abstract_summary = ""
        short_summary = ""
        title = ""

        try:
            # Method 1: Look for standard ||| separator
            if '|||' in response:
                parts = [part.strip() for part in response.split('|||')]
                if len(parts) >= 3:
                    abstract_summary = parts[0]
                    short_summary = parts[1]
                    title = parts[2]
                elif len(parts) == 2:
                    abstract_summary = parts[0]
                    title = parts[1]
                    # Generate short summary from abstract
                    sentences = re.split(r'[.!?]+', abstract_summary)
                    short_summary = '. '.join(sentences[:2]).strip() + '.'

            # Method 2: Look for numbered sections (DeepSeek's preferred format)
            elif "1. ABSTRACT_SUMMARY:" in response or "1.ABSTRACT_SUMMARY:" in response:
                # Extract by numbered sections
                abstract_match = re.search(r'1\.?\s*ABSTRACT_SUMMARY:\s*(.*?)(?=2\.|3\.|$)', response, re.DOTALL | re.IGNORECASE)
                short_match = re.search(r'2\.?\s*SHORT_SUMMARY:\s*(.*?)(?=3\.|$)', response, re.DOTALL | re.IGNORECASE)
                title_match = re.search(r'3\.?\s*TITLE:\s*(.*?)(?=\n\n|$)', response, re.DOTALL | re.IGNORECASE)

                if abstract_match:
                    abstract_summary = abstract_match.group(1).strip()
                if short_match:
                    short_summary = short_match.group(1).strip()
                if title_match:
                    title = title_match.group(1).strip()

            # Method 3: Look for any mention of the three components
            else:
                # Try to find ABSTRACT_SUMMARY, SHORT_SUMMARY, TITLE anywhere
                abstract_match = re.search(r'ABSTRACT[_\s]*SUMMARY:?\s*(.*?)(?=SHORT|TITLE|$)', response, re.DOTALL | re.IGNORECASE)
                short_match = re.search(r'SHORT[_\s]*SUMMARY:?\s*(.*?)(?=TITLE|$)', response, re.DOTALL | re.IGNORECASE)
                title_match = re.search(r'TITLE:?\s*(.*?)(?=\n|$)', response, re.DOTALL | re.IGNORECASE)

                if abstract_match:
                    abstract_summary = abstract_match.group(1).strip()
                if short_match:
                    short_summary = short_match.group(1).strip()
                if title_match:
                    title = title_match.group(1).strip()

        except Exception as e:
            print(f"Error in enhanced parsing: {e}")

        # Fallback: if still no content, try to extract from the full response
        if not abstract_summary and not short_summary and not title:
            # Split response into sentences and distribute intelligently
            sentences = re.split(r'[.!?]+', response)
            sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]

            if len(sentences) >= 6:
                abstract_summary = '. '.join(sentences[:4]) + '.'
                short_summary = '. '.join(sentences[4:6]) + '.'
                title = sentences[6] if len(sentences) > 6 else "Advanced Scientific Research Analysis"
            elif len(sentences) >= 3:
                abstract_summary = '. '.join(sentences[:2]) + '.'
                short_summary = sentences[2] + '.'
                title = sentences[-1] if len(sentences) > 3 else "Scientific Research Study"
            elif len(sentences) >= 1:
                abstract_summary = sentences[0]
                short_summary = sentences[0][:100] + "..." if len(sentences[0]) > 100 else sentences[0]
                title = "Scientific Analysis"
            else:
                abstract_summary = response[:200] + "..." if len(response) > 200 else response
                short_summary = response[:100] + "..." if len(response) > 100 else response
                title = "Research Summary"

        # Apply integrated cleanup to all components
        abstract_summary = self.clean_deepseek_output(abstract_summary)
        short_summary = self.clean_deepseek_output(short_summary)
        title = self.clean_deepseek_output(title)

        # Ensure reasonable lengths after cleanup
        if len(abstract_summary.split()) > 150:
            abstract_summary = ' '.join(abstract_summary.split()[:150]) + "..."

        if len(short_summary.split()) > 75:
            short_summary = ' '.join(short_summary.split()[:75]) + "..."

        if len(title.split()) > 25:
            title = ' '.join(title.split()[:25]) + "..."

        # Final validation - ensure we have actual content
        if not abstract_summary or abstract_summary in ["", "Content not extracted", "Content not properly extracted"]:
            abstract_summary = "Content generation failed"
        if not short_summary or short_summary in ["", "Content not extracted", "Content not properly extracted"]:
            short_summary = "Content generation failed"
        if not title or title in ["", "Content not extracted", "Content not properly extracted"]:
            title = "Content generation failed"

        return abstract_summary, short_summary, title
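
    # Illustrative example of the primary '|||' parsing path (a sketch, assuming a
    # well-formed reply):
    #   parse_response("Long summary here.|||Short summary.|||A Detailed Title")
    #   -> ("Long summary here.", "Short summary.", "A Detailed Title")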

    def load_checkpoint(self, checkpoint_file: str) -> Tuple[List[Dict], set]:
        """
        Load existing checkpoint data and return processed data + processed indices
        """
        if os.path.exists(checkpoint_file):
            try:
                df = pd.read_csv(checkpoint_file, sep='\t')
                processed_data = df.to_dict('records')
                processed_indices = set(df['OriginalIndex'].astype(str))
                print(f"✓ Loaded checkpoint with {len(processed_data)} processed entries")
                return processed_data, processed_indices
            except Exception as e:
                print(f"Error loading checkpoint: {e}")
                return [], set()
        return [], set()

    def save_checkpoint(self, output_data: List[Dict], checkpoint_file: str):
        """
        Save current progress to checkpoint file
        """
        try:
            df = pd.DataFrame(output_data)
            df.to_csv(checkpoint_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)
            print(f"💾 Checkpoint saved: {len(output_data)} entries")
        except Exception as e:
            print(f"Error saving checkpoint: {e}")

    def estimate_time_remaining(self, current_progress: int, total_rows: int) -> str:
        """
        Estimate time remaining based on current progress
        """
        if self.start_time is None or current_progress == 0:
            return "Calculating..."

        elapsed = datetime.now() - self.start_time
        elapsed_seconds = elapsed.total_seconds()

        if current_progress > 0:
            avg_time_per_row = elapsed_seconds / current_progress
            remaining_rows = total_rows - current_progress
            remaining_seconds = remaining_rows * avg_time_per_row
            remaining_time = timedelta(seconds=int(remaining_seconds))
            return str(remaining_time)

        return "Calculating..."

    def process_data_file(self, input_file: str, output_file: str, delay: float = 1.0,
                         save_every: int = 50, debug_first_n: int = 3,
                         start_row: int = 0, end_row: Optional[int] = None):
        """
        Process the input TSV file and generate training data with checkpointing and row slicing

        Args:
            input_file: Path to input TSV file
            output_file: Path to output TSV file
            delay: Delay between API calls to respect rate limits
            save_every: Save checkpoint every N processed rows
            debug_first_n: Print full input/output for first N generations for QC
            start_row: Starting row index (0-based)
            end_row: Ending row index (0-based, None for all remaining rows)
        """
        self.start_time = datetime.now()

        # Setup checkpoint file
        checkpoint_file = output_file.replace('.tsv', '_checkpoint.tsv')

        # Load existing checkpoint
        output_data, processed_indices = self.load_checkpoint(checkpoint_file)

        # Read input data
        try:
            df = pd.read_csv(input_file, sep='\t')
        except Exception as e:
            print(f"Error reading input file: {e}")
            return

        # Apply row slicing
        original_length = len(df)
        if end_row is None:
            end_row = original_length
        else:
            end_row = min(end_row, original_length)

        if start_row >= original_length:
            print(f"❌ Error: start_row {start_row} is >= total rows {original_length}")
            return

        df_slice = df.iloc[start_row:end_row].copy()
        total_rows = len(df_slice)

        initial_processed = len(output_data)

        print(f"📊 Processing Overview:")
        print(f"   Input file total rows: {original_length}")
        print(f"   Processing slice: rows {start_row} to {end_row-1}")
        print(f"   Rows in slice: {total_rows}")
        print(f"   Already processed: {initial_processed}")
        print(f"   Remaining: {total_rows - initial_processed}")
        print(f"   Checkpoint saves every {save_every} rows")
        print(f"   Estimated cost: ~${total_rows * 0.0014:.2f}")
        print(f"   Estimated time: ~{total_rows * 1.5 / 3600:.1f} hours")
        print(f"   Debug mode: First {debug_first_n} generations will show detailed output")
        print("-" * 80)

        successful_processed = 0
        failed_processed = 0
        generations_count = 0
        processed_this_run = 0

        for index, row in df_slice.iterrows():
            original_index = str(row.get('Index', index))

            # Skip if already processed
            if original_index in processed_indices:
                continue

            concatenated_abstracts = str(row.get('ConcatenatedAbstracts', ''))
            keywords = str(row.get('TopKeywords', ''))

            # Skip if no content
            if not concatenated_abstracts or concatenated_abstracts == 'nan':
                print(f"[{processed_this_run + 1}/{total_rows}] Skipping empty cluster {original_index}")
                continue

            # Use the DataFrame index directly: iloc slicing preserves the original
            # RangeIndex, so this is the absolute row number even when rows are skipped.
            actual_row_num = index
            print(f"[{processed_this_run + 1}/{total_rows}] Processing row {actual_row_num} (cluster {original_index})...")

            # Create prompt
            prompt = self.create_few_shot_prompt(concatenated_abstracts, keywords)

            # DEBUG: Print detailed input/output for first few generations
            if generations_count < debug_first_n:
                print("\n" + "="*80)
                print(f"🔍 DEBUG OUTPUT FOR GENERATION #{generations_count + 1}")
                print("="*80)
                print(f"📋 CLUSTER INDEX: {original_index} (Row {actual_row_num})")
                print(f"🔑 KEYWORDS: {keywords}")
                print(f"📄 ABSTRACTS (first 500 chars): {concatenated_abstracts[:500]}...")
                print("\n" + "-"*60)
                print("📤 FULL PROMPT BEING SENT TO API:")
                print("-"*60)
                print(prompt)
                print("-"*60)

            # Call API
            response = self.call_deepseek_api(prompt)

            # Continue debug printing
            if generations_count < debug_first_n:
                print("📥 RAW API RESPONSE:")
                print("-"*60)
                print(response if response else "❌ NO RESPONSE / ERROR")
                print("-"*60)

            if response:
                # Parse response (now includes integrated cleanup)
                abstract_summary, short_summary, title = self.parse_response(response)

                # Continue debug printing
                if generations_count < debug_first_n:
                    print("🔧 PARSED & CLEANED COMPONENTS:")
                    print("-"*60)
                    print(f"📝 ABSTRACT SUMMARY:\n{abstract_summary}\n")
                    print(f"⚡ SHORT SUMMARY:\n{short_summary}\n")
                    print(f"🏷️  TITLE:\n{title}\n")
                    print("="*80 + "\n")

                # Add to output data
                output_data.append({
                    'OriginalIndex': original_index,
                    'SourceRow': actual_row_num,  # Track original row number
                    'AbstractSummary': abstract_summary,
                    'ShortSummary': short_summary,
                    'Title': title,
                    'OriginalKeywords': keywords,
                    'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
                })

                successful_processed += 1
                print(f"✓ Success! ({successful_processed} total successes)")
            else:
                if generations_count < debug_first_n:
                    print("❌ FAILED TO PARSE OR GET RESPONSE")
                    print("="*80 + "\n")

                print(f"✗ Failed to process cluster {original_index}")
                # Add empty entry to maintain tracking
                output_data.append({
                    'OriginalIndex': original_index,
                    'SourceRow': actual_row_num,
                    'AbstractSummary': 'Failed to generate',
                    'ShortSummary': 'Failed to generate',
                    'Title': 'Failed to generate',
                    'OriginalKeywords': keywords,
                    'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
                })
                failed_processed += 1

            generations_count += 1
            processed_this_run += 1

            # Update processed set
            processed_indices.add(original_index)

            # Save checkpoint periodically
            if len(output_data) % save_every == 0:
                self.save_checkpoint(output_data, checkpoint_file)
                time_remaining = self.estimate_time_remaining(processed_this_run, total_rows)
                print(f"📁 Checkpoint saved! Progress: {processed_this_run}/{total_rows} | ETA: {time_remaining}")

            # Rate limiting
            time.sleep(delay)

        # Final save
        try:
            output_df = pd.DataFrame(output_data)
            output_df.to_csv(output_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)

            print(f"\n🎉 GENERATION COMPLETED!")
            print(f"✓ Successfully processed: {successful_processed}")
            print(f"✗ Failed: {failed_processed}")
            print(f"📄 Total entries saved: {len(output_data)}")
            print(f"💾 Final output saved to: {output_file}")
            print(f"💰 Estimated cost: ~${successful_processed * 0.0014:.2f}")
            print(f"📊 Processed rows {start_row} to {end_row-1} from source file")

            # Clean up checkpoint file
            if os.path.exists(checkpoint_file):
                os.remove(checkpoint_file)
                print(f"🗑️  Checkpoint file cleaned up")

        except Exception as e:
            print(f"Error saving final output file: {e}")
            print(f"Your data is still safe in checkpoint: {checkpoint_file}")

def main():
    """
    Main function to run the training data generation with row slicing
    """
    # Configuration for processing all 30,000 examples
    API_KEY = "sk-6185ef64c68d473d984963356ab0378e"  # Replace with your actual API key
    INPUT_FILE = "/home/joneill/pubmed_clustered_data_sciner.tsv"  # Your input TSV file

    # Row slicing configuration - MODIFY THESE FOR YOUR BATCHES
    START_ROW = 0          # Starting row (0-based)
    END_ROW = 30000        # Ending row (None for all rows, or specify number)
    BATCH_NAME = "full"    # Used in output filename

    # You can also run in batches, e.g.:
    # Batch 1: START_ROW = 0, END_ROW = 5000, BATCH_NAME = "batch1"
    # Batch 2: START_ROW = 5000, END_ROW = 10000, BATCH_NAME = "batch2"
    # Batch 3: START_ROW = 10000, END_ROW = 15000, BATCH_NAME = "batch3"
    # etc.

    OUTPUT_FILE = f"bsg_training_data_{BATCH_NAME}.tsv"  # Output file for training data
    DELAY_BETWEEN_CALLS = 1.0  # Seconds between API calls
    SAVE_EVERY = 50  # Save checkpoint every N rows
    DEBUG_FIRST_N = 3  # Print full input/output for first N generations for QC

    # Initialize generator
    generator = EnhancedDeepSeekTrainingDataGenerator(API_KEY)

    # Calculate batch info
    total_rows_to_process = END_ROW - START_ROW if END_ROW else "all remaining"

    # Process data
    print("🚀 Starting Enhanced DeepSeek Training Data Generation")
    print("="*80)
    print(f"🎯 Processing: {total_rows_to_process} rows (from row {START_ROW} to {END_ROW-1 if END_ROW else 'end'})")
    print(f"💰 Estimated cost: ~${(END_ROW - START_ROW if END_ROW else 30000) * 0.0014:.2f}")
    print(f"⏱️  Estimated time: ~{(END_ROW - START_ROW if END_ROW else 30000) * 1.5 / 3600:.1f} hours")
    print(f"🔍 Debug mode: Will show detailed input/output for first {DEBUG_FIRST_N} generations")
    print(f"💾 Automatic checkpointing every {SAVE_EVERY} rows")
    print(f"🔄 Auto-resume: Restart script to continue from checkpoint")
    print(f"🧹 Integrated cleanup: All outputs automatically cleaned of formatting artifacts")
    print("="*80)

    generator.process_data_file(
        INPUT_FILE, OUTPUT_FILE, DELAY_BETWEEN_CALLS, SAVE_EVERY, DEBUG_FIRST_N,
        START_ROW, END_ROW
    )

    print("\n🎉 Training data generation completed!")
    print(f"📁 Output file: {OUTPUT_FILE}")
    print("✨ Data is automatically cleaned and ready for training! 🧪")

if __name__ == "__main__":
    main()
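
# Example invocation (a sketch; assumes this file is saved as
# deepseek_training_data_generator.py and that DEEPSEEK_API_KEY is exported):
#   export DEEPSEEK_API_KEY=sk-...
#   python3 deepseek_training_data_generator.py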