#!/usr/bin/env python3
"""
Enhanced DeepSeek Training Data Generator for Scientific Summarization
Generates high-quality training data with integrated cleanup and row slicing
"""
import requests
import json
import pandas as pd
import time
import csv
import os
import re
import hashlib
from pathlib import Path
from typing import List, Tuple, Dict, Optional
from datetime import datetime, timedelta
class EnhancedDeepSeekTrainingDataGenerator:
"""Generate training data using DeepSeek API with integrated cleanup and row slicing"""
def __init__(self, api_key: str, base_url: str = "https://api.deepseek.com/v1"):
"""
Initialize DeepSeek API client
Args:
api_key: Your DeepSeek API key
base_url: DeepSeek API base URL
"""
self.api_key = api_key
self.base_url = base_url
self.headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
}
self.start_time = None
self.processed_count = 0
def clean_deepseek_output(self, text: str) -> str:
"""
Clean up DeepSeek output to remove formatting artifacts
Args:
text: Raw text from DeepSeek API
Returns:
Cleaned text without formatting artifacts
"""
if not text or pd.isna(text):
return text
text = str(text).strip()
# Remove numbered prefixes (1., 2., 3.)
text = re.sub(r'^\d+\.\s*', '', text)
# Remove component labels
text = re.sub(r'^(ABSTRACT[_\s]*SUMMARY:?|SHORT[_\s]*SUMMARY:?|TITLE:?)', '', text, flags=re.IGNORECASE)
# Remove excessive whitespace
text = re.sub(r'\s+', ' ', text)
# Remove trailing colons or dashes
text = re.sub(r'[:\-]+$', '', text)
# Remove markdown formatting
text = re.sub(r'\*+', '', text)
# Remove quotes that sometimes wrap the entire response
text = re.sub(r'^["\']+|["\']+$', '', text)
return text.strip()
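    # Illustrative example of the cleanup above (hypothetical input string):
    #   clean_deepseek_output("1. ABSTRACT_SUMMARY: **Key findings here.**")
    #   -> "Key findings here."  (numbered prefix, component label, and markdown stripped)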
def create_few_shot_prompt(self, concatenated_abstracts: str, keywords: str) -> str:
"""
Create optimized few-shot prompt for DeepSeek with clean output formatting
"""
prompt = (
"You are an expert scientific summarization assistant. Generate exactly three components separated by '|||':\n"
"1. ABSTRACT_SUMMARY: A detailed 4-6 sentence summary highlighting key findings, methods, and implications\n"
"2. SHORT_SUMMARY: A concise 2-3 sentence summary capturing the core essence\n"
"3. TITLE: A sophisticated, detailed title reflecting the research scope and methods\n\n"
"CRITICAL: Respond ONLY with the three components separated by '|||'. Do not include conversational text, explanations, or markdown formatting.\n\n"
"Format: ABSTRACT_SUMMARY|||SHORT_SUMMARY|||TITLE\n\n"
"Focus on:\n"
"- Specific computational methods, techniques, and approaches\n"
"- Key biological processes and mechanisms\n"
"- Research methodologies and experimental designs\n"
"- Clinical or therapeutic implications\n"
"- Be specific and detailed; avoid generic terms\n\n"
)
# Few-shot Example 1 - Immunology/Antimicrobial Research
example1_text = (
"Studies investigated mammary gland candidiasis models using immunocompetent and immunodeficient mice "
"treated with amphotericin B. Complement activation analysis revealed tissue inflammation patterns. "
"Research on antigen processing examined proteasome mutants lacking specific protease activities for "
"peptide generation. Novel ankyrin-repeat family member MAIL was identified with nuclear localization "
"potentiating IL-6 expression. Antimicrobial peptides pseudins 1-4 were isolated from frog skin showing "
"activity against various pathogens."
)
example1_keywords = "MAIL; proteasome; antimicrobial peptides; complement activation; mammary glands"
prompt += (
f"INPUT: {example1_text}\n"
f"KEYWORDS: {example1_keywords}\n"
"OUTPUT: "
"Comprehensive investigation of innate immune responses utilizing murine mammary gland candidiasis models "
"with complement activation analysis and proteasome-mediated antigen processing pathways, complemented by "
"characterization of novel antimicrobial peptides and nuclear transcription modulators. Research demonstrates "
"the critical role of specific protease activities in MHC class I-restricted peptide generation while identifying "
"MAIL as a nuclear factor potentiating cytokine expression and pseudins as promising therapeutic antimicrobials. "
"These findings advance understanding of immunopathological mechanisms and provide validated experimental models "
"for antifungal compound evaluation.|||"
"Studies utilized murine models to investigate immune responses in candidiasis while characterizing novel "
"antimicrobial compounds and antigen processing mechanisms. Research identified critical protease activities "
"and nuclear factors regulating immune responses.|||"
"Integrated Immunological Modeling and Antimicrobial Peptide Discovery: Proteasome-Mediated Antigen Processing "
"and Complement-Dependent Host Defense Mechanisms\n\n"
)
# Few-shot Example 2 - Biotechnology/Tissue Engineering
example2_text = (
"Biotechnology development focused on hematopoietic stem cell expansion using cytokine combinations. "
"Temperature-responsive polymers enabled designed cell sheet engineering for tissue applications. "
"Vascular anastomosis techniques using titanium clips reduced neointimal hyperplasia. Endothelial cell "
"seeding protocols for vascular grafts were optimized. Gene transfer therapies for therapeutic angiogenesis "
"showed clinical promise in cardiovascular applications."
)
example2_keywords = "biotechnology; tissue engineering; vascular grafts; stem cells; angiogenesis"
prompt += (
f"INPUT: {example2_text}\n"
f"KEYWORDS: {example2_keywords}\n"
"OUTPUT: "
"Advanced biotechnology approaches combining cytokine-mediated hematopoietic stem cell expansion protocols "
"with temperature-responsive polymer systems for precision cell sheet engineering and vascular reconstruction. "
"Integration of titanium clip anastomosis techniques and optimized endothelial cell seeding methodologies "
"demonstrates significant reduction in neointimal hyperplasia while enhancing graft patency. Gene transfer "
"strategies for therapeutic angiogenesis represent promising clinical interventions for cardiovascular disease "
"treatment, establishing proof-of-concept for growth factor-mediated collateral vessel development.|||"
"Research combines stem cell expansion technologies with polymer-based cell engineering and vascular "
"reconstruction techniques. Gene therapy approaches show clinical promise for treating cardiovascular disease "
"through enhanced angiogenesis.|||"
"Multiscale Biotechnology Integration: Cytokine-Mediated Stem Cell Engineering and Polymer-Assisted "
"Vascular Reconstruction with Gene Transfer-Enhanced Therapeutic Angiogenesis\n\n"
)
# User query
prompt += (
f"INPUT: {concatenated_abstracts}\n"
f"KEYWORDS: {keywords}\n"
"OUTPUT:"
)
return prompt
def call_deepseek_api(self, prompt: str, max_retries: int = 3) -> str:
"""
Call DeepSeek API with enhanced retry logic and timeout handling
"""
for attempt in range(max_retries):
try:
payload = {
"model": "deepseek-chat", # DeepSeek-V3 instruct model
"messages": [
{
"role": "user",
"content": prompt
}
],
"max_tokens": 800,
"temperature": 0.7,
"top_p": 0.9,
"stream": False
}
# Enhanced timeout handling
response = requests.post(
f"{self.base_url}/chat/completions",
headers=self.headers,
json=payload,
timeout=(10, 60) # (connection timeout, read timeout)
)
if response.status_code == 200:
result = response.json()
return result['choices'][0]['message']['content'].strip()
elif response.status_code == 429: # Rate limit
wait_time = min(60, 2 ** attempt * 30)
print(f"Rate limit hit. Waiting {wait_time} seconds...")
time.sleep(wait_time)
continue
elif response.status_code >= 500: # Server errors
wait_time = min(30, 2 ** attempt * 5)
print(f"Server error {response.status_code}. Retrying in {wait_time} seconds...")
time.sleep(wait_time)
continue
else:
print(f"API Error {response.status_code}: {response.text}")
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
else:
return ""
except requests.exceptions.Timeout as e:
print(f"Timeout error on attempt {attempt + 1}: {e}")
if attempt < max_retries - 1:
wait_time = min(30, 2 ** attempt * 10)
print(f"Retrying in {wait_time} seconds...")
time.sleep(wait_time)
continue
else:
print(f"Max retries exceeded due to timeout")
return ""
except requests.exceptions.ConnectionError as e:
print(f"Connection error on attempt {attempt + 1}: {e}")
if attempt < max_retries - 1:
wait_time = min(30, 2 ** attempt * 10)
print(f"Retrying in {wait_time} seconds...")
time.sleep(wait_time)
continue
else:
print(f"Max retries exceeded due to connection error")
return ""
except Exception as e:
print(f"Attempt {attempt + 1} failed: {str(e)}")
if attempt < max_retries - 1:
time.sleep(2 ** attempt)
continue
else:
return ""
return ""
def parse_response(self, response: str) -> Tuple[str, str, str]:
"""
Enhanced parsing for DeepSeek responses with integrated cleanup
"""
if not response:
return "Failed to generate", "Failed to generate", "Failed to generate"
# Clean the response first
response = response.strip()
# Remove common DeepSeek conversational elements
conversational_starters = [
"Here are the structured outputs",
"Here's the structured output",
"Based on the provided keywords",
"Let me know if you'd like",
"Would you like me to",
"I can help you",
"Here's my analysis"
]
for starter in conversational_starters:
if response.startswith(starter):
# Find the actual content after conversational part
lines = response.split('\n')
content_lines = []
found_content = False
for line in lines:
if any(marker in line.upper() for marker in ['ABSTRACT_SUMMARY:', 'ABSTRACT:', '1.', '**1.']):
found_content = True
if found_content:
content_lines.append(line)
if content_lines:
response = '\n'.join(content_lines)
break
# Remove markdown formatting
response = re.sub(r'\*\*(\d+\.)\*\*', r'\1', response) # **1.** -> 1.
response = re.sub(r'\*\*(.*?)\*\*', r'\1', response) # **text** -> text
response = re.sub(r'^\s*---\s*$', '', response, flags=re.MULTILINE) # Remove --- lines
abstract_summary = ""
short_summary = ""
title = ""
try:
# Method 1: Look for standard ||| separator
if '|||' in response:
parts = [part.strip() for part in response.split('|||')]
if len(parts) >= 3:
abstract_summary = parts[0]
short_summary = parts[1]
title = parts[2]
elif len(parts) == 2:
abstract_summary = parts[0]
title = parts[1]
# Generate short summary from abstract
sentences = re.split(r'[.!?]+', abstract_summary)
short_summary = '. '.join(sentences[:2]).strip() + '.'
# Method 2: Look for numbered sections (DeepSeek's preferred format)
elif "1. ABSTRACT_SUMMARY:" in response or "1.ABSTRACT_SUMMARY:" in response:
# Extract by numbered sections
abstract_match = re.search(r'1\.?\s*ABSTRACT_SUMMARY:\s*(.*?)(?=2\.|3\.|$)', response, re.DOTALL | re.IGNORECASE)
short_match = re.search(r'2\.?\s*SHORT_SUMMARY:\s*(.*?)(?=3\.|$)', response, re.DOTALL | re.IGNORECASE)
title_match = re.search(r'3\.?\s*TITLE:\s*(.*?)(?=\n\n|$)', response, re.DOTALL | re.IGNORECASE)
if abstract_match:
abstract_summary = abstract_match.group(1).strip()
if short_match:
short_summary = short_match.group(1).strip()
if title_match:
title = title_match.group(1).strip()
# Method 3: Look for any mention of the three components
else:
# Try to find ABSTRACT_SUMMARY, SHORT_SUMMARY, TITLE anywhere
abstract_match = re.search(r'ABSTRACT[_\s]*SUMMARY:?\s*(.*?)(?=SHORT|TITLE|$)', response, re.DOTALL | re.IGNORECASE)
short_match = re.search(r'SHORT[_\s]*SUMMARY:?\s*(.*?)(?=TITLE|$)', response, re.DOTALL | re.IGNORECASE)
title_match = re.search(r'TITLE:?\s*(.*?)(?=\n|$)', response, re.DOTALL | re.IGNORECASE)
if abstract_match:
abstract_summary = abstract_match.group(1).strip()
if short_match:
short_summary = short_match.group(1).strip()
if title_match:
title = title_match.group(1).strip()
except Exception as e:
print(f"Error in enhanced parsing: {e}")
# Fallback: if still no content, try to extract from the full response
if not abstract_summary and not short_summary and not title:
# Split response into sentences and distribute intelligently
sentences = re.split(r'[.!?]+', response)
sentences = [s.strip() for s in sentences if s.strip() and len(s.strip()) > 10]
if len(sentences) >= 6:
abstract_summary = '. '.join(sentences[:4]) + '.'
short_summary = '. '.join(sentences[4:6]) + '.'
title = sentences[6] if len(sentences) > 6 else "Advanced Scientific Research Analysis"
elif len(sentences) >= 3:
abstract_summary = '. '.join(sentences[:2]) + '.'
short_summary = sentences[2] + '.'
title = sentences[-1] if len(sentences) > 3 else "Scientific Research Study"
elif len(sentences) >= 1:
abstract_summary = sentences[0]
short_summary = sentences[0][:100] + "..." if len(sentences[0]) > 100 else sentences[0]
title = "Scientific Analysis"
else:
abstract_summary = response[:200] + "..." if len(response) > 200 else response
short_summary = response[:100] + "..." if len(response) > 100 else response
title = "Research Summary"
# Apply integrated cleanup to all components
abstract_summary = self.clean_deepseek_output(abstract_summary)
short_summary = self.clean_deepseek_output(short_summary)
title = self.clean_deepseek_output(title)
# Ensure reasonable lengths after cleanup
if len(abstract_summary.split()) > 150:
abstract_summary = ' '.join(abstract_summary.split()[:150]) + "..."
if len(short_summary.split()) > 75:
short_summary = ' '.join(short_summary.split()[:75]) + "..."
if len(title.split()) > 25:
title = ' '.join(title.split()[:25]) + "..."
# Final validation - ensure we have actual content
if not abstract_summary or abstract_summary in ["", "Content not extracted", "Content not properly extracted"]:
abstract_summary = "Content generation failed"
if not short_summary or short_summary in ["", "Content not extracted", "Content not properly extracted"]:
short_summary = "Content generation failed"
if not title or title in ["", "Content not extracted", "Content not properly extracted"]:
title = "Content generation failed"
return abstract_summary, short_summary, title
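    # Illustrative example with a well-formed, hypothetical response:
    #   parse_response("Detailed multi-sentence summary.|||Concise recap.|||A Descriptive Title")
    #   -> ("Detailed multi-sentence summary.", "Concise recap.", "A Descriptive Title")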
def load_checkpoint(self, checkpoint_file: str) -> Tuple[List[Dict], set]:
"""
Load existing checkpoint data and return processed data + processed indices
"""
if os.path.exists(checkpoint_file):
try:
df = pd.read_csv(checkpoint_file, sep='\t')
processed_data = df.to_dict('records')
processed_indices = set(df['OriginalIndex'].astype(str))
print(f"✓ Loaded checkpoint with {len(processed_data)} processed entries")
return processed_data, processed_indices
except Exception as e:
print(f"Error loading checkpoint: {e}")
return [], set()
return [], set()
def save_checkpoint(self, output_data: List[Dict], checkpoint_file: str):
"""
Save current progress to checkpoint file
"""
try:
df = pd.DataFrame(output_data)
df.to_csv(checkpoint_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)
print(f"💾 Checkpoint saved: {len(output_data)} entries")
except Exception as e:
print(f"Error saving checkpoint: {e}")
def estimate_time_remaining(self, current_progress: int, total_rows: int) -> str:
"""
Estimate time remaining based on current progress
"""
if self.start_time is None or current_progress == 0:
return "Calculating..."
elapsed = datetime.now() - self.start_time
elapsed_seconds = elapsed.total_seconds()
if current_progress > 0:
avg_time_per_row = elapsed_seconds / current_progress
remaining_rows = total_rows - current_progress
remaining_seconds = remaining_rows * avg_time_per_row
remaining_time = timedelta(seconds=int(remaining_seconds))
return str(remaining_time)
return "Calculating..."
def process_data_file(self, input_file: str, output_file: str, delay: float = 1.0,
save_every: int = 50, debug_first_n: int = 3,
start_row: int = 0, end_row: Optional[int] = None):
"""
Process the input TSV file and generate training data with checkpointing and row slicing
Args:
input_file: Path to input TSV file
output_file: Path to output TSV file
delay: Delay between API calls to respect rate limits
save_every: Save checkpoint every N processed rows
debug_first_n: Print full input/output for first N generations for QC
start_row: Starting row index (0-based)
end_row: Ending row index (0-based, None for all remaining rows)
"""
self.start_time = datetime.now()
# Setup checkpoint file
checkpoint_file = output_file.replace('.tsv', '_checkpoint.tsv')
# Load existing checkpoint
output_data, processed_indices = self.load_checkpoint(checkpoint_file)
# Read input data
try:
df = pd.read_csv(input_file, sep='\t')
except Exception as e:
print(f"Error reading input file: {e}")
return
# Apply row slicing
original_length = len(df)
if end_row is None:
end_row = original_length
else:
end_row = min(end_row, original_length)
if start_row >= original_length:
print(f"❌ Error: start_row {start_row} is >= total rows {original_length}")
return
df_slice = df.iloc[start_row:end_row].copy()
total_rows = len(df_slice)
initial_processed = len(output_data)
print(f"📊 Processing Overview:")
print(f" Input file total rows: {original_length}")
print(f" Processing slice: rows {start_row} to {end_row-1}")
print(f" Rows in slice: {total_rows}")
print(f" Already processed: {initial_processed}")
print(f" Remaining: {total_rows - initial_processed}")
print(f" Checkpoint saves every {save_every} rows")
print(f" Estimated cost: ~${total_rows * 0.0014:.2f}")
print(f" Estimated time: ~{total_rows * 1.5 / 3600:.1f} hours")
print(f" Debug mode: First {debug_first_n} generations will show detailed output")
print("-" * 80)
successful_processed = 0
failed_processed = 0
generations_count = 0
processed_this_run = 0
for index, row in df_slice.iterrows():
original_index = str(row.get('Index', index))
# Skip if already processed
if original_index in processed_indices:
continue
concatenated_abstracts = str(row.get('ConcatenatedAbstracts', ''))
keywords = str(row.get('TopKeywords', ''))
# Skip if no content
if not concatenated_abstracts or concatenated_abstracts == 'nan':
print(f"[{processed_this_run + 1}/{total_rows}] Skipping empty cluster {original_index}")
continue
            # read_csv yields a 0-based RangeIndex preserved by .iloc, so this is the true source row even when rows are skipped
            actual_row_num = index
print(f"[{processed_this_run + 1}/{total_rows}] Processing row {actual_row_num} (cluster {original_index})...")
# Create prompt
prompt = self.create_few_shot_prompt(concatenated_abstracts, keywords)
# DEBUG: Print detailed input/output for first few generations
if generations_count < debug_first_n:
print("\n" + "="*80)
print(f"🔍 DEBUG OUTPUT FOR GENERATION #{generations_count + 1}")
print("="*80)
print(f"📋 CLUSTER INDEX: {original_index} (Row {actual_row_num})")
print(f"🔑 KEYWORDS: {keywords}")
print(f"📄 ABSTRACTS (first 500 chars): {concatenated_abstracts[:500]}...")
print("\n" + "-"*60)
print("📤 FULL PROMPT BEING SENT TO API:")
print("-"*60)
print(prompt)
print("-"*60)
# Call API
response = self.call_deepseek_api(prompt)
# Continue debug printing
if generations_count < debug_first_n:
print("📥 RAW API RESPONSE:")
print("-"*60)
print(response if response else "❌ NO RESPONSE / ERROR")
print("-"*60)
if response:
# Parse response (now includes integrated cleanup)
abstract_summary, short_summary, title = self.parse_response(response)
# Continue debug printing
if generations_count < debug_first_n:
print("🔧 PARSED & CLEANED COMPONENTS:")
print("-"*60)
print(f"📝 ABSTRACT SUMMARY:\n{abstract_summary}\n")
print(f"⚡ SHORT SUMMARY:\n{short_summary}\n")
print(f"🏷️ TITLE:\n{title}\n")
print("="*80 + "\n")
# Add to output data
output_data.append({
'OriginalIndex': original_index,
'SourceRow': actual_row_num, # Track original row number
'AbstractSummary': abstract_summary,
'ShortSummary': short_summary,
'Title': title,
'OriginalKeywords': keywords,
'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
})
successful_processed += 1
print(f"✓ Success! ({successful_processed} total successes)")
else:
if generations_count < debug_first_n:
print("❌ FAILED TO PARSE OR GET RESPONSE")
print("="*80 + "\n")
print(f"✗ Failed to process cluster {original_index}")
# Add empty entry to maintain tracking
output_data.append({
'OriginalIndex': original_index,
'SourceRow': actual_row_num,
'AbstractSummary': 'Failed to generate',
'ShortSummary': 'Failed to generate',
'Title': 'Failed to generate',
'OriginalKeywords': keywords,
'OriginalText': concatenated_abstracts[:1000] + "..." if len(concatenated_abstracts) > 1000 else concatenated_abstracts
})
failed_processed += 1
generations_count += 1
processed_this_run += 1
# Update processed set
processed_indices.add(original_index)
# Save checkpoint periodically
if len(output_data) % save_every == 0:
self.save_checkpoint(output_data, checkpoint_file)
time_remaining = self.estimate_time_remaining(processed_this_run, total_rows)
print(f"📁 Checkpoint saved! Progress: {processed_this_run}/{total_rows} | ETA: {time_remaining}")
# Rate limiting
time.sleep(delay)
# Final save
try:
output_df = pd.DataFrame(output_data)
output_df.to_csv(output_file, sep='\t', index=False, quoting=csv.QUOTE_ALL)
print(f"\n🎉 GENERATION COMPLETED!")
print(f"✓ Successfully processed: {successful_processed}")
print(f"✗ Failed: {failed_processed}")
print(f"📄 Total entries saved: {len(output_data)}")
print(f"💾 Final output saved to: {output_file}")
print(f"💰 Estimated cost: ~${successful_processed * 0.0014:.2f}")
print(f"📊 Processed rows {start_row} to {end_row-1} from source file")
# Clean up checkpoint file
if os.path.exists(checkpoint_file):
os.remove(checkpoint_file)
print(f"🗑️ Checkpoint file cleaned up")
except Exception as e:
print(f"Error saving final output file: {e}")
print(f"Your data is still safe in checkpoint: {checkpoint_file}")
def main():
"""
Main function to run the training data generation with row slicing
"""
# Configuration for processing all 30,000 examples
API_KEY = "sk-6185ef64c68d473d984963356ab0378e" # Replace with your actual API key
INPUT_FILE = "/home/joneill/pubmed_clustered_data_sciner.tsv" # Your input TSV file
# Row slicing configuration - MODIFY THESE FOR YOUR BATCHES
START_ROW = 0 # Starting row (0-based)
END_ROW = 30000 # Ending row (None for all rows, or specify number)
BATCH_NAME = "full" # Used in output filename
# You can also run in batches, e.g.:
# Batch 1: START_ROW = 0, END_ROW = 5000, BATCH_NAME = "batch1"
# Batch 2: START_ROW = 5000, END_ROW = 10000, BATCH_NAME = "batch2"
# Batch 3: START_ROW = 10000, END_ROW = 15000, BATCH_NAME = "batch3"
# etc.
OUTPUT_FILE = f"bsg_training_data_{BATCH_NAME}.tsv" # Output file for training data
DELAY_BETWEEN_CALLS = 1.0 # Seconds between API calls
SAVE_EVERY = 50 # Save checkpoint every N rows
DEBUG_FIRST_N = 3 # Print full input/output for first N generations for QC
# Initialize generator
generator = EnhancedDeepSeekTrainingDataGenerator(API_KEY)
# Calculate batch info
total_rows_to_process = END_ROW - START_ROW if END_ROW else "all remaining"
# Process data
print("🚀 Starting Enhanced DeepSeek Training Data Generation")
print("="*80)
print(f"🎯 Processing: {total_rows_to_process} rows (from row {START_ROW} to {END_ROW-1 if END_ROW else 'end'})")
print(f"💰 Estimated cost: ~${(END_ROW - START_ROW if END_ROW else 30000) * 0.0014:.2f}")
print(f"⏱️ Estimated time: ~{(END_ROW - START_ROW if END_ROW else 30000) * 1.5 / 3600:.1f} hours")
print(f"🔍 Debug mode: Will show detailed input/output for first {DEBUG_FIRST_N} generations")
print(f"💾 Automatic checkpointing every {SAVE_EVERY} rows")
print(f"🔄 Auto-resume: Restart script to continue from checkpoint")
print(f"🧹 Integrated cleanup: All outputs automatically cleaned of formatting artifacts")
print("="*80)
generator.process_data_file(
INPUT_FILE, OUTPUT_FILE, DELAY_BETWEEN_CALLS, SAVE_EVERY, DEBUG_FIRST_N,
START_ROW, END_ROW
)
print("\n🎉 Training data generation completed!")
print(f"📁 Output file: {OUTPUT_FILE}")
print("✨ Data is automatically cleaned and ready for training! 🧪")
if __name__ == "__main__":
main()