#!/usr/bin/env python3
"""
Script to compile the complete BSG CyLlama training data.
Combines the cluster-aligned data with the remaining records from the full dataset.
"""
import pandas as pd
import numpy as np
from pathlib import Path


def compile_complete_training_data():
    """
    Compile the complete training data by combining cluster-aligned data
    with remaining records from the full dataset
    """
    print("Loading training data files...")

    # Load the datasets
    cluster_aligned_df = pd.read_csv('bsg_training_data_cluster_aligned.tsv', sep='\t')
    full_df = pd.read_csv('bsg_training_data_full.tsv', sep='\t')
    pubmed_clustered_df = pd.read_csv('pubmed_clustered_data_sciner.tsv', sep='\t')

    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records")
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")

    # Get the cluster-aligned columns
    cluster_columns = cluster_aligned_df.columns.tolist()
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary',
                    'Title', 'OriginalKeywords', 'OriginalText']
    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")

    # Extract SourceRow values that are already in the cluster-aligned data
    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")

    # Find records in the full dataset that are not in the cluster-aligned data
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)].copy()
    print(f"Missing records to be added: {len(missing_records)}")

    if len(missing_records) > 0:
        # For the missing records, we need to add the clustering columns.
        # We'll use placeholder values for now, similar to what might be in the aligned data.

        # Add missing columns with placeholder values
        missing_records['cluster_num_x'] = 'cluster_' + missing_records['SourceRow'].astype(str)
        missing_records['Index'] = missing_records['SourceRow']
        missing_records['ConcatenatedAbstracts'] = missing_records['AbstractSummary']
        missing_records['TopKeywords'] = missing_records['OriginalKeywords']
        missing_records['cluster_num_y'] = missing_records['SourceRow']

        # Reorder columns to match cluster_aligned_df
        missing_records = missing_records[cluster_columns]

        # Combine the datasets
        complete_training_data = pd.concat([cluster_aligned_df, missing_records], ignore_index=True)
        print(f"Complete training data: {len(complete_training_data)} records")

        # Save the complete dataset
        output_file = 'bsg_training_data_complete_aligned.tsv'
        complete_training_data.to_csv(output_file, sep='\t', index=False)
        print(f"Complete training data saved to: {output_file}")

        # Also check if we can use any clustering information from pubmed_clustered_data
        if len(pubmed_clustered_df) == len(full_df):
            print("PubMed clustered data has same length as full data - checking for additional clustering info...")

            # Check column overlap
            pubmed_columns = pubmed_clustered_df.columns.tolist()
            print(f"PubMed columns: {pubmed_columns}")

            # If there are useful clustering columns in pubmed data, merge them
            if 'cluster_num' in pubmed_columns or any('cluster' in col.lower() for col in pubmed_columns):
                print("Found clustering information in PubMed data, will create enhanced version...")

                # Create enhanced version with proper clustering from pubmed data
                enhanced_data = complete_training_data.copy()

                # Merge with pubmed clustering data based on SourceRow
                if 'SourceRow' in pubmed_columns:
                    pubmed_subset = pubmed_clustered_df[['SourceRow'] + [col for col in pubmed_columns if 'cluster' in col.lower()]]
                    enhanced_data = enhanced_data.merge(pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed'))

                    enhanced_output_file = 'bsg_training_data_complete_enhanced.tsv'
                    enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
                    print(f"Enhanced training data saved to: {enhanced_output_file}")

        return complete_training_data
    else:
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df


if __name__ == "__main__":
    # Change to the project directory (hardcoded path from the original environment)
    import os
    os.chdir('/home/joneill/bsg_cyllama')

    complete_data = compile_complete_training_data()
    print(f"Compilation complete! Final dataset has {len(complete_data)} records.")