#!/usr/bin/env python3
"""
Script to compile the complete BSG CyLLama training data
Combines the cluster-aligned data with the remaining records from the full dataset
"""

import os
from pathlib import Path

import pandas as pd

def compile_complete_training_data():
    """
    Compile the complete training data by combining cluster-aligned data 
    with remaining records from the full dataset
    """
    print("Loading training data files...")
    
    # Load the datasets
    cluster_aligned_df = pd.read_csv('bsg_training_data_cluster_aligned.tsv', sep='\t')
    full_df = pd.read_csv('bsg_training_data_full.tsv', sep='\t')
    pubmed_clustered_df = pd.read_csv('pubmed_clustered_data_sciner.tsv', sep='\t')
    
    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records") 
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")
    
    # Get the cluster-aligned columns
    cluster_columns = cluster_aligned_df.columns.tolist()
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary', 
                   'Title', 'OriginalKeywords', 'OriginalText']
    
    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")
    
    # Extract SourceRow values that are already in the cluster-aligned data
    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")
    
    # Find records in the full dataset that are not in the cluster-aligned data
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)].copy()
    print(f"Missing records to be added: {len(missing_records)}")
    
    if len(missing_records) > 0:
        # The missing records lack the clustering columns, so fill them with
        # placeholder values that mirror the cluster-aligned schema
        
        # Add missing columns with placeholder values
        missing_records['cluster_num_x'] = 'cluster_' + missing_records['SourceRow'].astype(str)
        missing_records['Index'] = missing_records['SourceRow']
        missing_records['ConcatenatedAbstracts'] = missing_records['AbstractSummary']
        missing_records['TopKeywords'] = missing_records['OriginalKeywords'] 
        missing_records['cluster_num_y'] = missing_records['SourceRow']
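        # Every column in cluster_columns must now exist on missing_records,
        # otherwise the column selection below raises a KeyError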
        
        # Reorder columns to match cluster_aligned_df
        missing_records = missing_records[cluster_columns]
        
        # Combine the datasets
        complete_training_data = pd.concat([cluster_aligned_df, missing_records], ignore_index=True)
        
        print(f"Complete training data: {len(complete_training_data)} records")
        
        # Save the complete dataset
        output_file = 'bsg_training_data_complete_aligned.tsv'
        complete_training_data.to_csv(output_file, sep='\t', index=False)
        print(f"Complete training data saved to: {output_file}")
        
        # Also check if we can use any clustering information from pubmed_clustered_data
        if len(pubmed_clustered_df) == len(full_df):
            print("PubMed clustered data has same length as full data - checking for additional clustering info...")
            
            # Check column overlap
            pubmed_columns = pubmed_clustered_df.columns.tolist()
            print(f"PubMed columns: {pubmed_columns}")
            
            # If there are useful clustering columns in pubmed data, merge them
            if any('cluster' in col.lower() for col in pubmed_columns):
                print("Found clustering information in PubMed data, will create enhanced version...")
                
                # Create enhanced version with proper clustering from pubmed data
                enhanced_data = complete_training_data.copy()
                
                # Merge with pubmed clustering data based on SourceRow
                if 'SourceRow' in pubmed_columns:
                    pubmed_subset = pubmed_clustered_df[['SourceRow'] + [col for col in pubmed_columns if 'cluster' in col.lower()]]
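                    # suffixes=('', '_pubmed'): existing columns keep their names,
                    # overlapping columns coming from the PubMed data get '_pubmed'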
                    enhanced_data = enhanced_data.merge(pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed'))
                
                enhanced_output_file = 'bsg_training_data_complete_enhanced.tsv'
                enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
                print(f"Enhanced training data saved to: {enhanced_output_file}")
        
        return complete_training_data
    else:
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df

if __name__ == "__main__":
    # Run from the directory containing this script so the relative
    # TSV paths above resolve correctly
    os.chdir(Path(__file__).resolve().parent)
    
    complete_data = compile_complete_training_data()
    print(f"Compilation complete! Final dataset has {len(complete_data)} records.")