Upload compile_complete_training_data.py with huggingface_hub
Browse files
compile_complete_training_data.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Script to compile the complete BSG CyLLama training data
|
4 |
+
Combines the cluster-aligned data with the remaining records from the full dataset
|
5 |
+
"""
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
def compile_complete_training_data():
    """
    Compile the complete training data by combining cluster-aligned data
    with remaining records from the full dataset.

    Reads three TSV files from the current working directory:
      - ``bsg_training_data_cluster_aligned.tsv``: records already aligned to clusters
      - ``bsg_training_data_full.tsv``: the full record set
      - ``pubmed_clustered_data_sciner.tsv``: optional extra clustering info

    Writes ``bsg_training_data_complete_aligned.tsv`` (and, when the PubMed
    file is row-aligned with the full set and carries cluster columns, an
    enhanced ``bsg_training_data_complete_enhanced.tsv``).

    Returns:
        pd.DataFrame: the combined training data, or the cluster-aligned
        data unchanged when nothing was missing.
    """
    print("Loading training data files...")

    # Load the datasets
    cluster_aligned_df = pd.read_csv('bsg_training_data_cluster_aligned.tsv', sep='\t')
    full_df = pd.read_csv('bsg_training_data_full.tsv', sep='\t')
    pubmed_clustered_df = pd.read_csv('pubmed_clustered_data_sciner.tsv', sep='\t')

    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records")
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")

    # Target schema is whatever the cluster-aligned file provides.
    cluster_columns = cluster_aligned_df.columns.tolist()
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary',
                    'Title', 'OriginalKeywords', 'OriginalText']

    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")

    # SourceRow values already present in the cluster-aligned data.
    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")

    # Records in the full dataset that the cluster-aligned data lacks.
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)].copy()
    print(f"Missing records to be added: {len(missing_records)}")

    # Guard clause: nothing to merge in.
    if len(missing_records) == 0:
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df

    # The missing rows lack the clustering columns; synthesize per-row
    # placeholder values derived from SourceRow.
    missing_records['cluster_num_x'] = 'cluster_' + missing_records['SourceRow'].astype(str)
    missing_records['Index'] = missing_records['SourceRow']
    missing_records['ConcatenatedAbstracts'] = missing_records['AbstractSummary']
    missing_records['TopKeywords'] = missing_records['OriginalKeywords']
    missing_records['cluster_num_y'] = missing_records['SourceRow']

    # Align to the cluster-aligned schema. reindex (rather than plain
    # column selection) tolerates schema drift: any cluster column we did
    # not synthesize above becomes NaN instead of raising a KeyError.
    missing_records = missing_records.reindex(columns=cluster_columns)

    # Combine the datasets
    complete_training_data = pd.concat([cluster_aligned_df, missing_records], ignore_index=True)

    print(f"Complete training data: {len(complete_training_data)} records")

    # Save the complete dataset
    output_file = 'bsg_training_data_complete_aligned.tsv'
    complete_training_data.to_csv(output_file, sep='\t', index=False)
    print(f"Complete training data saved to: {output_file}")

    # Optionally fold in clustering info from the PubMed data when it is
    # row-aligned with the full dataset.
    if len(pubmed_clustered_df) == len(full_df):
        print("PubMed clustered data has same length as full data - checking for additional clustering info...")

        pubmed_columns = pubmed_clustered_df.columns.tolist()
        print(f"PubMed columns: {pubmed_columns}")

        # If there are useful clustering columns in pubmed data, merge them
        if 'cluster_num' in pubmed_columns or any('cluster' in col.lower() for col in pubmed_columns):
            print("Found clustering information in PubMed data, will create enhanced version...")

            enhanced_data = complete_training_data.copy()

            # Merge the PubMed cluster columns keyed on SourceRow.
            if 'SourceRow' in pubmed_columns:
                pubmed_subset = pubmed_clustered_df[['SourceRow'] + [col for col in pubmed_columns if 'cluster' in col.lower()]]
                enhanced_data = enhanced_data.merge(pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed'))

            enhanced_output_file = 'bsg_training_data_complete_enhanced.tsv'
            enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
            print(f"Enhanced training data saved to: {enhanced_output_file}")

    return complete_training_data
96 |
+
if __name__ == "__main__":
|
97 |
+
# Change to the script directory
|
98 |
+
import os
|
99 |
+
os.chdir('/home/joneill/bsg_cyllama')
|
100 |
+
|
101 |
+
complete_data = compile_complete_training_data()
|
102 |
+
print(f"Compilation complete! Final dataset has {len(complete_data)} records.")
|
103 |
+
|
104 |
+
|
105 |
+
|