|
|
|
""" |
|
Script to compile the complete BSG CyLLama training data.

Combines the cluster-aligned data with the remaining records from the full dataset.
|
""" |
|
|
|
import pandas as pd |
|
import numpy as np |
|
from pathlib import Path |
|
|
|
def compile_complete_training_data(
    cluster_aligned_path='bsg_training_data_cluster_aligned.tsv',
    full_path='bsg_training_data_full.tsv',
    pubmed_clustered_path='pubmed_clustered_data_sciner.tsv',
    output_file='bsg_training_data_complete_aligned.tsv',
    enhanced_output_file='bsg_training_data_complete_enhanced.tsv',
):
    """Compile the complete training data and write it to disk.

    Loads the cluster-aligned, full and PubMed-clustered TSV files. Every row
    of the full data whose ``SourceRow`` does not appear in the cluster-aligned
    data is converted to the cluster-aligned column layout (its cluster columns
    are derived from ``SourceRow``, ``AbstractSummary`` and
    ``OriginalKeywords``) and appended after the cluster-aligned rows. The
    combined table is written to ``output_file``.

    If the PubMed-clustered file has as many rows as the full file, contains a
    ``SourceRow`` column and at least one column whose name contains
    "cluster" (case-insensitive), those columns are left-joined onto the
    combined table and the result is written to ``enhanced_output_file``.

    Args:
        cluster_aligned_path: Tab-separated file with the cluster-aligned
            records. Must contain a ``SourceRow`` column; its column list
            defines the layout of the output.
        full_path: Tab-separated file with the full dataset. Must contain
            ``SourceRow``, ``AbstractSummary`` and ``OriginalKeywords``.
        pubmed_clustered_path: Tab-separated file with PubMed clustering data.
        output_file: Destination of the combined table (tab-separated).
        enhanced_output_file: Destination of the combined table enriched with
            the PubMed cluster columns (tab-separated).

    Returns:
        The combined DataFrame, or the unmodified cluster-aligned DataFrame
        when the full data contains no additional records. In the latter case
        no file is written.

    Raises:
        KeyError: If a required column is absent, including any cluster-aligned
            column that cannot be produced for the appended records.
    """
    print("Loading training data files...")

    cluster_aligned_df = pd.read_csv(cluster_aligned_path, sep='\t')
    full_df = pd.read_csv(full_path, sep='\t')
    pubmed_clustered_df = pd.read_csv(pubmed_clustered_path, sep='\t')

    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records")
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")

    # The cluster-aligned column order is the target layout for every output row.
    cluster_columns = cluster_aligned_df.columns.tolist()
    # Only reported for diagnostics; not used to select columns.
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary',
                    'Title', 'OriginalKeywords', 'OriginalText']

    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")

    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")

    # Rows of the full dataset that have no cluster-aligned counterpart yet.
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)].copy()
    print(f"Missing records to be added: {len(missing_records)}")

    if missing_records.empty:
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df

    # Each missing record becomes its own single-record cluster: the cluster
    # identifiers come from SourceRow and the cluster-level text fields are
    # copied from the record's own summary and keywords.
    missing_records['cluster_num_x'] = 'cluster_' + missing_records['SourceRow'].astype(str)
    missing_records['Index'] = missing_records['SourceRow']
    missing_records['ConcatenatedAbstracts'] = missing_records['AbstractSummary']
    missing_records['TopKeywords'] = missing_records['OriginalKeywords']
    missing_records['cluster_num_y'] = missing_records['SourceRow']

    # Align to the cluster-aligned layout so the concat below lines up by column.
    missing_records = missing_records[cluster_columns]

    complete_training_data = pd.concat([cluster_aligned_df, missing_records], ignore_index=True)
    print(f"Complete training data: {len(complete_training_data)} records")

    complete_training_data.to_csv(output_file, sep='\t', index=False)
    print(f"Complete training data saved to: {output_file}")

    if len(pubmed_clustered_df) == len(full_df):
        print("PubMed clustered data has same length as full data - checking for additional clustering info...")

        pubmed_columns = pubmed_clustered_df.columns.tolist()
        print(f"PubMed columns: {pubmed_columns}")

        cluster_info_columns = [col for col in pubmed_columns if 'cluster' in col.lower()]
        if cluster_info_columns:
            print("Found clustering information in PubMed data, will create enhanced version...")

            if 'SourceRow' in pubmed_columns:
                # Keep one PubMed row per SourceRow: a left merge against
                # repeated keys would otherwise duplicate training records.
                pubmed_subset = pubmed_clustered_df[['SourceRow'] + cluster_info_columns]
                pubmed_subset = pubmed_subset.drop_duplicates(subset='SourceRow')

                enhanced_data = complete_training_data.merge(
                    pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed'))

                enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
                print(f"Enhanced training data saved to: {enhanced_output_file}")

    return complete_training_data
|
|
|
if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    import os

    # Switch to the project directory before compiling the data.
    project_dir = '/home/joneill/bsg_cyllama'
    os.chdir(project_dir)

    complete_data = compile_complete_training_data()
    record_count = len(complete_data)
    print(f"Compilation complete! Final dataset has {record_count} records.")
|
|
|
|
|
|
|
|