Upload compile_complete_training_data.py with huggingface_hub
Browse files
compile_complete_training_data.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Script to compile the complete BSG CyLLama training data
|
4 |
+
Combines the cluster-aligned data with the remaining records from the full dataset
|
5 |
+
"""
|
6 |
+
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
def compile_complete_training_data():
    """
    Compile the complete training data by combining cluster-aligned data
    with remaining records from the full dataset.

    Reads three TSV files from the current working directory:
      - ``bsg_training_data_cluster_aligned.tsv``: records already aligned to clusters
      - ``bsg_training_data_full.tsv``: the full record set
      - ``pubmed_clustered_data_sciner.tsv``: optional extra clustering info

    Writes ``bsg_training_data_complete_aligned.tsv`` (and, when the PubMed
    file is row-aligned with the full set and carries cluster columns, an
    enhanced ``bsg_training_data_complete_enhanced.tsv``).

    Returns:
        pd.DataFrame: the combined training data, or the cluster-aligned
        data unchanged when nothing was missing.
    """
    print("Loading training data files...")

    # Load the datasets
    cluster_aligned_df = pd.read_csv('bsg_training_data_cluster_aligned.tsv', sep='\t')
    full_df = pd.read_csv('bsg_training_data_full.tsv', sep='\t')
    pubmed_clustered_df = pd.read_csv('pubmed_clustered_data_sciner.tsv', sep='\t')

    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records")
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")

    # Target schema is whatever the cluster-aligned file provides.
    cluster_columns = cluster_aligned_df.columns.tolist()
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary',
                    'Title', 'OriginalKeywords', 'OriginalText']

    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")

    # SourceRow values already present in the cluster-aligned data.
    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")

    # Records in the full dataset that the cluster-aligned data lacks.
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)].copy()
    print(f"Missing records to be added: {len(missing_records)}")

    # Guard clause: nothing to merge in.
    if len(missing_records) == 0:
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df

    # The missing rows lack the clustering columns; synthesize per-row
    # placeholder values derived from SourceRow.
    missing_records['cluster_num_x'] = 'cluster_' + missing_records['SourceRow'].astype(str)
    missing_records['Index'] = missing_records['SourceRow']
    missing_records['ConcatenatedAbstracts'] = missing_records['AbstractSummary']
    missing_records['TopKeywords'] = missing_records['OriginalKeywords']
    missing_records['cluster_num_y'] = missing_records['SourceRow']

    # Align to the cluster-aligned schema. reindex (rather than plain
    # column selection) tolerates schema drift: any cluster column we did
    # not synthesize above becomes NaN instead of raising a KeyError.
    missing_records = missing_records.reindex(columns=cluster_columns)

    # Combine the datasets
    complete_training_data = pd.concat([cluster_aligned_df, missing_records], ignore_index=True)

    print(f"Complete training data: {len(complete_training_data)} records")

    # Save the complete dataset
    output_file = 'bsg_training_data_complete_aligned.tsv'
    complete_training_data.to_csv(output_file, sep='\t', index=False)
    print(f"Complete training data saved to: {output_file}")

    # Optionally fold in clustering info from the PubMed data when it is
    # row-aligned with the full dataset.
    if len(pubmed_clustered_df) == len(full_df):
        print("PubMed clustered data has same length as full data - checking for additional clustering info...")

        pubmed_columns = pubmed_clustered_df.columns.tolist()
        print(f"PubMed columns: {pubmed_columns}")

        # If there are useful clustering columns in pubmed data, merge them
        if 'cluster_num' in pubmed_columns or any('cluster' in col.lower() for col in pubmed_columns):
            print("Found clustering information in PubMed data, will create enhanced version...")

            enhanced_data = complete_training_data.copy()

            # Merge the PubMed cluster columns keyed on SourceRow.
            if 'SourceRow' in pubmed_columns:
                pubmed_subset = pubmed_clustered_df[['SourceRow'] + [col for col in pubmed_columns if 'cluster' in col.lower()]]
                enhanced_data = enhanced_data.merge(pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed'))

            enhanced_output_file = 'bsg_training_data_complete_enhanced.tsv'
            enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
            print(f"Enhanced training data saved to: {enhanced_output_file}")

    return complete_training_data
96 |
+
if __name__ == "__main__":
|
97 |
+
# Change to the script directory
|
98 |
+
import os
|
99 |
+
os.chdir('/home/joneill/bsg_cyllama')
|
100 |
+
|
101 |
+
complete_data = compile_complete_training_data()
|
102 |
+
print(f"Compilation complete! Final dataset has {len(complete_data)} records.")
|
103 |
+
|
104 |
+
|
105 |
+
|