jimnoneill committed on
Commit
a04d370
·
verified ·
1 Parent(s): 9bd985f

Upload compile_complete_training_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. compile_complete_training_data.py +105 -0
compile_complete_training_data.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to compile the complete BSG CyLLama training data
4
+ Combines the cluster-aligned data with the remaining records from the full dataset
5
+ """
6
+
7
+ import pandas as pd
8
+ import numpy as np
9
+ from pathlib import Path
10
+
11
def _add_placeholder_cluster_columns(records, column_order):
    """Return a copy of *records* with placeholder clustering columns added.

    The placeholders are derived from each record's own ``SourceRow``,
    ``AbstractSummary`` and ``OriginalKeywords`` values.  The result is
    restricted to, and ordered by, *column_order*; a ``KeyError`` is raised by
    pandas if any requested column is still absent.
    """
    records = records.copy()
    source_rows = records['SourceRow']

    records['cluster_num_x'] = 'cluster_' + source_rows.astype(str)
    records['Index'] = source_rows
    records['ConcatenatedAbstracts'] = records['AbstractSummary']
    records['TopKeywords'] = records['OriginalKeywords']
    records['cluster_num_y'] = source_rows

    return records[column_order]


def _merge_pubmed_clusters(training_data, pubmed_df):
    """Left-join the PubMed ``*cluster*`` columns onto *training_data*.

    The join key is ``SourceRow``.  The PubMed side is de-duplicated on that
    key first (keeping the first occurrence) so that a repeated ``SourceRow``
    cannot multiply rows of the training data.  Clashing column names coming
    from the PubMed side receive the ``_pubmed`` suffix.
    """
    cluster_cols = [col for col in pubmed_df.columns if 'cluster' in col.lower()]
    pubmed_subset = pubmed_df[['SourceRow'] + cluster_cols]
    pubmed_subset = pubmed_subset.drop_duplicates(subset='SourceRow')
    return training_data.merge(
        pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed')
    )


def compile_complete_training_data(data_dir='.'):
    """
    Compile the complete training data by combining cluster-aligned data
    with remaining records from the full dataset.

    Reads ``bsg_training_data_cluster_aligned.tsv``,
    ``bsg_training_data_full.tsv`` and ``pubmed_clustered_data_sciner.tsv``
    from *data_dir*.  Records of the full dataset whose ``SourceRow`` is not
    present in the cluster-aligned data are given placeholder clustering
    columns and appended to the cluster-aligned data; the result is written to
    ``bsg_training_data_complete_aligned.tsv`` in *data_dir*.

    If the PubMed table has exactly as many rows as the full dataset and has
    at least one column whose name contains "cluster", an additional
    ``bsg_training_data_complete_enhanced.tsv`` is written.  When the PubMed
    table also has a ``SourceRow`` column, its cluster columns are joined onto
    the complete data before writing.

    Args:
        data_dir: Directory holding the input TSV files and receiving the
            output files.  Defaults to the current working directory.

    Returns:
        The combined DataFrame, or the cluster-aligned DataFrame unchanged
        when the full dataset contains no additional records (in which case
        no file is written).
    """
    data_dir = Path(data_dir)

    print("Loading training data files...")

    # Load the datasets
    cluster_aligned_df = pd.read_csv(
        data_dir / 'bsg_training_data_cluster_aligned.tsv', sep='\t')
    full_df = pd.read_csv(data_dir / 'bsg_training_data_full.tsv', sep='\t')
    pubmed_clustered_df = pd.read_csv(
        data_dir / 'pubmed_clustered_data_sciner.tsv', sep='\t')

    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records")
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")

    # Get the cluster-aligned columns
    cluster_columns = cluster_aligned_df.columns.tolist()
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary',
                    'Title', 'OriginalKeywords', 'OriginalText']

    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")

    # Extract SourceRow values that are already in the cluster-aligned data
    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")

    # Find records in the full dataset that are not in the cluster-aligned data
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)]
    print(f"Missing records to be added: {len(missing_records)}")

    if missing_records.empty:
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df

    # The missing records carry no clustering information, so fill the
    # clustering columns with per-record placeholders and match the column
    # order of the cluster-aligned data.
    missing_records = _add_placeholder_cluster_columns(missing_records, cluster_columns)

    # Combine the datasets
    complete_training_data = pd.concat(
        [cluster_aligned_df, missing_records], ignore_index=True)

    print(f"Complete training data: {len(complete_training_data)} records")

    # Save the complete dataset
    output_file = data_dir / 'bsg_training_data_complete_aligned.tsv'
    complete_training_data.to_csv(output_file, sep='\t', index=False)
    print(f"Complete training data saved to: {output_file}")

    # Also check if we can use any clustering information from pubmed_clustered_data
    if len(pubmed_clustered_df) == len(full_df):
        print("PubMed clustered data has same length as full data - checking for additional clustering info...")

        # Check column overlap
        pubmed_columns = pubmed_clustered_df.columns.tolist()
        print(f"PubMed columns: {pubmed_columns}")

        # If there are useful clustering columns in pubmed data, merge them
        if any('cluster' in col.lower() for col in pubmed_columns):
            print("Found clustering information in PubMed data, will create enhanced version...")

            # Without a SourceRow key there is nothing to join on; the
            # enhanced file is then a plain copy of the complete data.
            if 'SourceRow' in pubmed_columns:
                enhanced_data = _merge_pubmed_clusters(
                    complete_training_data, pubmed_clustered_df)
            else:
                enhanced_data = complete_training_data.copy()

            enhanced_output_file = data_dir / 'bsg_training_data_complete_enhanced.tsv'
            enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
            print(f"Enhanced training data saved to: {enhanced_output_file}")

    return complete_training_data
95
+
96
if __name__ == "__main__":
    import os

    # Change to the script directory so the relative TSV paths resolve no
    # matter where the script is launched from.  This replaces a hard-coded,
    # machine-specific absolute path that failed on any other machine.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    complete_data = compile_complete_training_data()
    print(f"Compilation complete! Final dataset has {len(complete_data)} records.")
103
+
104
+
105
+