#!/usr/bin/env python3
"""
Script to compile the complete BSG CyLlama training data.
Combines the cluster-aligned data with the remaining records from the full dataset.
"""
import pandas as pd
import numpy as np
from pathlib import Path


def compile_complete_training_data():
    """
    Compile the complete training data by combining cluster-aligned data
    with remaining records from the full dataset
    """
    print("Loading training data files...")

    # Load the datasets
    cluster_aligned_df = pd.read_csv('bsg_training_data_cluster_aligned.tsv', sep='\t')
    full_df = pd.read_csv('bsg_training_data_full.tsv', sep='\t')
    pubmed_clustered_df = pd.read_csv('pubmed_clustered_data_sciner.tsv', sep='\t')

    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records")
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")

    # Get the cluster-aligned columns
    cluster_columns = cluster_aligned_df.columns.tolist()
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary',
                    'Title', 'OriginalKeywords', 'OriginalText']
    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")

    # Extract SourceRow values that are already in the cluster-aligned data
    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")

    # Find records in the full dataset that are not in the cluster-aligned data
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)].copy()
    print(f"Missing records to be added: {len(missing_records)}")

    if len(missing_records) > 0:
        # For the missing records, we need to add the clustering columns.
        # We'll use placeholder values for now, similar to what might be in the aligned data.

        # Add missing columns with placeholder values
        missing_records['cluster_num_x'] = 'cluster_' + missing_records['SourceRow'].astype(str)
        missing_records['Index'] = missing_records['SourceRow']
        missing_records['ConcatenatedAbstracts'] = missing_records['AbstractSummary']
        missing_records['TopKeywords'] = missing_records['OriginalKeywords']
        missing_records['cluster_num_y'] = missing_records['SourceRow']

        # Reorder columns to match cluster_aligned_df
        missing_records = missing_records[cluster_columns]

        # Combine the datasets
        complete_training_data = pd.concat([cluster_aligned_df, missing_records], ignore_index=True)
        print(f"Complete training data: {len(complete_training_data)} records")

        # Save the complete dataset
        output_file = 'bsg_training_data_complete_aligned.tsv'
        complete_training_data.to_csv(output_file, sep='\t', index=False)
        print(f"Complete training data saved to: {output_file}")

        # Also check if we can use any clustering information from pubmed_clustered_data
        if len(pubmed_clustered_df) == len(full_df):
            print("PubMed clustered data has same length as full data - checking for additional clustering info...")

            # Check column overlap
            pubmed_columns = pubmed_clustered_df.columns.tolist()
            print(f"PubMed columns: {pubmed_columns}")

            # If there are useful clustering columns in pubmed data, merge them
            if 'cluster_num' in pubmed_columns or any('cluster' in col.lower() for col in pubmed_columns):
                print("Found clustering information in PubMed data, will create enhanced version...")

                # Create enhanced version with proper clustering from pubmed data
                enhanced_data = complete_training_data.copy()

                # Merge with pubmed clustering data based on SourceRow
                if 'SourceRow' in pubmed_columns:
                    pubmed_subset = pubmed_clustered_df[['SourceRow'] + [col for col in pubmed_columns if 'cluster' in col.lower()]]
                    enhanced_data = enhanced_data.merge(pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed'))

                    enhanced_output_file = 'bsg_training_data_complete_enhanced.tsv'
                    enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
                    print(f"Enhanced training data saved to: {enhanced_output_file}")

        return complete_training_data
    else:
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df


if __name__ == "__main__":
    # Change to the project directory (hardcoded path from the original environment)
    import os
    os.chdir('/home/joneill/bsg_cyllama')

    complete_data = compile_complete_training_data()
    print(f"Compilation complete! Final dataset has {len(complete_data)} records.")