"""
Example usage script for the LLM2Vec4CXR model.

This demonstrates how to load and use the model for chest X-ray report analysis.

Prerequisites:

1. Install the LLM2Vec4CXR package:

       pip install git+https://github.com/lukeingawesome/llm2vec4cxr.git

   Or clone and install in development mode:

       git clone https://github.com/lukeingawesome/llm2vec4cxr.git
       cd llm2vec4cxr
       pip install -e .

2. The model will be downloaded automatically from Hugging Face on first use.
"""

import torch
import torch.nn.functional as F

from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec


def load_llm2vec4cxr_model(model_name_or_path="lukeingawesome/llm2vec4cxr"):
    """
    Load the LLM2Vec4CXR model with the proper configuration.

    Args:
        model_name_or_path (str): Hugging Face model ID or local path.

    Returns:
        tuple: (model, tokenizer)
    """
    model = LLM2Vec.from_pretrained(
        base_model_name_or_path=model_name_or_path,
        enable_bidirectional=True,
        pooling_mode="latent_attention",
        max_length=512,
        torch_dtype=torch.bfloat16,
    )

    # Decoder-style models are left-padded so that the real tokens sit at the
    # end of each padded sequence in a batch.
    tokenizer = model.tokenizer
    tokenizer.padding_side = 'left'

    return model, tokenizer


def tokenize_with_separator(texts, tokenizer, max_length=512):
    """
    Tokenize texts with special handling for separator-based splitting.
    This is useful for instruction-following tasks.

    Args:
        texts (list): List of texts to tokenize.
        tokenizer: The tokenizer to use.
        max_length (int): Maximum sequence length.

    Returns:
        dict: Tokenized inputs with attention masks and embed masks.
    """
    texts_2 = []
    original_texts = []
    separator = '!@#$%^&*()'

    # Split each text at the separator: keep the full text for tokenization and
    # the part after the separator (the report) for building the embed mask.
    for text in texts:
        parts = text.split(separator)
        texts_2.append(parts[1] if len(parts) > 1 else "")
        original_texts.append("".join(parts))

    tokenized = tokenizer(
        original_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    # Build an embed mask that is 1 only over the tokens of the text after the
    # separator; these tokens sit at the end of the sequence due to left padding.
    embed_mask = None
    for t_i, t in enumerate(texts_2):
        ids = tokenizer(
            [t],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=False,
        )

        e_m = torch.zeros_like(tokenized["attention_mask"][t_i])
        if len(ids["input_ids"][0]) > 0:
            e_m[-len(ids["input_ids"][0]):] = torch.ones(len(ids["input_ids"][0]))

        if embed_mask is None:
            embed_mask = e_m.unsqueeze(0)
        else:
            embed_mask = torch.cat((embed_mask, e_m.unsqueeze(0)), dim=0)

    tokenized["embed_mask"] = embed_mask
    return tokenized
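
# A minimal usage sketch for tokenize_with_separator (assumes texts formatted as
# "<instruction>!@#$%^&*()<report>", the convention used in main() below):
#
#     text = instruction + '!@#$%^&*()' + report
#     batch = tokenize_with_separator([text], tokenizer, max_length=512)
#     # batch["embed_mask"] is 1 only over the report tokens after the separator.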

def compute_similarities(model, tokenizer, texts, device):
    """
    Compute similarity scores between the first text and all other texts.

    Args:
        model: The LLM2Vec model.
        tokenizer: The tokenizer.
        texts (list): List of texts to compare (the first text is the reference).
        device: The device to run computations on.

    Returns:
        tuple: (embeddings, similarities)
    """
    with torch.no_grad():
        # Use the separator-aware tokenizer when any text carries an instruction.
        if any('!@#$%^&*()' in text for text in texts):
            tokenized = tokenize_with_separator(texts, tokenizer, 512)
        else:
            tokenized = tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )

        # Move inputs to the target device and cast only floating-point tensors
        # to bfloat16; integer tensors such as input_ids must keep their dtype.
        tokenized = tokenized.to(device)
        for key in tokenized:
            if torch.is_tensor(tokenized[key]) and tokenized[key].is_floating_point():
                tokenized[key] = tokenized[key].to(torch.bfloat16)

        embeddings = model(tokenized)

        # Cosine similarity between the reference (first) embedding and the rest.
        similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)

    return embeddings, similarities
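
# A minimal usage sketch for compute_similarities (mirrors Example 2 in main()
# below, routed through this helper instead of model.encode_with_instruction):
#
#     texts = [instruction + '!@#$%^&*()' + report] + comparison_options
#     _, sims = compute_similarities(model, tokenizer, texts, device)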

def main():
    """
    Example usage of the LLM2Vec4CXR model for chest X-ray report analysis.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    print("Loading LLM2Vec4CXR model...")
    model, tokenizer = load_llm2vec4cxr_model()
    model = model.to(device).to(torch.bfloat16)
    model.eval()

    print("\n" + "=" * 60)
    print("Example 1: Basic Text Embedding (Built-in Method)")
    print("=" * 60)

    report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."

    embedding = model.encode_text(report)

    print(f"Report: {report}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"Embedding norm: {torch.norm(embedding).item():.4f}")

    print("\n" + "=" * 60)
    print("Example 2: Instruction-based Similarity Comparison")
    print("=" * 60)

    separator = '!@#$%^&*()'
    instruction = 'Determine the change or the status of the pleural effusion.'
    report = 'There is a small increase in the left-sided effusion. There continues to be volume loss at both bases.'
    text = instruction + separator + report

    comparison_options = [
        'No pleural effusion',
        'Pleural effusion',
        'Effusion is seen in the right',
        'Effusion is seen in the left',
        'Pleural effusion is improving',
        'Pleural effusion is stable',
        'Pleural effusion is worsening',
    ]

    all_texts = [text] + comparison_options

    embeddings = model.encode_with_instruction(all_texts)
    similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)

    print(f"Original text: {report}")
    print(f"Instruction: {instruction}")
    print("\nSimilarity Scores:")
    print("-" * 50)

    for option, score in zip(comparison_options, similarities):
        print(f"{option:<35} | {score.item():.4f}")

    best_match_idx = torch.argmax(similarities).item()
    print(f"\nBest match: {comparison_options[best_match_idx]} "
          f"(score: {similarities[best_match_idx].item():.4f})")

    print("\n" + "=" * 60)
    print("Example 3: Multiple Report Comparison")
    print("=" * 60)

    reports = [
        "No acute cardiopulmonary abnormality.",
        "Small bilateral pleural effusions.",
        "Large left pleural effusion with compressive atelectasis.",
        "Interval improvement in bilateral pleural effusions.",
        "Worsening bilateral pleural effusions.",
    ]

    print("Computing embeddings for multiple reports...")

    embeddings = model.encode_text(reports)

    # Pairwise cosine similarity between every pair of report embeddings.
    similarity_matrix = F.cosine_similarity(
        embeddings.unsqueeze(1),
        embeddings.unsqueeze(0),
        dim=2,
    )

    print("\nPairwise Similarity Matrix:")
    print("-" * 30)
    for i, report1 in enumerate(reports):
        print(f"Report {i+1}: {report1[:30]}...")
        for j, report2 in enumerate(reports):
            print(f"  vs Report {j+1}: {similarity_matrix[i][j].item():.4f}")
        print()


if __name__ == "__main__":
    main()