lhallee committed (verified)
Commit d6fcec3 · 1 Parent(s): 56fe2cc

Update README.md

Files changed (1)
  1. README.md +15 -7
README.md CHANGED
@@ -32,7 +32,7 @@ tokenized = tokenizer(sequences, padding=True, return_tensors='pt')
 with torch.no_grad():
     embeddings = model(**tokenized).last_hidden_state
 
-print(embeddings.shape) # (1, 11, 1280)
+print(embeddings.shape) # (2, 11, 1280)
 ```
 
 ### For working with sequence logits
@@ -40,18 +40,26 @@ print(embeddings.shape) # (1, 11, 1280)
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 
-model_path = 'Synthyra/FastESM2_650'
 model = AutoModelForMaskedLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).eval()
-tokenizer = model.tokenizer
-
-sequences = ['MPRTEIN', 'MSEQWENCE']
-tokenized = tokenizer(sequences, padding=True, return_tensors='pt')
 with torch.no_grad():
     logits = model(**tokenized).logits
 
-print(logits.shape) # (1, 11, 33)
+print(logits.shape) # (2, 11, 33)
 ```
 
+### For working with attention maps
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).eval()
+with torch.no_grad():
+    attentions = model(**tokenized, output_attentions=True).attentions # tuple of (batch_size, num_heads, seq_len, seq_len)
+
+print(attentions[-1].shape) # (2, 20, 11, 11)
+```
+
+
 ## Embed entire datasets with no new code
 To embed a list of protein sequences **fast**, just call embed_dataset. Sequences are sorted to reduce padding tokens, so the initial progress bar estimation is usually much longer than the actual time.
 ```python
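
The snippets touched by this commit reuse `model_path` and `tokenized` from earlier in the README (the removed lines show where they were originally defined). Below is a minimal self-contained sketch assembled from the pieces visible in this diff, assuming the same two example sequences, the `model.tokenizer` attribute from the removed lines, and that embeddings and attentions can be requested in the same forward pass; it mirrors the README's own dtype and `trust_remote_code` settings rather than prescribing them.

```python
import torch
from transformers import AutoModel, AutoModelForMaskedLM

model_path = 'Synthyra/FastESM2_650'

# Base model for embeddings and attention maps
model = AutoModel.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).eval()
tokenizer = model.tokenizer  # tokenizer attribute shown in the removed lines of the old README

sequences = ['MPRTEIN', 'MSEQWENCE']
tokenized = tokenizer(sequences, padding=True, return_tensors='pt')

with torch.no_grad():
    outputs = model(**tokenized, output_attentions=True)

print(outputs.last_hidden_state.shape)  # (2, 11, 1280): (batch, padded length incl. special tokens, hidden size)
print(outputs.attentions[-1].shape)     # (2, 20, 11, 11): (batch, heads, seq_len, seq_len) for the last layer

# Masked-LM head for per-position logits, reusing the same tokenized batch
lm_model = AutoModelForMaskedLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).eval()
with torch.no_grad():
    logits = lm_model(**tokenized).logits

print(logits.shape)  # (2, 11, 33): (batch, seq_len, vocabulary size)
```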
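
The trailing context cuts off right at the opening fence of the `embed_dataset` example, so its actual signature is not visible in this diff. A hypothetical minimal call is sketched below, assuming only what the surrounding prose states: that `embed_dataset` is called on the loaded model with a list of protein sequences. Any other parameters are not shown here and may differ in the real README.

```python
# Hypothetical sketch: only the method name and the fact that it accepts a
# list of protein sequences are grounded in this diff; the exact signature
# and return type are assumptions.
sequences = ['MPRTEIN', 'MSEQWENCE']
embeddings = model.embed_dataset(sequences)  # assumed minimal call
```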