Update README.md
README.md CHANGED
````diff
@@ -32,7 +32,7 @@ tokenized = tokenizer(sequences, padding=True, return_tensors='pt')
 with torch.no_grad():
     embeddings = model(**tokenized).last_hidden_state
 
-print(embeddings.shape) # (1, 11, 1280)
+print(embeddings.shape) # (2, 11, 1280)
 ```
 
 ### For working with sequence logits
````
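The change in this hunk is the shape comment: the example batches two sequences, so the leading dimension is 2 rather than 1. The 11 comes from the longer sequence 'MSEQWENCE' (9 residues plus the BOS and EOS tokens the ESM2 tokenizer adds), and 1280 is the hidden size of the 650M ESM2 architecture. A quick sanity check of that arithmetic, assuming the README's two example sequences and standard ESM2 tokenization:

```python
# Sanity check for the (2, 11, 1280) shape above (no model required).
# Assumes the README's example sequences and standard ESM2 tokenization:
# one BOS and one EOS token per sequence, shorter sequences padded to the longest.
sequences = ['MPRTEIN', 'MSEQWENCE']
batch_size = len(sequences)                   # 2
seq_len = max(len(s) for s in sequences) + 2  # 9 residues + BOS + EOS = 11
hidden_size = 1280                            # ESM2-650M embedding width
print((batch_size, seq_len, hidden_size))     # (2, 11, 1280)
```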
````diff
@@ -40,18 +40,26 @@ print(embeddings.shape) # (1, 11, 1280)
 import torch
 from transformers import AutoModelForMaskedLM, AutoTokenizer
 
-model_path = 'Synthyra/FastESM2_650'
 model = AutoModelForMaskedLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).eval()
-tokenizer = model.tokenizer
-
-sequences = ['MPRTEIN', 'MSEQWENCE']
-tokenized = tokenizer(sequences, padding=True, return_tensors='pt')
 with torch.no_grad():
     logits = model(**tokenized).logits
 
-print(logits.shape) # (
+print(logits.shape) # (2, 11, 33)
 ```
 
+### For working with attention maps
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+model = AutoModel.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).eval()
+with torch.no_grad():
+    attentions = model(**tokenized, output_attentions=True).attentions # tuple of (batch_size, num_heads, seq_len, seq_len)
+
+print(attentions[-1].shape) # (2, 20, 11, 11)
+```
+
+
 ## Embed entire datasets with no new code
 To embed a list of protein sequences **fast**, just call embed_dataset. Sequences are sorted to reduce padding tokens, so the initial progress bar estimate is usually much longer than the actual time.
 ```python
````
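The `embed_dataset` call itself is cut off at the end of this hunk. As a rough sketch of what the paragraph above describes: `embed_dataset` is the helper the README names, but the `batch_size` argument and the dictionary return shown below are assumptions rather than the documented signature, so defer to the model card for the real API.

```python
import torch
from transformers import AutoModel

# Hedged sketch only: embed_dataset ships with the model's remote code; the
# argument names and return layout below are assumptions, not the documented API.
model = AutoModel.from_pretrained(
    'Synthyra/FastESM2_650',
    torch_dtype=torch.float16,
    trust_remote_code=True,
).eval()

sequences = ['MPRTEIN', 'MSEQWENCE']
embedding_dict = model.embed_dataset(
    sequences,     # list of protein sequences; sorted internally to reduce padding
    batch_size=2,  # assumed argument: sequences per forward pass
)
# Assumed return: dict mapping each input sequence to its embedding tensor
for seq, emb in embedding_dict.items():
    print(seq, emb.shape)
```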