Update README.md
README.md
CHANGED
@@ -51,13 +51,23 @@ Ankh3 is a protein language model that is jointly optimized on two objectives:
 from transformers import T5ForConditionalGeneration, T5Tokenizer, T5EncoderModel
 import torch
 
+# Random sequence from UniProt; most likely Ankh3 saw it during pre-training.
 sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
 
-ckpt = "
+ckpt = "ElnaggarLab/ankh3-large"
+
+# Make sure to use `T5Tokenizer`, not `AutoTokenizer`.
 tokenizer = T5Tokenizer.from_pretrained(ckpt)
+
 # To use the encoder representation using the NLU prefix:
 encoder_model = T5EncoderModel.from_pretrained(ckpt).eval()
-
+
+
+# For extracting embeddings, consider trying the '[S2S]' prefix.
+# Since this prefix was specifically used to denote sequence completion
+# during the model's pre-training, its use can sometimes
+# lead to improved embedding quality.
+
 nlu_sequence = "[NLU]" + sequence
 encoded_nlu_sequence = tokenizer(nlu_sequence, add_special_tokens=True, return_tensors="pt", is_split_into_words=False)
 
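The first hunk stops at tokenization. For reference, a minimal sketch of a forward pass through the encoder is shown below; the `torch.no_grad()` wrapper, the shortened placeholder sequence, and reading `last_hidden_state` are illustrative assumptions, not part of the diff.

```python
import torch
from transformers import T5EncoderModel, T5Tokenizer

ckpt = "ElnaggarLab/ankh3-large"
tokenizer = T5Tokenizer.from_pretrained(ckpt)
encoder_model = T5EncoderModel.from_pretrained(ckpt).eval()

# Shortened placeholder sequence; the README uses a full UniProt sequence.
sequence = "MDTAYPREDTRAPTPSKAGAHTALTLG"
nlu_sequence = "[NLU]" + sequence
encoded_nlu_sequence = tokenizer(nlu_sequence, add_special_tokens=True, return_tensors="pt")

# Run the encoder without tracking gradients and take per-residue embeddings
# from the last hidden state: shape (batch, sequence_length, hidden_size).
with torch.no_grad():
    outputs = encoder_model(
        input_ids=encoded_nlu_sequence["input_ids"],
        attention_mask=encoded_nlu_sequence["attention_mask"],
    )
embeddings = outputs.last_hidden_state
```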
@@ -73,7 +83,7 @@ import torch
 
 sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
 
-ckpt = "
+ckpt = "ElnaggarLab/ankh3-large"
 tokenizer = T5Tokenizer.from_pretrained(ckpt)
 # To use the sequence to sequence task using the S2S prefix:
 model = T5ForConditionalGeneration.from_pretrained(ckpt).eval()
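The second hunk only updates the checkpoint used when loading `T5ForConditionalGeneration`. A hedged sketch of how generation with the `[S2S]` prefix might then be invoked follows; the prompt construction, `max_new_tokens` value, and decode call are illustrative assumptions rather than settings taken from the README.

```python
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

ckpt = "ElnaggarLab/ankh3-large"
tokenizer = T5Tokenizer.from_pretrained(ckpt)
model = T5ForConditionalGeneration.from_pretrained(ckpt).eval()

# Prefix a (possibly partial) protein sequence with the S2S tag and let the
# decoder generate a continuation; generation settings here are illustrative.
s2s_sequence = "[S2S]" + "MDTAYPREDTRAPTPSKAGAHTALTLG"
encoded_s2s_sequence = tokenizer(s2s_sequence, add_special_tokens=True, return_tensors="pt")

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=encoded_s2s_sequence["input_ids"],
        attention_mask=encoded_s2s_sequence["attention_mask"],
        max_new_tokens=64,
    )
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
```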