hazemessam committed
Commit 13b5d09 · verified · 1 Parent(s): e7cc24d

Update README.md

Files changed (1)
  1. README.md +13 -3
README.md CHANGED
@@ -51,13 +51,23 @@ Ankh3 is a protein language model that is jointly optimized on two objectives:
 from transformers import T5ForConditionalGeneration, T5Tokenizer, T5EncoderModel
 import torch
 
+# Random sequence from uniprot, most likely Ankh3 saw it during pre-training.
 sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
 
-ckpt = "proteinea-ea/ankh3-large"
+ckpt = "ElnaggarLab/ankh3-large"
+
+# Make sure to use `T5Tokenizer`, not `AutoTokenizer`.
 tokenizer = T5Tokenizer.from_pretrained(ckpt)
+
 # To use the encoder representation using the NLU prefix:
 encoder_model = T5EncoderModel.from_pretrained(ckpt).eval()
-# Random sequence from uniprot, most likely Ankh3 saw it during pre-training.
+
+
+# For extracting embeddings, consider trying the '[S2S]' prefix.
+# Since this prefix was specifically used to denote sequence completion
+# during the model's pre-training, its use can sometimes
+# lead to improved embedding quality.
+
 nlu_sequence = "[NLU]" + sequence
 encoded_nlu_sequence = tokenizer(nlu_sequence, add_special_tokens=True, return_tensors="pt", is_split_into_words=False)
 
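As a usage note for the encoder example above, here is a minimal sketch, not part of this commit, of how the tokenized NLU sequence could be run through the encoder to obtain embeddings. It reuses the variable names from the snippet (encoder_model, encoded_nlu_sequence), and the mean-pooling step is an illustrative assumption rather than anything documented in the README.

with torch.no_grad():
    nlu_outputs = encoder_model(
        input_ids=encoded_nlu_sequence["input_ids"],
        attention_mask=encoded_nlu_sequence["attention_mask"],
    )

# Per-residue representations, shape (1, sequence_length, hidden_size).
residue_embeddings = nlu_outputs.last_hidden_state

# Illustrative mean pooling over residues to get a single per-protein vector
# (the pooling strategy is an assumption, not taken from the README).
protein_embedding = residue_embeddings.mean(dim=1)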
 
@@ -73,7 +83,7 @@ import torch
 
 sequence = "MDTAYPREDTRAPTPSKAGAHTALTLGAPHPPPRDHLIWSVFSTLYLNLCCLGFLALAYSIKARDQKVVGDLEAARRFGSKAKCYNILAAMWTLVPPLLLLGLVVTGALHLARLAKDSAAFFSTKFDDADYD"
 
-ckpt = "proteinea-ea/ankh3-large"
+ckpt = "ElnaggarLab/ankh3-large"
 tokenizer = T5Tokenizer.from_pretrained(ckpt)
 # To use the sequence to sequence task using the S2S prefix:
 model = T5ForConditionalGeneration.from_pretrained(ckpt).eval()
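Likewise, a minimal sketch, not from this commit, of how the S2S model above might be used for sequence completion. Prefixing with "[S2S]", splitting the sequence in half, and the generation settings are illustrative assumptions; the README's actual continuation of this example is not shown in the hunk.

# Continues from the S2S snippet above; the split point and generation
# settings are illustrative assumptions.
half = len(sequence) // 2
s2s_sequence = "[S2S]" + sequence[:half]
encoded_s2s_sequence = tokenizer(s2s_sequence, add_special_tokens=True, return_tensors="pt")

with torch.no_grad():
    generated_ids = model.generate(
        encoded_s2s_sequence["input_ids"],
        max_new_tokens=half + 10,
    )

completion = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(completion)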