Upload 5 files
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+imgs/inference_architecture.png filter=lfs diff=lfs merge=lfs -text
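The added attribute line is the standard entry Git LFS writes when a path is tracked. A hypothetical way to reproduce the same change from Python, kept in the repository's language (assumes git and git-lfs are installed and the working directory is the repository root; not part of this commit):

import subprocess

# Ask Git LFS to track the new image; this appends the filter line shown above to .gitattributes.
subprocess.run(["git", "lfs", "track", "imgs/inference_architecture.png"], check=True)
# Stage both the updated .gitattributes and the image itself.
subprocess.run(["git", "add", ".gitattributes", "imgs/inference_architecture.png"], check=True)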
imgs/inference_architecture.png
ADDED
[binary image file, stored via Git LFS]
scripts/evaluate/run_evaluate_long_embed.py
ADDED
@@ -0,0 +1,161 @@
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["OPENBLAS_NUM_THREADS"] = "32"
import numpy as np
import torch
import mteb
from mteb.encoder_interface import PromptType
from sentence_transformers import SentenceTransformer
from mteb.models.wrapper import Wrapper
from typing import Any, Sequence
from transformers import AutoTokenizer, AutoModel


class DeweySingleVectorWrapper:
    def __init__(self, model_dir, batch_size: int = 8):
        self.model = SentenceTransformer(
            model_dir,
            trust_remote_code=True,
            model_kwargs={
                "torch_dtype": torch.bfloat16,  # fp16 easily produces NaN, so use bf16
                "attn_implementation": "flash_attention_2"
            },
            config_kwargs={"single_vector_type": "mean"}
        ).cuda().bfloat16().eval()
        # max_seq_length is the module-level value set in __main__ before instantiation
        self.model.max_seq_length = max_seq_length
        self.pool = self.model.start_multi_process_pool()
        self.batch_size = batch_size

    def encode(
        self,
        sentences: list[str],
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs,
    ) -> np.ndarray:
        if prompt_type.value == "query":
            prompt = RETRIEVE_Q_PROMPT
        else:
            prompt = RETRIEVE_P_PROMPT
        vectors = self.model.encode_multi_process(
            sentences=sentences,
            pool=self.pool,
            show_progress_bar=True,
            batch_size=self.batch_size,
            normalize_embeddings=True,
            prompt=prompt,
            precision="float32"
        )
        return vectors


class DeweyMultiVectorWrapper(Wrapper):
    def __init__(
        self,
        model_dir: str,
        batch_size: int = 8,
        *args,
        **kwargs,
    ) -> None:
        self.model = AutoModel.from_pretrained(
            model_dir,
            trust_remote_code=True,
            attn_implementation="flash_attention_2"
        ).cuda().bfloat16()
        self.batch_size = batch_size
        self.model.tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def encode(
        self,
        sentences: Sequence[str],
        *,
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs: Any,
    ) -> np.ndarray:
        if prompt_type.value == "query":
            prompt = RETRIEVE_Q_PROMPT
        else:
            prompt = RETRIEVE_P_PROMPT
        if prompt_type.value == "query":
            pred = self.model.encode(
                sentences=list(sentences),
                use_cuda=True,
                show_progress_bar=True,
                chunk_size=-1,
                chunk_overlap=32,
                convert_to_tensor=True,
                max_seq_length=max_seq_length,
                batch_size=self.batch_size,
                normalize_embeddings=True,
                prompt=prompt,
                fast_chunk=False,
            )[0]
            # queries do not need multiple vectors; keep only the mean vector (row 1) as the single query vector
            pred = [vecs[1:2, :] for vecs in pred]
        else:
            pred = self.model.encode(
                sentences=list(sentences),
                use_cuda=True,
                show_progress_bar=True,
                chunk_size=256,
                chunk_overlap=32,
                convert_to_tensor=True,
                max_seq_length=max_seq_length,
                batch_size=self.batch_size,
                normalize_embeddings=True,
                prompt=prompt,
                fast_chunk=True,
            )[0]

        pred = torch.nn.utils.rnn.pad_sequence(pred, batch_first=True, padding_value=0)
        return pred.cpu().numpy()

    def similarity(self, a: np.ndarray, b: np.ndarray) -> np.ndarray:
        if not isinstance(a, torch.Tensor):
            a = torch.tensor(a, dtype=torch.float32)

        if not isinstance(b, torch.Tensor):
            b = torch.tensor(b, dtype=torch.float32)

        if len(a.shape) == 2:
            a = a.unsqueeze(0)

        if len(b.shape) == 2:
            b = b.unsqueeze(0)

        scores = torch.einsum(
            "ash,bth->abst",
            a,
            b,
        )

        return scores.max(axis=-1).values.sum(axis=-1)


RETRIEVE_Q_PROMPT = "<|START_INSTRUCTION|>Answer the question<|END_INSTRUCTION|>"
RETRIEVE_P_PROMPT = "<|START_INSTRUCTION|>Candidate document<|END_INSTRUCTION|>"

if __name__ == "__main__":
    ################# evaluate single vector #################
    # batch_size = 4
    # max_seq_length = 128 * 1024
    # model = DeweySingleVectorWrapper("infgrad/dewey_en_beta", batch_size=batch_size)
    # output_folder = f"./long_embed_benchmark/dewey_en_beta_single_vector_128k"
    # tasks = list(mteb.get_benchmark("LongEmbed"))
    # evaluation = mteb.MTEB(tasks=tasks)
    # evaluation.run(model, output_folder=output_folder, verbosity=2, overwrite_results=False)

    ################# evaluate multi vectors #################
    batch_size = 4
    max_seq_length = 128 * 1024
    model = DeweyMultiVectorWrapper("infgrad/dewey_en_beta", batch_size=batch_size)
    output_folder = f"./long_embed_benchmark/dewey_en_beta_multi_vectors"

    tasks = list(mteb.get_benchmark("LongEmbed"))
    evaluation = mteb.MTEB(tasks=tasks)
    evaluation.run(model, output_folder=output_folder, verbosity=2, overwrite_results=False)
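The similarity method above is ColBERT-style late interaction (MaxSim): every query vector is matched against its best document vector, and those best matches are summed. A minimal, self-contained sketch of the same scoring on toy tensors (illustration only, not part of the script):

import torch

# 2 queries with 1 vector each (the mean vector kept in encode), 3 documents with 5 chunk vectors each
a = torch.randn(2, 1, 8)
b = torch.randn(3, 5, 8)

# pairwise dot products: (num_queries, num_docs, query_vectors, doc_vectors)
scores = torch.einsum("ash,bth->abst", a, b)
# MaxSim: best document vector per query vector, then sum over query vectors -> shape (2, 3)
scores = scores.max(dim=-1).values.sum(dim=-1)
print(scores.shape)  # torch.Size([2, 3])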
scripts/evaluate/run_evaluate_mteb_dewey_en_beta.py
ADDED
@@ -0,0 +1,112 @@
import os

os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["OPENBLAS_NUM_THREADS"] = "32"
import mteb
import torch
import numpy as np
from mteb.encoder_interface import PromptType
from sentence_transformers import SentenceTransformer

TASK_NAME2TYPE = {
    'ArguAna': 'Retrieval', 'ArXivHierarchicalClusteringP2P': 'Clustering',
    'ArXivHierarchicalClusteringS2S': 'Clustering', 'AskUbuntuDupQuestions': 'Reranking',
    'BIOSSES': 'STS', 'Banking77Classification': 'Classification',
    'BiorxivClusteringP2P.v2': 'Clustering', 'CQADupstackGamingRetrieval': 'Retrieval',
    'CQADupstackUnixRetrieval': 'Retrieval', 'ClimateFEVERHardNegatives': 'Retrieval',
    'FEVERHardNegatives': 'Retrieval', 'FiQA2018': 'Retrieval', 'HotpotQAHardNegatives': 'Retrieval',
    'ImdbClassification': 'Classification', 'MTOPDomainClassification': 'Classification',
    'MassiveIntentClassification': 'Classification', 'MassiveScenarioClassification': 'Classification',
    'MedrxivClusteringP2P.v2': 'Clustering', 'MedrxivClusteringS2S.v2': 'Clustering',
    'MindSmallReranking': 'Reranking', 'SCIDOCS': 'Retrieval', 'SICK-R': 'STS', 'STS12': 'STS',
    'STS13': 'STS', 'STS14': 'STS', 'STS15': 'STS', 'STSBenchmark': 'STS',
    'SprintDuplicateQuestions': 'PairClassification', 'StackExchangeClustering.v2': 'Clustering',
    'StackExchangeClusteringP2P.v2': 'Clustering', 'TRECCOVID': 'Retrieval',
    'Touche2020Retrieval.v3': 'Retrieval', 'ToxicConversationsClassification': 'Classification',
    'TweetSentimentExtractionClassification': 'Classification',
    'TwentyNewsgroupsClustering.v2': 'Clustering', 'TwitterSemEval2015': 'PairClassification',
    'TwitterURLCorpus': 'PairClassification', 'SummEvalSummarization.v2': 'Summarization',
    'AmazonCounterfactualClassification': 'Classification', 'STS17': 'STS', 'STS22.v2': 'STS'
}

RETRIEVE_Q_PROMPT = "<|START_INSTRUCTION|>Answer the question<|END_INSTRUCTION|>"
RETRIEVE_P_PROMPT = "<|START_INSTRUCTION|>Candidate document<|END_INSTRUCTION|>"
STS_PROMPT = "<|START_INSTRUCTION|>Generate semantically similar text<|END_INSTRUCTION|>"

TASK_NAME2PROMPT = {
    # Classification
    "Banking77Classification": "<|START_INSTRUCTION|>Classify text into intents<|END_INSTRUCTION|>",
    "ImdbClassification": "<|START_INSTRUCTION|>Classify text into sentiment<|END_INSTRUCTION|>",
    "MTOPDomainClassification": "<|START_INSTRUCTION|>Classify text into intent domain<|END_INSTRUCTION|>",
    "MassiveIntentClassification": "<|START_INSTRUCTION|>Classify text into user intents<|END_INSTRUCTION|>",
    "MassiveScenarioClassification": "<|START_INSTRUCTION|>Classify text into user scenarios<|END_INSTRUCTION|>",
    "ToxicConversationsClassification": "<|START_INSTRUCTION|>Classify text into toxic or not toxic<|END_INSTRUCTION|>",
    "TweetSentimentExtractionClassification": "<|START_INSTRUCTION|>Classify text into positive, negative, or neutral sentiment<|END_INSTRUCTION|>",
    "AmazonCounterfactualClassification": "<|START_INSTRUCTION|>Classify text into counterfactual or not-counterfactual<|END_INSTRUCTION|>",

    # Clustering
    "ArXivHierarchicalClusteringP2P": "<|START_INSTRUCTION|>Output main and secondary category of Arxiv papers based on the titles and abstracts<|END_INSTRUCTION|>",
    "ArXivHierarchicalClusteringS2S": "<|START_INSTRUCTION|>Output main and secondary category of Arxiv papers based on the titles<|END_INSTRUCTION|>",
    "BiorxivClusteringP2P.v2": "<|START_INSTRUCTION|>Output main category of Biorxiv papers based on the titles and abstracts<|END_INSTRUCTION|>",
    "MedrxivClusteringP2P.v2": "<|START_INSTRUCTION|>Output main category of Medrxiv papers based on the titles and abstracts<|END_INSTRUCTION|>",
    "MedrxivClusteringS2S.v2": "<|START_INSTRUCTION|>Output main category of Medrxiv papers based on the titles<|END_INSTRUCTION|>",
    "StackExchangeClustering.v2": "<|START_INSTRUCTION|>Output topic or theme of StackExchange posts based on the titles<|END_INSTRUCTION|>",
    "StackExchangeClusteringP2P.v2": "<|START_INSTRUCTION|>Output topic or theme of StackExchange posts based on the given paragraphs<|END_INSTRUCTION|>",
    "TwentyNewsgroupsClustering.v2": "<|START_INSTRUCTION|>Output topic or theme of news articles<|END_INSTRUCTION|>",
}


class DeweyWrapper:
    def __init__(self, model_dir, max_seq_length: int = 1536, batch_size: int = 8):
        self.model = SentenceTransformer(
            model_dir,
            trust_remote_code=True,
            model_kwargs={
                "torch_dtype": torch.bfloat16,  # fp16 easily produces NaN, so use bf16
                "attn_implementation": "flash_attention_2"
            },
            config_kwargs={"single_vector_type": "cls_add_mean"}
        ).cuda().bfloat16().eval()
        self.model.max_seq_length = max_seq_length
        self.pool = self.model.start_multi_process_pool()
        self.batch_size = batch_size

    def encode(
        self,
        sentences: list[str],
        task_name: str,
        prompt_type: PromptType | None = None,
        **kwargs,
    ) -> np.ndarray:
        task_type = TASK_NAME2TYPE[task_name]
        if task_type == "Retrieval":
            if prompt_type.value == "query":
                prompt = RETRIEVE_Q_PROMPT
            else:
                prompt = RETRIEVE_P_PROMPT
        elif task_type in ["STS", "PairClassification", "Summarization", "Reranking"]:
            prompt = STS_PROMPT
        else:
            prompt = TASK_NAME2PROMPT[task_name]
        vectors = self.model.encode_multi_process(
            sentences=sentences,
            pool=self.pool,
            show_progress_bar=True,
            batch_size=self.batch_size,
            normalize_embeddings=True,
            prompt=prompt,
            precision="float32"
        )
        return vectors


if __name__ == "__main__":
    max_seq_length = 1536
    batch_size = 8
    model_dir_or_name = "infgrad/dewey_en_beta"
    output_folder = f"./mteb_eng_results/dewey_en_beta"
    model = DeweyWrapper(model_dir_or_name, max_seq_length=max_seq_length, batch_size=batch_size)

    tasks = list(mteb.get_benchmark("MTEB(eng, v2)"))
    evaluation = mteb.MTEB(tasks=tasks)
    evaluation.run(model, output_folder=output_folder, verbosity=2, overwrite_results=False)
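Before launching the full MTEB(eng, v2) run, it can be useful to smoke-test the wrapper on a single task. A hypothetical snippet (assuming DeweyWrapper and its prompt tables from the script above are in scope; the chosen task and output folder are illustrative, not part of the commit):

# quick sanity check on one retrieval task before the full benchmark
model = DeweyWrapper("infgrad/dewey_en_beta", max_seq_length=1536, batch_size=8)
tasks = mteb.get_tasks(tasks=["SCIDOCS"])
evaluation = mteb.MTEB(tasks=tasks)
evaluation.run(model, output_folder="./mteb_eng_results/dewey_en_beta_smoke", verbosity=2)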