add word segmentation before tokenization

Browse files

Files changed (4) hide show

README.md +26 -25
custom_tokenizer.py +11 -0
pipeline.py +76 -0
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -1,21 +1,22 @@
 ---
 pipeline_tag: sentence-similarity
 tags:
-- sentence-transformers
-- feature-extraction
-- sentence-similarity
-- transformers
 language:
-- vi
-- en
 widget:
-- source_sentence: "Anh ấy đang là sinh viên năm cuối"
-  sentences:
-    - "Anh ấy học tại Đại học Bách khoa Hà Nội, chuyên ngành Khoa học máy tính"
-    - "Anh ấy đang làm việc tại nhà máy sản xuất linh kiện điện tử"
-    - "Anh ấy chuẩn bị đi du học nước ngoài"
-    - "Anh ấy sắp mở cửa hàng bán mỹ phẩm"
-    - "Nhà anh ấy có rất nhiều cây cảnh"
 ---
 # bkai-foundation-models/vietnamese-bi-encoder
@@ -43,9 +44,8 @@ embeddings = model.encode(sentences)
 print(embeddings)
 ```
 ## Usage (HuggingFace Transformers)
 Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
 ```python
@@ -81,21 +81,20 @@ print("Sentence embeddings:")
 print(sentence_embeddings)
 ```
 ## Evaluation Results
 <!--- Describe how your model was evaluated -->
-For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name=bkai-foundation-models/vietnamese-bi-encoder)
 ## Training
 The model was trained with the parameters:
 **DataLoader**:
 `torch.utils.data.dataloader.DataLoader` of length 17584 with parameters:
 ```
 {'batch_size': 32, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
 ```
@@ -103,11 +102,13 @@ The model was trained with the parameters:
 **Loss**:
 `sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
-  ```
-  {'scale': 20.0, 'similarity_fct': 'cos_sim'}
-  ```
 Parameters of the fit()-Method:
 ```
 {
     "epochs": 15,
@@ -125,15 +126,15 @@ Parameters of the fit()-Method:
 }
 ```
 ## Full Model Architecture
 ```
 SentenceTransformer(
-  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel
   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
 )
 ```
 ## Citing & Authors
-<!--- Describe where people can find more information -->

 ---
 pipeline_tag: sentence-similarity
 tags:
+  - sentence-transformers
+  - feature-extraction
+  - sentence-similarity
+  - transformers
+library_name: generic
 language:
+  - vi
+  - en
 widget:
+  - source_sentence: 'Anh ấy đang là sinh viên năm cuối'
+    sentences:
+      - 'Anh ấy học tại Đại học Bách khoa Hà Nội, chuyên ngành Khoa học máy tính'
+      - 'Anh ấy đang làm việc tại nhà máy sản xuất linh kiện điện tử'
+      - 'Anh ấy chuẩn bị đi du học nước ngoài'
+      - 'Anh ấy sắp mở cửa hàng bán mỹ phẩm'
+      - 'Nhà anh ấy có rất nhiều cây cảnh'
 ---
 # bkai-foundation-models/vietnamese-bi-encoder
 print(embeddings)
 ```
 ## Usage (HuggingFace Transformers)
 Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
 ```python
 print(sentence_embeddings)
 ```
 ## Evaluation Results
 <!--- Describe how your model was evaluated -->
+For an automated evaluation of this model, see the _Sentence Embeddings Benchmark_: [https://seb.sbert.net](https://seb.sbert.net?model_name=bkai-foundation-models/vietnamese-bi-encoder)
 ## Training
 The model was trained with the parameters:
 **DataLoader**:
 `torch.utils.data.dataloader.DataLoader` of length 17584 with parameters:
 ```
 {'batch_size': 32, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'}
 ```
 **Loss**:
 `sentence_transformers.losses.MultipleNegativesRankingLoss.MultipleNegativesRankingLoss` with parameters:
+```
+{'scale': 20.0, 'similarity_fct': 'cos_sim'}
+```
 Parameters of the fit()-Method:
 ```
 {
     "epochs": 15,
 }
 ```
 ## Full Model Architecture
 ```
 SentenceTransformer(
+  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: RobertaModel
   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
 )
 ```
 ## Citing & Authors
+<!--- Describe where people can find more information -->

custom_tokenizer.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from transformers import PhobertTokenizer
+from pyvi import ViTokenizer
class CustomPhobertTokenizer(PhobertTokenizer):
    """PhobertTokenizer variant that word-segments text before tokenizing it.

    The only behavioural difference from the parent class is in
    ``_tokenize``: the incoming text is first passed through
    ``rdr_segment`` and the segmented result is what the parent's
    ``_tokenize`` receives.
    """

    def rdr_segment(self, text):
        """Return ``text`` after word segmentation.

        NOTE: despite the ``rdr_`` prefix in the name, the segmentation is
        delegated to ``pyvi``'s ``ViTokenizer.tokenize``.
        """
        segmented = ViTokenizer.tokenize(text)
        return segmented

    def _tokenize(self, text):
        """Segment ``text`` and forward the result to the parent ``_tokenize``."""
        # Go through self.rdr_segment (not ViTokenizer directly) so that a
        # subclass can swap in a different segmenter.
        return super()._tokenize(self.rdr_segment(text))

pipeline.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from typing import Dict, List, Union
+import torch
+from transformers import AutoModel
+from custom_tokenizer import CustomPhobertTokenizer
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, counting only positions kept by the mask.

    Args:
        model_output: Indexable model output; element ``0`` is taken as the
            token embeddings (presumably ``(batch, seq_len, dim)`` -- confirm
            against the model in use).
        attention_mask: Mask tensor with one entry per token. It gets a
            trailing axis, is expanded to the embeddings' size and cast to
            float before use.

    Returns:
        The masked sum of the embeddings along dim 1 divided by the masked
        token count along dim 1.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_sum = torch.sum(embeddings * mask, 1)
    # Clamp the count so a fully masked row divides by 1e-9 instead of 0.
    token_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / token_count
class PreTrainedPipeline:
    """Sentence-similarity pipeline scoring candidate sentences against a query.

    Holds an encoder model and a ``CustomPhobertTokenizer``; calling the
    instance returns one cosine similarity per candidate sentence.
    """

    def __init__(self, path="."):
        """Load the model and the tokenizer from ``path``.

        Args:
            path: Location handed to both ``from_pretrained`` calls;
                defaults to the current directory.
        """
        self.model = AutoModel.from_pretrained(path)
        self.tokenizer = CustomPhobertTokenizer.from_pretrained(path)

    def __call__(self, inputs: Dict[str, Union[str, List[str]]]) -> List[float]:
        """Score every candidate sentence against the source sentence.

        Args:
            inputs (Dict[str, Union[str, List[str]]]):
                dictionary with the query under ``"source_sentence"`` and the
                list of sentences to compare under ``"sentences"``.

        Returns:
            The cosine similarity between the query and each entry of
            ``inputs["sentences"]``, in the same order.
        """
        query = inputs["source_sentence"]
        candidates = inputs["sentences"]

        # Encode everything as one batch; the query sits at index 0 so it can
        # be split off by position afterwards.
        batch = [query] + candidates
        encoded = self.tokenizer(
            batch, padding=True, truncation=True, return_tensors="pt"
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            output = self.model(**encoded)

        pooled = mean_pooling(output, encoded["attention_mask"])

        # Row 0 is the query (kept 2-D via slicing); the rest are candidates.
        query_vec = pooled[:1]
        candidate_vecs = pooled[1:]
        similarities = torch.nn.functional.cosine_similarity(
            query_vec, candidate_vecs
        )
        return similarities.tolist()
if __name__ == "__main__":
    # Manual smoke test: score a few sample sentences against one query,
    # using a pipeline built with its default path (".").
    inputs = {
        "source_sentence": "Anh ấy đang là sinh viên năm cuối",
        "sentences": [
            "Anh ấy học tại Đại học Bách khoa Hà Nội, chuyên ngành Khoa học máy tính",
            "Anh ấy đang làm việc tại nhà máy sản xuất linh kiện điện tử",
            "Anh ấy chuẩn bị đi du học nước ngoài",
            "Anh ấy sắp mở cửa hàng bán mỹ phẩm",
            "Nhà anh ấy có rất nhiều cây cảnh",
        ],
    }
    pipeline = PreTrainedPipeline()
    res = pipeline(inputs)
    # Print each candidate next to its score so that running the script
    # actually shows a result.
    for sentence, score in zip(inputs["sentences"], res):
        print(f"{score:.4f}\t{sentence}")

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ pyvi>=0.1.1