oist committed
Commit ca6801c · 1 Parent(s): 467be55

Add BLASER-REF model and config

Files changed (4)
  1. README.md +288 -0
  2. config.json +17 -0
  3. model.safetensors +3 -0
  4. modeling_blaser.py +136 -0
README.md ADDED
@@ -0,0 +1,288 @@
+ ---
+ license: cc-by-nc-4.0
+ language:
+ - ace
+ - acm
+ - acq
+ - aeb
+ - af
+ - ajp
+ - ak
+ - am
+ - apc
+ - ar
+ - ars
+ - ary
+ - arz
+ - as
+ - ast
+ - awa
+ - ay
+ - azb
+ - azj
+ - ba
+ - bm
+ - ban
+ - be
+ - bem
+ - bn
+ - bho
+ - bjn
+ - bo
+ - bs
+ - bug
+ - bg
+ - ca
+ - ceb
+ - cs
+ - cjk
+ - ckb
+ - crh
+ - cy
+ - da
+ - de
+ - dik
+ - dyu
+ - dz
+ - el
+ - en
+ - eo
+ - et
+ - eu
+ - ee
+ - fo
+ - fa
+ - fj
+ - fi
+ - fon
+ - fr
+ - fur
+ - ff
+ - gd
+ - ga
+ - gl
+ - gn
+ - gu
+ - ht
+ - ha
+ - he
+ - hi
+ - hne
+ - hr
+ - hu
+ - hy
+ - ig
+ - ilo
+ - id
+ - is
+ - it
+ - jv
+ - ja
+ - kab
+ - kac
+ - kam
+ - kn
+ - ks
+ - ka
+ - kr
+ - kk
+ - kbp
+ - kea
+ - km
+ - ki
+ - rw
+ - ky
+ - kmb
+ - kg
+ - ko
+ - kmr
+ - lo
+ - lv
+ - lij
+ - li
+ - ln
+ - lt
+ - lmo
+ - ltg
+ - lb
+ - lua
+ - lg
+ - luo
+ - lus
+ - mag
+ - mai
+ - ml
+ - mr
+ - min
+ - mk
+ - plt
+ - mt
+ - mni
+ - mn
+ - mos
+ - mi
+ - ms
+ - my
+ - nl
+ - nn
+ - nb
+ - ne
+ - nso
+ - nus
+ - ny
+ - oc
+ - gaz
+ - ory
+ - pag
+ - pa
+ - pap
+ - pl
+ - pt
+ - prs
+ - pbt
+ - qu
+ - ro
+ - rn
+ - ru
+ - sg
+ - sa
+ - sat
+ - scn
+ - shn
+ - si
+ - sk
+ - sl
+ - sm
+ - sn
+ - sd
+ - so
+ - st
+ - es
+ - als
+ - sc
+ - sr
+ - ss
+ - su
+ - sv
+ - sw
+ - szl
+ - ta
+ - tt
+ - te
+ - tg
+ - tl
+ - th
+ - ti
+ - taq
+ - tpi
+ - tn
+ - ts
+ - tk
+ - tum
+ - tr
+ - tw
+ - tzm
+ - ug
+ - uk
+ - umb
+ - ur
+ - uz
+ - vec
+ - vi
+ - war
+ - wo
+ - xh
+ - yi
+ - yo
+ - yue
+ - zh
+ - zu
+ language_details: >-
+   ace_Arab, ace_Latn, acm_Arab, acq_Arab, aeb_Arab, afr_Latn, ajp_Arab,
+   aka_Latn, amh_Ethi, apc_Arab, arb_Arab, ars_Arab, ary_Arab, arz_Arab,
+   asm_Beng, ast_Latn, awa_Deva, ayr_Latn, azb_Arab, azj_Latn, bak_Cyrl,
+   bam_Latn, ban_Latn, bel_Cyrl, bem_Latn, ben_Beng, bho_Deva, bjn_Arab,
+   bod_Tibt, bos_Latn, bug_Latn, bul_Cyrl, cat_Latn, ceb_Latn, ces_Latn,
+   cjk_Latn, ckb_Arab, crh_Latn, cym_Latn, dan_Latn, deu_Latn, dik_Latn,
+   dyu_Latn, dzo_Tibt, ell_Grek, eng_Latn, epo_Latn, est_Latn, eus_Latn,
+   ewe_Latn, fao_Latn, pes_Arab, fij_Latn, fin_Latn, fon_Latn, fra_Latn,
+   fur_Latn, fuv_Latn, gla_Latn, gle_Latn, glg_Latn, grn_Latn, guj_Gujr,
+   hat_Latn, hau_Latn, heb_Hebr, hin_Deva, hne_Deva, hrv_Latn, hun_Latn,
+   hye_Armn, ibo_Latn, ilo_Latn, ind_Latn, isl_Latn, ita_Latn, jav_Latn,
+   jpn_Jpan, kab_Latn, kac_Latn, kam_Latn, kan_Knda, kas_Arab, kas_Deva,
+   kat_Geor, knc_Arab, knc_Latn, kaz_Cyrl, kbp_Latn, kea_Latn, khm_Khmr,
+   kik_Latn, kin_Latn, kir_Cyrl, kmb_Latn, kon_Latn, kor_Hang, kmr_Latn,
+   lao_Laoo, lvs_Latn, lij_Latn, lim_Latn, lin_Latn, lit_Latn, lmo_Latn,
+   ltg_Latn, ltz_Latn, lua_Latn, lug_Latn, luo_Latn, lus_Latn, mag_Deva,
+   mai_Deva, mal_Mlym, mar_Deva, min_Latn, mkd_Cyrl, plt_Latn, mlt_Latn,
+   mni_Beng, khk_Cyrl, mos_Latn, mri_Latn, zsm_Latn, mya_Mymr, nld_Latn,
+   nno_Latn, nob_Latn, npi_Deva, nso_Latn, nus_Latn, nya_Latn, oci_Latn,
+   gaz_Latn, ory_Orya, pag_Latn, pan_Guru, pap_Latn, pol_Latn, por_Latn,
+   prs_Arab, pbt_Arab, quy_Latn, ron_Latn, run_Latn, rus_Cyrl, sag_Latn,
+   san_Deva, sat_Beng, scn_Latn, shn_Mymr, sin_Sinh, slk_Latn, slv_Latn,
+   smo_Latn, sna_Latn, snd_Arab, som_Latn, sot_Latn, spa_Latn, als_Latn,
+   srd_Latn, srp_Cyrl, ssw_Latn, sun_Latn, swe_Latn, swh_Latn, szl_Latn,
+   tam_Taml, tat_Cyrl, tel_Telu, tgk_Cyrl, tgl_Latn, tha_Thai, tir_Ethi,
+   taq_Latn, taq_Tfng, tpi_Latn, tsn_Latn, tso_Latn, tuk_Latn, tum_Latn,
+   tur_Latn, twi_Latn, tzm_Tfng, uig_Arab, ukr_Cyrl, umb_Latn, urd_Arab,
+   uzn_Latn, vec_Latn, vie_Latn, war_Latn, wol_Latn, xho_Latn, ydd_Hebr,
+   yor_Latn, yue_Hant, zho_Hans, zho_Hant, zul_Latn
+ pipeline_tag: sentence-similarity
+ ---
+
+ # BLASER REF (Ported)
+
+ This is a **ported version of the BLASER 2.0 reference-based (REF) evaluation model** originally released as [facebook/blaser-2.0-ref](https://huggingface.co/facebook/blaser-2.0-ref).
+
+ - **Ported to Hugging Face Transformers**: no dependency on Fairseq.
+ - **Uses embeddings from the ported SONAR 200 multilingual text encoder** ([cointegrated/SONAR_200_text_encoder](https://huggingface.co/cointegrated/SONAR_200_text_encoder)).
+ - **Supports the same 202 languages** as SONAR / NLLB-200.
+ - **Outputs BLASER scores on a 1–5 scale** for a source–MT–reference triplet.
+
+ > ⚠️ This is **not the original implementation**. Attribution goes to the original BLASER authors.
+
+ ---
+
+ ## How to compute BLASER REF scores
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ from transformers.models.m2m_100.modeling_m2m_100 import M2M100Encoder
+
+ # 1. Load the SONAR encoder and its tokenizer
+ sonar_model_name = "cointegrated/SONAR_200_text_encoder"
+ encoder = M2M100Encoder.from_pretrained(sonar_model_name)
+ tokenizer = AutoTokenizer.from_pretrained(sonar_model_name)
+
+ def encode_mean_pool(texts, tokenizer, encoder, lang='eng_Latn', norm=False):
+     """Embed texts by mean-pooling encoder states over non-padding tokens."""
+     tokenizer.src_lang = lang
+     with torch.inference_mode():
+         batch = tokenizer(texts, return_tensors='pt', padding=True)
+         seq_embs = encoder(**batch).last_hidden_state
+         mask = batch.attention_mask
+         mean_emb = (seq_embs * mask.unsqueeze(-1)).sum(1) / mask.unsqueeze(-1).sum(1)
+         if norm:
+             mean_emb = torch.nn.functional.normalize(mean_emb)
+     return mean_emb
+
+ # Example sentences
+ src_sentences = ["Le chat s'assit sur le tapis."]
+ mt_sentences = ["The cat sat down on the carpet."]  # example MT output
+ ref_sentences = ["The cat sat on the mat."]  # example reference translation
+
+ # Encode source, MT, and reference sentences
+ src_embs = encode_mean_pool(src_sentences, tokenizer, encoder, lang="fra_Latn")
+ mt_embs = encode_mean_pool(mt_sentences, tokenizer, encoder, lang="eng_Latn")
+ ref_embs = encode_mean_pool(ref_sentences, tokenizer, encoder, lang="eng_Latn")
+
+ # 2. Load the ported BLASER REF model
+ ref_model_name = "oist/blaser-2.0-ref-ported"
+ ref_model = AutoModel.from_pretrained(ref_model_name, trust_remote_code=True)
+ ref_model.eval()  # set to evaluation mode
+
+ # 3. Compute BLASER REF scores
+ with torch.inference_mode():
+     ref_scores = ref_model(src_embs, mt_embs, ref_embs)  # expects source, MT, and reference embeddings
+ print("Blaser score shape:", ref_scores.shape)
+ print("Blaser scores:", ref_scores[0])
+ ```
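Since `output_dim` is 1, `ref_scores` has shape `(batch_size, 1)`. A small follow-up sketch (not part of the original card) for turning the output into plain floats:

```python
# Each row of ref_scores holds one BLASER score for a (source, MT, reference) triplet
scores = ref_scores.squeeze(-1).tolist()
print(scores)
```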
config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "activation": "TANH",
+   "architectures": ["BlaserModel"],
+   "dropout": 0.1,
+   "embedding_dim": 1024,
+   "hidden_dims": [3072, 1536],
+   "input_form": "COMET",
+   "model_type": "blaser",
+   "norm_emb": true,
+   "output_act": false,
+   "output_dim": 1,
+   "transformers_version": "4.56.1",
+   "auto_map": {
+     "AutoConfig": "modeling_blaser.BlaserConfig",
+     "AutoModel": "modeling_blaser.BlaserModel"
+   }
+ }
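With `input_form` set to `COMET`, the model concatenates six 1024-dim feature vectors, so the scoring MLP is 6144 → 3072 → 1536 → 1. A minimal sketch of how these fields map to the architecture (illustrative, not part of the repo):

```python
from transformers import AutoConfig

# BlaserConfig is resolved from modeling_blaser.py via the auto_map entry
config = AutoConfig.from_pretrained("oist/blaser-2.0-ref-ported", trust_remote_code=True)

# COMET featurization concatenates 6 vectors of embedding_dim each
input_dim = 6 * config.embedding_dim
print(input_dim, config.hidden_dims, config.output_dim)  # 6144 [3072, 1536] 1
```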
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4836d62d1e5540890dad7a9ac6f41317522a71dd195f3a813c991c87522225c1
+ size 94396980
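The file size is consistent with the architecture above: a 6144 → 3072 → 1536 → 1 MLP has 23,599,105 weights and biases, i.e. 94,396,420 bytes in fp32, leaving roughly 560 bytes for the safetensors header. A quick arithmetic check:

```python
# Parameter count of the 6144 -> 3072 -> 1536 -> 1 MLP (weights + biases per layer)
dims = [6144, 3072, 1536, 1]
n_params = sum(d_in * d_out + d_out for d_in, d_out in zip(dims, dims[1:]))
print(n_params)      # 23599105
print(4 * n_params)  # 94396420 bytes of fp32 weights; the remainder is the safetensors header
```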
modeling_blaser.py ADDED
@@ -0,0 +1,136 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from typing import List, Optional
+ from torch import Tensor
+ from transformers import PretrainedConfig, PreTrainedModel
+
+
+ # ---------------- CONFIG ---------------- #
+ class BlaserConfig(PretrainedConfig):
+     model_type = "blaser"
+
+     def __init__(
+         self,
+         embedding_dim=1024,
+         output_dim=1,
+         hidden_dims=None,
+         dropout=0.1,
+         activation="TANH",
+         input_form="COMET",
+         norm_emb=True,
+         output_act=False,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+         self.embedding_dim = embedding_dim
+         self.output_dim = output_dim
+         self.hidden_dims = hidden_dims if hidden_dims is not None else [3072, 1536]
+         self.dropout = dropout
+         self.activation = activation
+         self.input_form = input_form
+         self.norm_emb = norm_emb
+         self.output_act = output_act
+
+
+ # ---------------- CORE MODEL ---------------- #
+ ACTIVATIONS = {"TANH": nn.Tanh, "RELU": nn.ReLU}
+
+
+ class BlaserCore(nn.Module):
+     def __init__(
+         self,
+         embedding_dim: int,
+         output_dim: int,
+         hidden_dims: List[int],
+         dropout: float,
+         activation: str,
+         input_form: str,
+         norm_emb: bool,
+         output_act: bool,
+     ):
+         super().__init__()
+         self.input_form = input_form
+         self.norm_emb = norm_emb
+
+         if input_form == "COMET":
+             embedding_dim *= 6
+         elif input_form == "QE":
+             embedding_dim *= 4
+         else:
+             raise ValueError(f"Unrecognized input_form: {input_form}")
+         if activation not in ACTIVATIONS:
+             raise ValueError(f"Unrecognized activation: {activation}")
+
+         modules: List[nn.Module] = []
+         if hidden_dims:
+             if dropout > 0:
+                 modules.append(nn.Dropout(p=dropout))
+             nprev = embedding_dim
+             for h in hidden_dims:
+                 modules.append(nn.Linear(nprev, h))
+                 modules.append(ACTIVATIONS[activation]())
+                 if dropout > 0:
+                     modules.append(nn.Dropout(p=dropout))
+                 nprev = h
+             modules.append(nn.Linear(nprev, output_dim))
+             if output_act:
+                 modules.append(nn.Tanh())
+         else:
+             modules.append(nn.Linear(embedding_dim, output_dim))
+
+         self.mlp = nn.Sequential(*modules)
+
+     def _norm(self, emb: Optional[Tensor]) -> Optional[Tensor]:
+         return F.normalize(emb) if (emb is not None and self.norm_emb) else emb
+
+     def _featurize(self, src: Tensor, mt: Tensor, ref: Optional[Tensor] = None) -> Tensor:
+         if self.input_form == "COMET":
+             if ref is None:
+                 raise ValueError("COMET input_form requires reference embedding")
+             return torch.cat(
+                 [ref, mt, src * mt, ref * mt, torch.abs(mt - src), torch.abs(mt - ref)],
+                 dim=-1,
+             )
+         elif self.input_form == "QE":
+             return torch.cat([src, mt, src * mt, torch.abs(mt - src)], dim=-1)
+
+
+ # ---------------- HF MODEL WRAPPER ---------------- #
+ class BlaserModel(PreTrainedModel):
+     config_class = BlaserConfig
+
+     def __init__(self, config: BlaserConfig):
+         super().__init__(config)
+         # Directly assign the Sequential MLP to self.mlp
+         core = BlaserCore(
+             embedding_dim=config.embedding_dim,
+             output_dim=config.output_dim,
+             hidden_dims=config.hidden_dims,
+             dropout=config.dropout,
+             activation=config.activation,
+             input_form=config.input_form,
+             norm_emb=config.norm_emb,
+             output_act=config.output_act,
+         )
+         self.mlp = core.mlp
+         self.input_form = core.input_form
+         self.norm_emb = core.norm_emb
+
+     def forward(self, src, mt, ref=None):
+         # Use the same featurization as in BlaserCore
+         src = F.normalize(src) if self.norm_emb else src
+         mt = F.normalize(mt) if self.norm_emb else mt
+         ref = F.normalize(ref) if (ref is not None and self.norm_emb) else ref
+
+         if self.input_form == "COMET":
+             if ref is None:
+                 raise ValueError("COMET input_form requires reference embedding")
+             proc = torch.cat(
+                 [ref, mt, src * mt, ref * mt, torch.abs(mt - src), torch.abs(mt - ref)],
+                 dim=-1,
+             )
+         else:  # QE
+             proc = torch.cat([src, mt, src * mt, torch.abs(mt - src)], dim=-1)
+
+         return self.mlp(proc)
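A quick smoke test of the wrapper with random embeddings (illustrative only; a freshly initialized model will not produce meaningful scores):

```python
import torch
from modeling_blaser import BlaserConfig, BlaserModel

config = BlaserConfig()  # defaults mirror config.json: COMET input form, 1024-dim embeddings
model = BlaserModel(config).eval()

# Two random (source, MT, reference) embedding triplets
src, mt, ref = (torch.randn(2, 1024) for _ in range(3))
with torch.inference_mode():
    scores = model(src, mt, ref)
print(scores.shape)  # torch.Size([2, 1]): one score per triplet
```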