# NMIXX-bge-icl
This repository contains a BGE-ICL-based embedding model fine-tuned with a triplet-loss setup on the nmixx-fin/NMIXX_train dataset. It produces high-quality sentence embeddings for Korean financial text, optimized for semantic similarity tasks in the finance domain.
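Triplet training optimizes the encoder so that an anchor sentence ends up closer to a semantically matching positive than to a negative, by at least a margin. The sketch below shows the general form of such an objective; the cosine distance and the margin value are illustrative assumptions, since this card does not publish the exact training configuration.

```python
import torch
import torch.nn.functional as F

def triplet_loss(anchor, positive, negative, margin=0.5):
    # Illustrative triplet objective (assumed form, not the published recipe):
    # pull positives toward the anchor, push negatives at least `margin` away.
    pos_dist = 1.0 - F.cosine_similarity(anchor, positive, dim=-1)
    neg_dist = 1.0 - F.cosine_similarity(anchor, negative, dim=-1)
    return torch.relu(pos_dist - neg_dist + margin).mean()
```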
## How to use
```python
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    # Pool each sequence to the hidden state of its last non-padding token.
    # Handles both left- and right-padded batches.
    left_padding = attention_mask[:, -1].sum() == attention_mask.size(0)
    if left_padding:
        return last_hidden_states[:, -1]
    seq_lens = attention_mask.sum(dim=1) - 1
    idx = torch.arange(last_hidden_states.size(0), device=last_hidden_states.device)
    return last_hidden_states[idx, seq_lens]


def get_detailed_instruct(task: str, query: str) -> str:
    return f"<instruct>{task}\n<query>{query}"


def get_detailed_example(task: str, query: str, response: str) -> str:
    return f"<instruct>{task}\n<query>{query}\n<response>{response}"


def get_new_queries(queries, query_max_len, examples_prefix, tokenizer):
    # Truncate each query so that, together with the special tokens and the
    # few-shot prefix, it still fits within the final query budget.
    tmp = tokenizer(
        queries,
        max_length=query_max_len
        - len(tokenizer("<s>", add_special_tokens=False)["input_ids"])
        - len(tokenizer("\n<response></s>", add_special_tokens=False)["input_ids"]),
        truncation=True,
        return_tensors=None,
        add_special_tokens=False,
    )
    prefix_ids = tokenizer(examples_prefix, add_special_tokens=False)["input_ids"]
    suffix_ids = tokenizer("\n<response>", add_special_tokens=False)["input_ids"]
    # Round the combined length up to a multiple of 8, with a little headroom.
    new_max = (len(prefix_ids) + len(suffix_ids) + query_max_len + 8) // 8 * 8 + 8
    decoded = tokenizer.batch_decode(tmp["input_ids"])
    return new_max, [examples_prefix + d + "\n<response>" for d in decoded]


model_name = "nmixx-fin/nmixx-bge-icl"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval().to("cuda" if torch.cuda.is_available() else "cpu")

# Task instruction and two in-context examples (Korean; English translations in comments).
task = "제시된 기준 문장과 의미가 가장 유사한 문장을 찾으세요."  # "Find the sentence whose meaning is most similar to the given reference sentence."
examples = [
    {
        "query": "나는 오늘 기분이 아주 좋아",  # "I'm in a really good mood today"
        "response": "오늘 정말 활기차고 행복한 하루였어요.",  # "Today was a really lively, happy day."
    },
    {
        "query": "바람이 많이 부는 날씨",  # "Very windy weather"
        "response": "바람이 세차게 불어 머리가 헝클어졌어요.",  # "The wind blew so hard my hair got tangled."
    },
]
example_strs = [get_detailed_example(task, e["query"], e["response"]) for e in examples]
examples_prefix = "\n\n".join(example_strs) + "\n\n"

queries = [
    get_detailed_instruct(task, "점심으로 피자를 먹었어요"),  # "I had pizza for lunch"
    get_detailed_instruct(task, "비가 오려나?"),  # "Is it going to rain?"
]
documents = [
    "오늘 햇빛이 쨍쨍해서 산책하기 딱 좋은 날씨였습니다.",  # "The sun was blazing today, perfect weather for a walk."
    "어제 저녁에 비가 내려서 길이 조금 젖어 있었습니다.",  # "It rained yesterday evening, so the road was a little wet."
]

device = model.device
q_max, new_queries = get_new_queries(queries, 512, examples_prefix, tokenizer)
q_batch = tokenizer(new_queries, max_length=q_max, padding=True, truncation=True, return_tensors="pt").to(device)
d_batch = tokenizer(documents, max_length=512, padding=True, truncation=True, return_tensors="pt").to(device)

with torch.no_grad():
    q_out = model(**q_batch)
    q_emb = last_token_pool(q_out.last_hidden_state, q_batch["attention_mask"])
    d_out = model(**d_batch)
    d_emb = last_token_pool(d_out.last_hidden_state, d_batch["attention_mask"])

# L2-normalize, then cosine similarity scaled to roughly 0-100.
q_emb = F.normalize(q_emb, p=2, dim=1)
d_emb = F.normalize(d_emb, p=2, dim=1)
scores = (q_emb @ d_emb.T) * 100
print(scores.tolist())
```
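`scores` is a queries-by-documents similarity matrix on a roughly 0-100 scale, so each row scores one query against every document. As a small continuation of the example above, you can rank documents per query with a row-wise argmax:

```python
# Pick the best-matching document for each query (continuation of the example above).
best = scores.argmax(dim=1)
for qi, di in enumerate(best.tolist()):
    print(f"query {qi} -> document {di} (score {scores[qi, di].item():.1f})")
```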
## Base model
[BAAI/bge-en-icl](https://huggingface.co/BAAI/bge-en-icl)