## How to Use
### 1. Load the model and tokenizer
```python
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import torch
from tqdm import tqdm
import json

model_name = "oxygen65/llm-jp-3-13b-finetune-3"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
```
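Optionally, a short greedy generation confirms that the 4-bit model loaded correctly. This is a minimal sketch; the prompt below is an arbitrary example written in the instruction format used later in step 4, not part of the original pipeline.

```python
# Optional sanity check: the prompt is an arbitrary example in the
# "### 指示 / ### 回答" (instruction / answer) format used in step 4.
check_prompt = "### 指示\n日本の首都はどこですか?\n### 回答\n"
check_input = tokenizer.encode(check_prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
with torch.no_grad():
    check_out = model.generate(check_input, max_new_tokens=32, do_sample=False)
print(tokenizer.decode(check_out[0][check_input.size(1):], skip_special_tokens=True))
```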
### 2. Load the evaluation datasets
```python
from datasets import load_dataset

# Evaluation set: elyza-tasks-100-TV (JSONL; a record may span several lines,
# so accumulate lines until the buffer ends with "}" and parse it as one object).
tasks = []
with open("./elyza-tasks-100-TV_0.jsonl", "r") as f:
    item = ""
    for line in f:
        line = line.strip()
        item += line
        if item.endswith("}"):
            tasks.append(json.loads(item))
            item = ""

# Few-shot example pool: the public ELYZA-tasks-100 test split
sample_task_ds = load_dataset("elyza/ELYZA-tasks-100")
sample_tasks = sample_task_ds["test"]
sample_tasks["input"][0]
```
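A quick check that both datasets loaded as expected (a hypothetical addition here, reusing the objects defined above):

```python
# Both datasets should be non-empty; each evaluation record carries
# a task_id and an input (the output is what we generate in step 4).
print(f"evaluation tasks: {len(tasks)}")
print(f"few-shot pool:    {len(sample_tasks)}")
print(tasks[0].keys())
```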
### 3. Set up the retrievers
If the `rank_bm25` package is not available in your environment, install it first:

```python
!pip install rank_bm25
```
```python
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

# Download the required tokenizer data (first run only)
nltk.download('punkt')
nltk.download('punkt_tab')

def search_similar_documents_bm25(query, sample_tasks):
    # Tokenize the corpus (BM25 expects tokenized documents)
    tokenized_documents = [word_tokenize(doc) for doc in sample_tasks['input']]
    # Build the BM25 index
    bm25 = BM25Okapi(tokenized_documents)
    tokenized_query = word_tokenize(query)
    # Compute a relevance score for every document
    doc_scores = bm25.get_scores(tokenized_query)
    # Sort indices by descending score
    sorted_indexes = np.argsort(doc_scores)[::-1]
    # Keep only hits whose BM25 score clears an absolute threshold of 20.0
    indexes = []
    for i in range(len(doc_scores)):
        if doc_scores[sorted_indexes[i]] < 20.0:
            break
        indexes.append(sorted_indexes[i])
    return indexes
```
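A minimal usage sketch, reusing `tasks` and `sample_tasks` from step 2 (the choice of query is arbitrary). Note that 20.0 is an absolute BM25 score cutoff, so the number of hits varies with query length and vocabulary overlap:

```python
# Example: retrieve ELYZA-tasks-100 items similar to the first evaluation task.
hits = search_similar_documents_bm25(tasks[0]["input"], sample_tasks)
for i in hits:
    print(sample_tasks["input"][i][:60])
```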
```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

SentTF = SentenceTransformer('all-MiniLM-L6-v2')

def search_similar_documents_neural_retriever(query, sample_tasks):
    global SentTF
    emb1 = SentTF.encode([query])
    emb2 = SentTF.encode(sample_tasks['input'])
    # Compute cosine similarity between the query and every document
    # (slow: ideally the corpus embeddings would be computed once up front)
    similarity_matrix = cosine_similarity(emb1, emb2)
    # Sort indices by descending similarity
    sorted_indexes = np.argsort(similarity_matrix[0])[::-1]
    # Keep only hits with cosine similarity of at least 0.75
    indexes = []
    for i in range(len(sample_tasks['input'])):
        if similarity_matrix[0][sorted_indexes[i]] < 0.75:
            break
        indexes.append(sorted_indexes[i])
    return indexes
```
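As the inline comment notes, encoding the entire pool on every call is wasteful. A minimal cached variant (an addition here, not used by the pipeline below; it reuses `SentTF`, `cosine_similarity`, and `np` from the block above):

```python
# Encode the few-shot pool once and reuse the embeddings across queries.
corpus_emb = SentTF.encode(sample_tasks['input'])

def search_similar_documents_neural_cached(query, threshold=0.75):
    query_emb = SentTF.encode([query])
    scores = cosine_similarity(query_emb, corpus_emb)[0]
    order = np.argsort(scores)[::-1]
    # Scores are sorted descending, so this keeps exactly the hits >= threshold.
    return [i for i in order if scores[i] >= threshold]
```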
```python
def create_icl_prompt(input, sample_tasks, task_id):
    # task_id is accepted for symmetry with the task records but unused here.
    # Merge hits from both retrievers, removing duplicates.
    indexes_bm25 = search_similar_documents_bm25(input, sample_tasks)
    indexes_neu = search_similar_documents_neural_retriever(input, sample_tasks)
    indexes = list(set(indexes_bm25 + indexes_neu))
    if indexes == []:
        return ""
    # Prompt section headers are in Japanese: 例題 = worked examples,
    # 指示 = instruction, 回答 = answer, 本題 = the actual task.
    icl_prompt = "## 例題\n"
    for i in range(len(indexes)):
        icl_prompt += f"""### 指示
{sample_tasks["input"][indexes[i]]}
### 回答
{sample_tasks["output"][indexes[i]]}
"""
    icl_prompt += """
## 本題: 以下の指示に従って回答してください。step by stepで回答してください。
"""
    return icl_prompt

create_icl_prompt(tasks[2]["input"], sample_tasks, 0)
```
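If neither retriever returns a sufficiently similar example, `create_icl_prompt` returns an empty string and the model answers the task zero-shot; otherwise the retrieved examples are prepended as a few-shot prefix ahead of the actual instruction in step 4.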
### 4. Inference
```python
# Inference loop for llm-jp
import re

# A one-line answer that merely starts with "以下" ("The following ...")
# is treated as a degenerate completion and regenerated with sampling.
pattern = r"^以下.*$"

sys_prompt = ""
results = []
for data in tqdm(tasks):
    task_id = data["task_id"]
    input = data["input"]
    # Few-shot (in-context learning) prefix built from retrieved examples
    icl_prompt = create_icl_prompt(input, sample_tasks, task_id)
    prompt = f"""{sys_prompt}{icl_prompt}### 指示
{input}
### 回答
"""
    tokenized_input = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            tokenized_input,
            max_new_tokens=512,
            do_sample=False,
            repetition_penalty=1.2,
            eos_token_id=tokenizer.eos_token_id,
        )[0]
    output = tokenizer.decode(outputs[tokenized_input.size(1):], skip_special_tokens=True)
    # Retry with sampling until the output is not a degenerate one-liner
    while True:
        lines = output.splitlines()
        if len(lines) == 1 and re.match(pattern, lines[0]):
            print(f"#========================= Unexpected answer =========================#\n {lines}")
            outputs = model.generate(
                tokenized_input,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.4,
                repetition_penalty=1.2,
            )[0]
            output = tokenizer.decode(outputs[tokenized_input.size(1):], skip_special_tokens=True)
        else:
            break
    results.append({"task_id": data["task_id"], "input": input, "output": output})
    print(f"task_id: {data['task_id']}, prompt: {prompt}, output: {output}")
```
### 5. Dump results
```python
import re

# Strip the org prefix from the repo id to name the output file
model_name = re.sub(".*/", "", model_name)
with open(f"./{model_name}-outputs.jsonl", 'w', encoding='utf-8') as f:
    for result in results:
        json.dump(result, f, ensure_ascii=False)  # ensure_ascii=False preserves non-ASCII characters
        f.write('\n')
```
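Optionally, reading the file back confirms that every line parses as one JSON record (a hypothetical check, not part of the original pipeline):

```python
# Optional: verify the dumped JSONL parses line by line.
with open(f"./{model_name}-outputs.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f]
print(f"wrote {len(records)} results")
```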
## Uploaded model

- Developed by: oxygen65

This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
## Model tree for oxygen65/llm-jp-3-13b-finetune-3

- Base model: llm-jp/llm-jp-3-13b