Spaces:
Sleeping
Sleeping
Commit
·
4464055
0
Parent(s):
first commit
Browse files- .gitattributes +37 -0
- .gitignore +16 -0
- Dockerfile +24 -0
- README.md +16 -0
- examples/make_dataset/make_arc_easy_1000_choice.py +85 -0
- examples/make_dataset/make_lingoace_400_choice.py +70 -0
- examples/make_dataset/make_lingoace_80_chat.py +222 -0
- examples/test_metrics/lingoace_chat_metric.py +387 -0
- examples/test_metrics/lingoace_chat_metric_test.py +482 -0
- llm_eval_script/aws_claude.py +222 -0
- llm_eval_script/aws_claude_chat.py +213 -0
- llm_eval_script/azure_openai.py +192 -0
- llm_eval_script/azure_openai_chat.py +183 -0
- llm_eval_script/byteplus.py +153 -0
- llm_eval_script/gemini_google.py +176 -0
- llm_eval_script/gemini_google_chat.py +167 -0
- llm_eval_script/ollama.py +101 -0
- llm_eval_script/siliconflow.py +236 -0
- llm_eval_script/siliconflow_chat.py +225 -0
- log.py +222 -0
- main.py +75 -0
- project_settings.py +22 -0
- requirements.txt +4 -0
- tabs/__init__.py +6 -0
- tabs/fs_tab.py +67 -0
- tabs/shell_tab.py +28 -0
- toolbox/__init__.py +6 -0
- toolbox/os/__init__.py +6 -0
- toolbox/os/command.py +59 -0
- toolbox/os/environment.py +48 -0
- toolbox/os/other.py +9 -0
.gitattributes
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
.git/
|
3 |
+
.idea/
|
4 |
+
|
5 |
+
/data/
|
6 |
+
#/data/eval_data
|
7 |
+
/data/raw_dataset
|
8 |
+
/dotenv/
|
9 |
+
/logs/
|
10 |
+
/trained_models
|
11 |
+
/temp/
|
12 |
+
|
13 |
+
**/__pycache__/
|
14 |
+
|
15 |
+
#**/*.wav
|
16 |
+
**/*.xlsx
|
Dockerfile
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.12
|
2 |
+
|
3 |
+
WORKDIR /code
|
4 |
+
|
5 |
+
COPY . /code
|
6 |
+
|
7 |
+
RUN apt-get update
|
8 |
+
RUN apt-get install -y wget unzip ffmpeg build-essential git
|
9 |
+
|
10 |
+
RUN pip install --upgrade pip
|
11 |
+
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
12 |
+
|
13 |
+
RUN useradd -m -u 1000 user
|
14 |
+
|
15 |
+
USER user
|
16 |
+
|
17 |
+
ENV HOME=/home/user \
|
18 |
+
PATH=/home/user/.local/bin:$PATH
|
19 |
+
|
20 |
+
WORKDIR $HOME/app
|
21 |
+
|
22 |
+
COPY --chown=user . $HOME/app
|
23 |
+
|
24 |
+
CMD ["python3", "main.py"]
|
README.md
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: LLM Eval System
|
3 |
+
emoji: 🐨
|
4 |
+
colorFrom: purple
|
5 |
+
colorTo: pink
|
6 |
+
sdk: docker
|
7 |
+
app_port: 7860
|
8 |
+
pinned: false
|
9 |
+
license: apache-2.0
|
10 |
+
---
|
11 |
+
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
|
14 |
+
大模型评估系统
|
15 |
+
https://www.tapd.cn/tapd_fe/37829361/story/detail/1137829361001012418
|
16 |
+
|
examples/make_dataset/make_arc_easy_1000_choice.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
import sys
|
8 |
+
import time
|
9 |
+
|
10 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
11 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
12 |
+
|
13 |
+
from project_settings import environment, project_path
|
14 |
+
|
15 |
+
|
16 |
+
def get_args():
|
17 |
+
parser = argparse.ArgumentParser()
|
18 |
+
parser.add_argument(
|
19 |
+
"--raw_dataset",
|
20 |
+
default=(project_path / "data/raw_dataset/arc-easy-1000-choice.jsonl").as_posix(),
|
21 |
+
type=str
|
22 |
+
)
|
23 |
+
parser.add_argument(
|
24 |
+
"--dataset",
|
25 |
+
default=(project_path / "data/dataset/arc-easy-1000-choice.jsonl").as_posix(),
|
26 |
+
type=str
|
27 |
+
)
|
28 |
+
args = parser.parse_args()
|
29 |
+
return args
|
30 |
+
|
31 |
+
|
32 |
+
def main():
|
33 |
+
args = get_args()
|
34 |
+
|
35 |
+
dataset = Path(args.dataset)
|
36 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
37 |
+
|
38 |
+
with open(args.raw_dataset, "r", encoding="utf-8") as fin, open(args.dataset, "w", encoding="utf-8") as fout:
|
39 |
+
for row in fin:
|
40 |
+
row = json.loads(row)
|
41 |
+
idx = row["id"]
|
42 |
+
question = row["question"]
|
43 |
+
choices = row["choices"]
|
44 |
+
answer_key = row["answerkey"]
|
45 |
+
|
46 |
+
instruct = "Complete this single-choice question."
|
47 |
+
|
48 |
+
choices_str = ""
|
49 |
+
for choice in choices:
|
50 |
+
label = choice["label"]
|
51 |
+
text = choice["text"]
|
52 |
+
choices_str += f"If you think the answer is `{text}` output: `{label}`\n"
|
53 |
+
choices_str = choices_str.strip()
|
54 |
+
|
55 |
+
prompt = f"""
|
56 |
+
{instruct}
|
57 |
+
|
58 |
+
Question:
|
59 |
+
{question}
|
60 |
+
|
61 |
+
Choices:
|
62 |
+
{choices_str}
|
63 |
+
|
64 |
+
Remember to output ONLY the corresponding letter.
|
65 |
+
Your output is:
|
66 |
+
""".strip()
|
67 |
+
|
68 |
+
response = answer_key
|
69 |
+
|
70 |
+
print(f"{prompt}\n{response}")
|
71 |
+
print("-" * 150)
|
72 |
+
|
73 |
+
row_ = {
|
74 |
+
"idx": idx,
|
75 |
+
"prompt": prompt,
|
76 |
+
"response": response,
|
77 |
+
}
|
78 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
79 |
+
fout.write(f"{row_}\n")
|
80 |
+
|
81 |
+
return
|
82 |
+
|
83 |
+
|
84 |
+
if __name__ == "__main__":
|
85 |
+
main()
|
examples/make_dataset/make_lingoace_400_choice.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
import sys
|
8 |
+
import time
|
9 |
+
|
10 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
11 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
12 |
+
|
13 |
+
from project_settings import environment, project_path
|
14 |
+
|
15 |
+
|
16 |
+
def get_args():
|
17 |
+
parser = argparse.ArgumentParser()
|
18 |
+
parser.add_argument(
|
19 |
+
"--raw_dataset",
|
20 |
+
default=(project_path / "data/raw_dataset/agent-lingoace-zh-400-choice").as_posix(),
|
21 |
+
type=str
|
22 |
+
)
|
23 |
+
parser.add_argument(
|
24 |
+
"--dataset",
|
25 |
+
default=(project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl").as_posix(),
|
26 |
+
type=str
|
27 |
+
)
|
28 |
+
args = parser.parse_args()
|
29 |
+
return args
|
30 |
+
|
31 |
+
|
32 |
+
def main():
|
33 |
+
args = get_args()
|
34 |
+
|
35 |
+
raw_dataset = Path(args.raw_dataset)
|
36 |
+
dataset = Path(args.dataset)
|
37 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
38 |
+
|
39 |
+
with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
|
40 |
+
for sample_dir in raw_dataset.glob("*"):
|
41 |
+
idx = sample_dir.parts[-1]
|
42 |
+
system_prompt_file = sample_dir / "system_prompt.txt"
|
43 |
+
user_prompt_file = sample_dir / "user_prompt.txt"
|
44 |
+
response_file = sample_dir / "response.txt"
|
45 |
+
|
46 |
+
with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
47 |
+
system_prompt = f.read()
|
48 |
+
with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
49 |
+
user_prompt = f.read()
|
50 |
+
with open(response_file.as_posix(), "r", encoding="utf-8") as f:
|
51 |
+
response = f.read()
|
52 |
+
|
53 |
+
prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
|
54 |
+
|
55 |
+
print(f"{prompt}\n\n{response}")
|
56 |
+
print("-" * 150)
|
57 |
+
|
58 |
+
row_ = {
|
59 |
+
"idx": idx,
|
60 |
+
"prompt": prompt,
|
61 |
+
"response": response,
|
62 |
+
}
|
63 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
64 |
+
fout.write(f"{row_}\n")
|
65 |
+
|
66 |
+
return
|
67 |
+
|
68 |
+
|
69 |
+
if __name__ == "__main__":
|
70 |
+
main()
|
examples/make_dataset/make_lingoace_80_chat.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
import sys
|
8 |
+
import time
|
9 |
+
|
10 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
11 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
12 |
+
|
13 |
+
from project_settings import environment, project_path
|
14 |
+
|
15 |
+
|
16 |
+
def get_args():
|
17 |
+
parser = argparse.ArgumentParser()
|
18 |
+
parser.add_argument(
|
19 |
+
"--raw_dataset",
|
20 |
+
default=(project_path / "data/raw_dataset/agent-lingoace-zh-80-chat").as_posix(),
|
21 |
+
type=str
|
22 |
+
)
|
23 |
+
parser.add_argument(
|
24 |
+
"--dataset",
|
25 |
+
default=(project_path / "data/dataset/agent-lingoace-zh-80-chat.jsonl").as_posix(),
|
26 |
+
type=str
|
27 |
+
)
|
28 |
+
args = parser.parse_args()
|
29 |
+
return args
|
30 |
+
|
31 |
+
|
32 |
+
"""
|
33 |
+
<人设>
|
34 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
35 |
+
|
36 |
+
<工作>
|
37 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
38 |
+
|
39 |
+
<工作流程>
|
40 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
41 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
42 |
+
|
43 |
+
<常见用户问题的标准回答>
|
44 |
+
1.
|
45 |
+
Q: 号码是哪里来的
|
46 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
47 |
+
2.
|
48 |
+
Q: 询问价格
|
49 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
50 |
+
3.
|
51 |
+
Q: 公司/上课地点
|
52 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
53 |
+
4.
|
54 |
+
Q: 是否机器人
|
55 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
56 |
+
5.
|
57 |
+
Q: 考虑商量
|
58 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
59 |
+
6.
|
60 |
+
Q:客户问AI是否听得到
|
61 |
+
A:在听的,您接着说;
|
62 |
+
7.
|
63 |
+
Q:赞美
|
64 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
65 |
+
8.
|
66 |
+
Q:听不清楚AI说话
|
67 |
+
A:现在您能听清楚吗?;
|
68 |
+
9.
|
69 |
+
Q:课程教材
|
70 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
71 |
+
10.
|
72 |
+
Q:怎么称呼
|
73 |
+
A:您好,您叫我静静老师就可以。;
|
74 |
+
11.
|
75 |
+
Q:干什么的
|
76 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
77 |
+
12.
|
78 |
+
Q:你是骗子吗
|
79 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
80 |
+
13.
|
81 |
+
Q:平台名称
|
82 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
83 |
+
14.
|
84 |
+
Q:联系电话
|
85 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
86 |
+
15.
|
87 |
+
Q:询问线下
|
88 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么问题可以咨询我们专属顾问哈;
|
89 |
+
16.
|
90 |
+
Q:调戏
|
91 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
92 |
+
17.
|
93 |
+
Q:下载软件/APP
|
94 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
95 |
+
18.
|
96 |
+
Q:老师资质
|
97 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
98 |
+
19.
|
99 |
+
Q:优惠活动
|
100 |
+
A:这次帮您申请的是免费名额,您可以先体验一下,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
101 |
+
20.
|
102 |
+
Q:课程内容
|
103 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
104 |
+
21.
|
105 |
+
Q:考虑商量
|
106 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
107 |
+
22.
|
108 |
+
Q:已经报班/已经上课/还有课程包
|
109 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
110 |
+
23.
|
111 |
+
Q:适合多大孩子学习
|
112 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
113 |
+
24.
|
114 |
+
Q:一节课时长
|
115 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
116 |
+
25.
|
117 |
+
Q:不在某地
|
118 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
119 |
+
26.
|
120 |
+
Q:优势及区别
|
121 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
122 |
+
27.
|
123 |
+
Q:有没有其他课程
|
124 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
125 |
+
28.
|
126 |
+
Q:家长旁听
|
127 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
128 |
+
29.
|
129 |
+
Q:正式报名详情
|
130 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
131 |
+
30.
|
132 |
+
Q:试听之后怎么收费
|
133 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
134 |
+
31.
|
135 |
+
Q:判断孩子基础
|
136 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
137 |
+
32.
|
138 |
+
Q:活动时间限制
|
139 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
140 |
+
33.
|
141 |
+
Q:直播还是录播
|
142 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
143 |
+
34.
|
144 |
+
Q:体验过了
|
145 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
146 |
+
35.
|
147 |
+
Q:线上不考虑
|
148 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
149 |
+
36.
|
150 |
+
Q:上课方式
|
151 |
+
A:我们是在线的一对一授课的形式,真人老师互动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
152 |
+
37.
|
153 |
+
Q:固定老师
|
154 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
155 |
+
38.
|
156 |
+
Q:课程是几节课
|
157 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
158 |
+
39.
|
159 |
+
Q:老师是兼职的还是全职的
|
160 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
161 |
+
40.
|
162 |
+
Q:报了线下班
|
163 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
164 |
+
41.
|
165 |
+
Q:孩子不想学
|
166 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
167 |
+
|
168 |
+
案例:
|
169 |
+
[用户]: "hi"
|
170 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
171 |
+
[用户]: "你是机器人吗?"
|
172 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
173 |
+
|
174 |
+
<限制>
|
175 |
+
- 保持友好、专业和机智。
|
176 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
177 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
178 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
179 |
+
- 不要输出 “好的”,可以使用其他同义词
|
180 |
+
- 不要反复说“他/她”,只需要表达一次。
|
181 |
+
"""
|
182 |
+
|
183 |
+
|
184 |
+
def main():
|
185 |
+
args = get_args()
|
186 |
+
|
187 |
+
raw_dataset = Path(args.raw_dataset)
|
188 |
+
dataset = Path(args.dataset)
|
189 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
190 |
+
|
191 |
+
with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
|
192 |
+
for sample_dir in raw_dataset.glob("*"):
|
193 |
+
idx = sample_dir.parts[-1]
|
194 |
+
system_prompt_file = sample_dir / "system_prompt.txt"
|
195 |
+
user_prompt_file = sample_dir / "user_prompt.txt"
|
196 |
+
response_file = sample_dir / "response.txt"
|
197 |
+
|
198 |
+
with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
199 |
+
system_prompt = f.read()
|
200 |
+
with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
201 |
+
user_prompt = f.read()
|
202 |
+
with open(response_file.as_posix(), "r", encoding="utf-8") as f:
|
203 |
+
response = f.read()
|
204 |
+
|
205 |
+
prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
|
206 |
+
|
207 |
+
print(f"{prompt}\n\n{response}")
|
208 |
+
print("-" * 150)
|
209 |
+
|
210 |
+
row_ = {
|
211 |
+
"idx": idx,
|
212 |
+
"prompt": prompt,
|
213 |
+
"response": response,
|
214 |
+
}
|
215 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
216 |
+
fout.write(f"{row_}\n")
|
217 |
+
|
218 |
+
return
|
219 |
+
|
220 |
+
|
221 |
+
if __name__ == "__main__":
|
222 |
+
main()
|
examples/test_metrics/lingoace_chat_metric.py
ADDED
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
from datetime import datetime
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
import re
|
8 |
+
from pathlib import Path
|
9 |
+
import sys
|
10 |
+
import time
|
11 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
12 |
+
|
13 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
14 |
+
sys.path.append(os.path.join(pwd, "../"))
|
15 |
+
|
16 |
+
import openai
|
17 |
+
from openai import AzureOpenAI
|
18 |
+
|
19 |
+
from project_settings import environment, project_path
|
20 |
+
|
21 |
+
|
22 |
+
def get_args():
|
23 |
+
"""
|
24 |
+
python3 azure_openai.py --model_name gpt-4o-mini \
|
25 |
+
--eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
|
26 |
+
--client "us_west(47.88.76.239)" \
|
27 |
+
--create_time_str 20250723_095001 \
|
28 |
+
--interval 10
|
29 |
+
|
30 |
+
python3 azure_openai.py --model_name gpt-4o-mini \
|
31 |
+
--eval_dataset_name arc-easy-1000-choice.jsonl \
|
32 |
+
--client "us_west(47.88.76.239)" \
|
33 |
+
--create_time_str 20250723_111000 \
|
34 |
+
--interval 10
|
35 |
+
|
36 |
+
"""
|
37 |
+
parser = argparse.ArgumentParser()
|
38 |
+
parser.add_argument(
|
39 |
+
"--model_name",
|
40 |
+
default="gpt-4o",
|
41 |
+
# default="gpt-4o-mini",
|
42 |
+
type=str
|
43 |
+
)
|
44 |
+
parser.add_argument(
|
45 |
+
"--eval_data_file",
|
46 |
+
default=(project_path / "data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-R1-Distill-Qwen-7B/shenzhen_sase/siliconflow_api_key/20250724_135543/agent-lingoace-zh-80-chat.jsonl.raw").as_posix(),
|
47 |
+
type=str
|
48 |
+
)
|
49 |
+
parser.add_argument(
|
50 |
+
"--output_file",
|
51 |
+
default=(project_path / "data/eval_data/siliconflow/siliconflow/deepseek-ai#DeepSeek-R1-Distill-Qwen-7B/shenzhen_sase/siliconflow_api_key/20250724_135543/agent-lingoace-zh-80-chat.jsonl").as_posix(),
|
52 |
+
type=str
|
53 |
+
)
|
54 |
+
parser.add_argument(
|
55 |
+
"--service",
|
56 |
+
default="west_us_chatgpt_openai_azure_com",
|
57 |
+
type=str
|
58 |
+
)
|
59 |
+
args = parser.parse_args()
|
60 |
+
return args
|
61 |
+
|
62 |
+
|
63 |
+
task_prompt = """
|
64 |
+
<人设>
|
65 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
66 |
+
|
67 |
+
<工作>
|
68 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
69 |
+
|
70 |
+
<工作流程>
|
71 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
72 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
73 |
+
|
74 |
+
<常见用户问题的标准回答>
|
75 |
+
1.
|
76 |
+
Q: 号码是哪里来的
|
77 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
78 |
+
2.
|
79 |
+
Q: 询问价格
|
80 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
81 |
+
3.
|
82 |
+
Q: 公司/上课地点
|
83 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
84 |
+
4.
|
85 |
+
Q: 是否机器人
|
86 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
87 |
+
5.
|
88 |
+
Q: 考虑商量
|
89 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
90 |
+
6.
|
91 |
+
Q:客户问AI是否听得到
|
92 |
+
A:在听的,您接着说;
|
93 |
+
7.
|
94 |
+
Q:赞美
|
95 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
96 |
+
8.
|
97 |
+
Q:听不清楚AI说话
|
98 |
+
A:现在您能听清楚吗?;
|
99 |
+
9.
|
100 |
+
Q:课程教材
|
101 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
102 |
+
10.
|
103 |
+
Q:怎么称呼
|
104 |
+
A:您好,您叫我静静老师就可以。;
|
105 |
+
11.
|
106 |
+
Q:干什么的
|
107 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
108 |
+
12.
|
109 |
+
Q:你是骗子吗
|
110 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
111 |
+
13.
|
112 |
+
Q:平台名称
|
113 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
114 |
+
14.
|
115 |
+
Q:联系电话
|
116 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
117 |
+
15.
|
118 |
+
Q:询问线下
|
119 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么���题可以咨询我们专属顾问哈;
|
120 |
+
16.
|
121 |
+
Q:调戏
|
122 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
123 |
+
17.
|
124 |
+
Q:下载软件/APP
|
125 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
126 |
+
18.
|
127 |
+
Q:老师资质
|
128 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
129 |
+
19.
|
130 |
+
Q:优惠活动
|
131 |
+
A:这次帮您申请的是免费名额,您可以先体验一下,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
132 |
+
20.
|
133 |
+
Q:课程内容
|
134 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
135 |
+
21.
|
136 |
+
Q:考虑商量
|
137 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
138 |
+
22.
|
139 |
+
Q:已经报班/已经上课/还有课程包
|
140 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
141 |
+
23.
|
142 |
+
Q:适合多大孩子学习
|
143 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
144 |
+
24.
|
145 |
+
Q:一节课时长
|
146 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
147 |
+
25.
|
148 |
+
Q:不在某地
|
149 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
150 |
+
26.
|
151 |
+
Q:优势及区别
|
152 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
153 |
+
27.
|
154 |
+
Q:有没有其他课程
|
155 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
156 |
+
28.
|
157 |
+
Q:家长旁听
|
158 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
159 |
+
29.
|
160 |
+
Q:正式报名详情
|
161 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
162 |
+
30.
|
163 |
+
Q:试听之后怎么收费
|
164 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
165 |
+
31.
|
166 |
+
Q:判断孩子基础
|
167 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
168 |
+
32.
|
169 |
+
Q:活动时间限制
|
170 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
171 |
+
33.
|
172 |
+
Q:直播还是录播
|
173 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
174 |
+
34.
|
175 |
+
Q:体验过了
|
176 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
177 |
+
35.
|
178 |
+
Q:线上不考虑
|
179 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
180 |
+
36.
|
181 |
+
Q:上课方式
|
182 |
+
A:我们是在线的一对一授课的形式,真人老师互动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
183 |
+
37.
|
184 |
+
Q:固定老师
|
185 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
186 |
+
38.
|
187 |
+
Q:课程是几节课
|
188 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
189 |
+
39.
|
190 |
+
Q:老师是兼职的还是全职的
|
191 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
192 |
+
40.
|
193 |
+
Q:报了线下班
|
194 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
195 |
+
41.
|
196 |
+
Q:孩子不想学
|
197 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
198 |
+
|
199 |
+
案例:
|
200 |
+
[用户]: "hi"
|
201 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
202 |
+
[用户]: "你是机器人吗?"
|
203 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
204 |
+
|
205 |
+
<限制>
|
206 |
+
- 保持友好、专业和机智。
|
207 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
208 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
209 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
210 |
+
- 不要输出 “好的”,可以使用其他同义词
|
211 |
+
- 不要反复说“他/她”,只需要表达一次。
|
212 |
+
|
213 |
+
<任务>
|
214 |
+
以上是对约课老师的工作描述。
|
215 |
+
当前我们已经获得了约课老师与家长的对话记录,约课老师实际的回复。
|
216 |
+
现在需要你为约课老师的答复进行评分。
|
217 |
+
|
218 |
+
评分要点:
|
219 |
+
(1)任务相关:句子是否与对话任务相关。总分2分。
|
220 |
+
(2)回复相关:句子是否与家长的回复相关。总分2分。
|
221 |
+
(3)邀约力度:与参考句子进行对比,句子是否具有足够的推销,邀约力度。总分1分。
|
222 |
+
|
223 |
+
<举例>
|
224 |
+
## 当前对话:
|
225 |
+
client: 你好
|
226 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
227 |
+
client:
|
228 |
+
Assistant: 我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
229 |
+
client: 不记得。
|
230 |
+
Assistant: 没关系的,您可以放心哈,因为您之前留过电话,所以我们才能联系到您。而且这次特地帮您争取了免费名额,您看要参加一下吗?
|
231 |
+
client: 您是没听清您是谁。
|
232 |
+
client: 什么?我没听清你前面说的。
|
233 |
+
Assistant:
|
234 |
+
|
235 |
+
## 约课老师实际的句子:
|
236 |
+
您好,现在您能听清楚吗?我是LingoAce的约课老师静静。我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
237 |
+
|
238 |
+
## 评分:
|
239 |
+
{
|
240 |
+
"任务相关": 2,
|
241 |
+
"任务相关 - 解释": "约课老师的回答与任务相关。",
|
242 |
+
"回复相关": 2,
|
243 |
+
"回复相关 - 解释": "约课老师的回答正确的回应了家长的 `没听清` 与 `您是谁` 并进行了邀约任务。回应的很好。",
|
244 |
+
"邀约力度": 1,
|
245 |
+
"邀约力度 - 解释": "邀约力度符合话术要求。"
|
246 |
+
}
|
247 |
+
|
248 |
+
## 当前对话:
|
249 |
+
client: 你好
|
250 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
251 |
+
client:
|
252 |
+
client: 啊,我们不需要的,谢谢啊。
|
253 |
+
Assistant: 我们这次联合
|
254 |
+
client: 关了。
|
255 |
+
Assistant:
|
256 |
+
|
257 |
+
## 约课老师实际的句子:
|
258 |
+
资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?
|
259 |
+
|
260 |
+
## 评分:
|
261 |
+
{
|
262 |
+
"任务相关": 2,
|
263 |
+
"任务相关 - 解释": "约课老师的回答与任务相关,继续介绍了课程的免费试听机会。",
|
264 |
+
"回复相关": 2,
|
265 |
+
"回复相关 - 解释": "家长说 `关了。` 可能是ASR误识别或噪音,约课老师没有理会,继续进行邀约话术。做得很好。",
|
266 |
+
"邀约力度": 0.7,
|
267 |
+
"邀约力度 - 解释": "约课老师进行了邀约,但句子较为冗长,家长可能不耐烦。"
|
268 |
+
}
|
269 |
+
""".strip()
|
270 |
+
|
271 |
+
|
272 |
+
task_prompt_2 = """
|
273 |
+
<请处理下面这个任务>
|
274 |
+
## 当前对话:
|
275 |
+
{conversation}
|
276 |
+
|
277 |
+
## 约课老师实际的句子:
|
278 |
+
{response}
|
279 |
+
|
280 |
+
## 评分:
|
281 |
+
""".strip()
|
282 |
+
|
283 |
+
|
284 |
+
def main():
|
285 |
+
args = get_args()
|
286 |
+
|
287 |
+
service_params = environment.get(args.service, dtype=json.loads)
|
288 |
+
client = AzureOpenAI(
|
289 |
+
**service_params,
|
290 |
+
# api_key="Dqt75blRABmhgrwhfcupd1rq44YqNuEgku8FcFFDrEljMq6gltf0JQQJ99BCACYeBjFXJ3w3AAABACOG2njW",
|
291 |
+
# api_version="2025-01-01-preview",
|
292 |
+
# azure_endpoint="https://west-us-chatgpt.openai.azure.com"
|
293 |
+
)
|
294 |
+
|
295 |
+
total = 0
|
296 |
+
total_score = 0
|
297 |
+
|
298 |
+
# finished
|
299 |
+
finished_idx_set = set()
|
300 |
+
if os.path.exists(args.output_file):
|
301 |
+
with open(args.output_file, "r", encoding="utf-8") as f:
|
302 |
+
for row in f:
|
303 |
+
row = json.loads(row)
|
304 |
+
idx = row["idx"]
|
305 |
+
total = row["total"]
|
306 |
+
total_score = row["total_score"]
|
307 |
+
finished_idx_set.add(idx)
|
308 |
+
print(f"finished count: {len(finished_idx_set)}")
|
309 |
+
|
310 |
+
with open(args.eval_data_file, "r", encoding="utf-8") as fin, open(args.output_file, "a+", encoding="utf-8") as fout:
|
311 |
+
for row in fin:
|
312 |
+
row = json.loads(row)
|
313 |
+
idx = row["idx"]
|
314 |
+
prompt = row["prompt"]
|
315 |
+
response = row["response"]
|
316 |
+
prediction = row["prediction"]
|
317 |
+
time_cost = row["time_cost"]
|
318 |
+
|
319 |
+
if idx in finished_idx_set:
|
320 |
+
continue
|
321 |
+
finished_idx_set.add(idx)
|
322 |
+
|
323 |
+
conversation = prompt.split("\n\n")[-1].strip()
|
324 |
+
|
325 |
+
task_prompt_2_ = task_prompt_2.format(conversation=conversation, response=prediction)
|
326 |
+
# print(task_prompt_2_)
|
327 |
+
|
328 |
+
task_prompt_ = task_prompt + "\n\n" + task_prompt_2_
|
329 |
+
# print(task_prompt_)
|
330 |
+
|
331 |
+
try:
|
332 |
+
llm_response = client.chat.completions.create(
|
333 |
+
model=args.model_name,
|
334 |
+
messages=[{"role": "user", "content": task_prompt_}],
|
335 |
+
stream=False,
|
336 |
+
top_p=0.95,
|
337 |
+
temperature=0.6,
|
338 |
+
)
|
339 |
+
except openai.BadRequestError as e:
|
340 |
+
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
|
341 |
+
continue
|
342 |
+
except openai.InternalServerError as e:
|
343 |
+
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
|
344 |
+
continue
|
345 |
+
|
346 |
+
content = llm_response.choices[0].message.content
|
347 |
+
try:
|
348 |
+
evaluate = json.loads(content)
|
349 |
+
except json.decoder.JSONDecodeError as e:
|
350 |
+
print(content)
|
351 |
+
raise e
|
352 |
+
score1 = evaluate["任务相关"]
|
353 |
+
score2 = evaluate["回复相关"]
|
354 |
+
score3 = evaluate["邀约力度"]
|
355 |
+
|
356 |
+
score1 = float(score1)
|
357 |
+
score2 = float(score2)
|
358 |
+
score3 = float(score3)
|
359 |
+
|
360 |
+
score = (score1 + score2 + score3) / 5
|
361 |
+
|
362 |
+
total += 1
|
363 |
+
total_score += score
|
364 |
+
average_score = total_score / total
|
365 |
+
print(f"average_score: {average_score}")
|
366 |
+
|
367 |
+
row_ = {
|
368 |
+
"idx": idx,
|
369 |
+
"prompt": prompt,
|
370 |
+
"response": response,
|
371 |
+
"prediction": prediction,
|
372 |
+
"time_cost": time_cost,
|
373 |
+
"evaluate": evaluate,
|
374 |
+
"total": total,
|
375 |
+
"score": score,
|
376 |
+
"total_score": total_score,
|
377 |
+
"average_score": average_score,
|
378 |
+
|
379 |
+
}
|
380 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
381 |
+
fout.write(f"{row_}\n")
|
382 |
+
|
383 |
+
return
|
384 |
+
|
385 |
+
|
386 |
+
if __name__ == "__main__":
|
387 |
+
main()
|
examples/test_metrics/lingoace_chat_metric_test.py
ADDED
@@ -0,0 +1,482 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
from datetime import datetime
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
import sys
|
9 |
+
import time
|
10 |
+
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no install needed
|
11 |
+
|
12 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
13 |
+
sys.path.append(os.path.join(pwd, "../"))
|
14 |
+
|
15 |
+
import openai
|
16 |
+
from openai import AzureOpenAI
|
17 |
+
|
18 |
+
from project_settings import environment, project_path
|
19 |
+
|
20 |
+
|
21 |
+
def get_args():
    """
    Parse command-line options for this one-shot metric prompt test.

    Usage:
        python3 lingoace_chat_metric_test.py --model_name gpt-4o \
            --service west_us_chatgpt_openai_azure_com

    Returns:
        argparse.Namespace with:
            model_name: Azure OpenAI deployment name to query.
            service: key in the environment store whose value is a JSON
                blob of AzureOpenAI client parameters (endpoint, api key,
                api version).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="gpt-4o",
        # default="gpt-4o-mini",
        type=str
    )
    parser.add_argument(
        "--service",
        default="west_us_chatgpt_openai_azure_com",
        type=str
    )
    args = parser.parse_args()
    return args
|
50 |
+
|
51 |
+
|
52 |
+
prompt1 = """
|
53 |
+
<人设>
|
54 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
55 |
+
|
56 |
+
<工作>
|
57 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
58 |
+
|
59 |
+
<工作流程>
|
60 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
61 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
62 |
+
|
63 |
+
<常见用户问题的标准回答>
|
64 |
+
1.
|
65 |
+
Q: 号码是哪里来的
|
66 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
67 |
+
2.
|
68 |
+
Q: 询问价格
|
69 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
70 |
+
3.
|
71 |
+
Q: 公司/上课地点
|
72 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
73 |
+
4.
|
74 |
+
Q: 是否机器人
|
75 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
76 |
+
5.
|
77 |
+
Q: 考虑商量
|
78 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
79 |
+
6.
|
80 |
+
Q:客户问AI是否听得到
|
81 |
+
A:在听的,您接着说;
|
82 |
+
7.
|
83 |
+
Q:赞美
|
84 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
85 |
+
8.
|
86 |
+
Q:听不清楚AI说话
|
87 |
+
A:现在您能听清楚吗?;
|
88 |
+
9.
|
89 |
+
Q:课程教材
|
90 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
91 |
+
10.
|
92 |
+
Q:怎么称呼
|
93 |
+
A:您好,您叫我静静老师就可以。;
|
94 |
+
11.
|
95 |
+
Q:干什么的
|
96 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
97 |
+
12.
|
98 |
+
Q:你是骗子吗
|
99 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
100 |
+
13.
|
101 |
+
Q:平台名称
|
102 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
103 |
+
14.
|
104 |
+
Q:联系电话
|
105 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
106 |
+
15.
|
107 |
+
Q:询问线下
|
108 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么问题可以咨询我们专属顾问哈;
|
109 |
+
16.
|
110 |
+
Q:调戏
|
111 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
112 |
+
17.
|
113 |
+
Q:下载软件/APP
|
114 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
115 |
+
18.
|
116 |
+
Q:老师资质
|
117 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
118 |
+
19.
|
119 |
+
Q:优惠活动
|
120 |
+
A:这次帮您申请的是免费名额,您可以先体验一下,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
121 |
+
20.
|
122 |
+
Q:课程内容
|
123 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
124 |
+
21.
|
125 |
+
Q:考虑商量
|
126 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
127 |
+
22.
|
128 |
+
Q:已经报班/已经上课/还有课程包
|
129 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
130 |
+
23.
|
131 |
+
Q:适合多大孩子学习
|
132 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
133 |
+
24.
|
134 |
+
Q:一节课时长
|
135 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
136 |
+
25.
|
137 |
+
Q:不在某地
|
138 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
139 |
+
26.
|
140 |
+
Q:优势及区别
|
141 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
142 |
+
27.
|
143 |
+
Q:有没有其他课程
|
144 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
145 |
+
28.
|
146 |
+
Q:家长旁听
|
147 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
148 |
+
29.
|
149 |
+
Q:正式报名详情
|
150 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
151 |
+
30.
|
152 |
+
Q:试听之后怎么收费
|
153 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
154 |
+
31.
|
155 |
+
Q:判断孩子基础
|
156 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
157 |
+
32.
|
158 |
+
Q:活动时间限制
|
159 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
160 |
+
33.
|
161 |
+
Q:直播还是录播
|
162 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
163 |
+
34.
|
164 |
+
Q:体验过了
|
165 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
166 |
+
35.
|
167 |
+
Q:线上不考虑
|
168 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
169 |
+
36.
|
170 |
+
Q:上课方式
|
171 |
+
A:我们是在线的一对一授课的形式,真人老师互动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
172 |
+
37.
|
173 |
+
Q:固定老师
|
174 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
175 |
+
38.
|
176 |
+
Q:课程是几节课
|
177 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
178 |
+
39.
|
179 |
+
Q:老师是兼职的还是全职的
|
180 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
181 |
+
40.
|
182 |
+
Q:报了线下班
|
183 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
184 |
+
41.
|
185 |
+
Q:孩子不想学
|
186 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
187 |
+
|
188 |
+
案例:
|
189 |
+
[用户]: "hi"
|
190 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
191 |
+
[用户]: "你是机器人吗?"
|
192 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
193 |
+
|
194 |
+
<限制>
|
195 |
+
- 保持友好、专业和机智。
|
196 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
197 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
198 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
199 |
+
- 不要输出 “好的”,可以使用其他同义词
|
200 |
+
- 不要反复说“他/她”,只需要表达一次。
|
201 |
+
|
202 |
+
<任务>
|
203 |
+
以上是对约课老师的工作描述。
|
204 |
+
当前我们已经获得了约课老师与家长的对话记录,一个参考的回答方案,以及约课老师实际的回复。
|
205 |
+
现在需要你为约课老师的答复进行评分。
|
206 |
+
|
207 |
+
评分要点:
|
208 |
+
(1)任务相关:句子是否与对话任务相关。2分。
|
209 |
+
(2)回复相关:句子是否与家长的回复相关。2分。
|
210 |
+
(3)邀约力度:与参考句子进行对比,句子是否具有足够的推销,邀约力度。1分。
|
211 |
+
|
212 |
+
<举例>
|
213 |
+
## 当前对话:
|
214 |
+
client: 你好
|
215 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
216 |
+
client:
|
217 |
+
Assistant: 我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
218 |
+
client: 不记得。
|
219 |
+
Assistant: 没关系的,您可以放心哈,因为您之前留过电话,所以我们才能联系到您。而且这次特地帮您争取了免费名额,您看要参加一下吗?
|
220 |
+
client: 您是没听清您是谁。
|
221 |
+
client: 什么?我没听清你前面说的。
|
222 |
+
Assistant:
|
223 |
+
|
224 |
+
## 参考句子:
|
225 |
+
抱歉让您听不清,我是LingoAce的约课老师,今天联系您是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?
|
226 |
+
|
227 |
+
## 约课老师实际的句子:
|
228 |
+
您好,现在您能听清楚吗?我是LingoAce的约课老师静静。我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
229 |
+
|
230 |
+
## 评分:
|
231 |
+
{"任务相关": 2, "回复相关": 2, "邀约力度": 1}
|
232 |
+
|
233 |
+
<请处理下面这个任务>
|
234 |
+
|
235 |
+
## 当前对话:
|
236 |
+
client: 你好
|
237 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
238 |
+
client:
|
239 |
+
client: 啊,我们不需要的,谢谢啊。
|
240 |
+
Assistant: 我们这次联合
|
241 |
+
client: 关了。
|
242 |
+
Assistant:
|
243 |
+
|
244 |
+
## 参考句子:
|
245 |
+
我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
246 |
+
|
247 |
+
## 约课老师实际的句子:
|
248 |
+
资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?
|
249 |
+
|
250 |
+
## 评分:
|
251 |
+
""".strip()
|
252 |
+
|
253 |
+
|
254 |
+
prompt = """
|
255 |
+
<人设>
|
256 |
+
LingoAce是全球最大的中文学习平台。他的约课老师静静,为客户提供中文学习的电话咨询与课程销售。
|
257 |
+
|
258 |
+
<工作>
|
259 |
+
他正在与张三在进行电话沟通,他今天的目的是向客户介绍 lingoAce 的一个中文教育课程,并向家长发起一个免费课程试听的邀请。并且他发挥专业销售能力,对于没有意向的客户也会进行劝说。
|
260 |
+
|
261 |
+
<工作流程>
|
262 |
+
第一步: 介绍听课机会,并且发出邀请询问,话术使用”我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?”
|
263 |
+
第二步:如果客户有其它疑问,你会参考下面的标准回答进行答复,并结合用户回答,进行劝说,争取用户表达出听课意愿。可以参考的销售挽留话术为”因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?”
|
264 |
+
|
265 |
+
<常见用户问题的标准回答>
|
266 |
+
1.
|
267 |
+
Q: 号码是哪里来的
|
268 |
+
A: 请您放心哈,因为您之前留过电话,所以我们才能联系到您,而且特地帮您争取了免费名额,您看要参加一下吗?;
|
269 |
+
2.
|
270 |
+
Q: 询问价格
|
271 |
+
A: 这次给您申请的是免费名额,不收费哈,来电就是想邀请孩子免费体验中问水平测试课程的,您让孩子来试试看吧?;
|
272 |
+
3.
|
273 |
+
Q: 公司/上课地点
|
274 |
+
A: 我们专业老师在线授课哦,宝贝在电脑、平板都可以学习,在家就可以上课,很方便的。;
|
275 |
+
4.
|
276 |
+
Q: 是否机器人
|
277 |
+
A: 哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?;
|
278 |
+
5.
|
279 |
+
Q: 考虑商量
|
280 |
+
A: 你好,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
281 |
+
6.
|
282 |
+
Q:客户问AI是否听得到
|
283 |
+
A:在听的,您接着说;
|
284 |
+
7.
|
285 |
+
Q:赞美
|
286 |
+
A:感谢感谢,谢谢您。那您看这次的活动可以给孩子报个名来体验下吗?;
|
287 |
+
8.
|
288 |
+
Q:听不清楚AI说话
|
289 |
+
A:现在您能听清楚吗?;
|
290 |
+
9.
|
291 |
+
Q:课程教材
|
292 |
+
A:只需要小朋友有电脑或者平板就可以体验了;
|
293 |
+
10.
|
294 |
+
Q:怎么称呼
|
295 |
+
A:您好,您叫我静静老师就可以。;
|
296 |
+
11.
|
297 |
+
Q:干什么的
|
298 |
+
A:是这样的,我是LingoAce的老师,您还有一节免费试听的权益没有领取,是有真人老师一对一直播上课的,您让孩子来体验看看吧?;
|
299 |
+
12.
|
300 |
+
Q:你是骗子吗
|
301 |
+
A:您多虑了,我们是LingoAce的老师,这次也是根据您之前注册的信息来联系您的,您也可以登录我们app看一下的;
|
302 |
+
13.
|
303 |
+
Q:平台名称
|
304 |
+
A:我们是LingoAce,您之前有在我们平台给宝贝了解过数学课程的哈。;
|
305 |
+
14.
|
306 |
+
Q:联系电话
|
307 |
+
A:您如果想要了解更多课程内容,稍后我们专属顾问会主动与您取得联系的;
|
308 |
+
15.
|
309 |
+
Q:询问线下
|
310 |
+
A:我们是线上授课的方式,线下是没有的,您可以先来体验下,有什么问题可以咨询我们专属顾问哈;
|
311 |
+
16.
|
312 |
+
Q:调戏
|
313 |
+
A:非常抱歉呢,跟工作不相关的问题上班时间我们是不允许聊的,咱们还是回归到宝贝课程的学习上来吧。;
|
314 |
+
17.
|
315 |
+
Q:下载软件/APP
|
316 |
+
A:稍后课程顾问会联系您,告诉您详细的操作流程的。;
|
317 |
+
18.
|
318 |
+
Q:老师资质
|
319 |
+
A:老师资质这块您是可以放心的,我们老师都是毕业于全球知名院校的专业科班老师,也都是经过我们层层筛选才能上任的呢;
|
320 |
+
19.
|
321 |
+
Q:优惠活动
|
322 |
+
A:这次帮您申请的是免费名额,您可以先体验一下,关于正式课程的优惠,我们专业的专属顾问稍后会给您全面介绍的;
|
323 |
+
20.
|
324 |
+
Q:课程内容
|
325 |
+
A:稍后会有课程顾问联系您,给您详细介绍的。;
|
326 |
+
21.
|
327 |
+
Q:考虑商量
|
328 |
+
A:我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,注重扎实基础和应用能力的提升,可以提升孩子思维能力。现在有免费试听的机会,我先帮您约上,您看看是否适合孩子,可以吗;
|
329 |
+
22.
|
330 |
+
Q:已经报班/已经上课/还有课程包
|
331 |
+
A:好的,您多考虑一下,我们的课程也是最新升级了,帮助提升孩子思维能力。这次给您争取的免费名额,您先让孩子体验一下,您看可以吗?;
|
332 |
+
23.
|
333 |
+
Q:适合多大孩子学习
|
334 |
+
A:我们的课程,主要针对的是3-18岁左右的孩子学习数学的哦。;
|
335 |
+
24.
|
336 |
+
Q:一节课时长
|
337 |
+
A:我们有25分钟和55分钟的课程,可以根据小朋友的实际情况进行灵活选择的;
|
338 |
+
25.
|
339 |
+
Q:不在某地
|
340 |
+
A:我们是真人老师在线一对一授课,更有趣味性,而且孩子在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。;
|
341 |
+
26.
|
342 |
+
Q:优势及区别
|
343 |
+
A:哎,我们的课程是经过教育专家们精心编排过的,非常适合各个年龄段孩子的认知特点。而且是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。您可以让孩子来体验看看的;
|
344 |
+
27.
|
345 |
+
Q:有没有其他课程
|
346 |
+
A:有的,比如国际数学竞赛类的,袋鼠竞赛,AMC8 等等课程,如果您感兴趣,我们可以让老师回电给您介绍一下;
|
347 |
+
28.
|
348 |
+
Q:家长旁听
|
349 |
+
A:旁听可以的,第一次上课我们也希望您可以陪同,可以更好的了解我们的课程,但是不建议您对孩子有太多指点呢~;
|
350 |
+
29.
|
351 |
+
Q:正式报名详情
|
352 |
+
A:您先和小朋友一起来体验哈,试听结束后,您觉得小朋友挺喜欢,具体的费用您可以到时候直接咨询我们老师哈;
|
353 |
+
30.
|
354 |
+
Q:试听之后怎么收费
|
355 |
+
A:这个后面会有专属顾问联系您,跟您说一下这个问题的;
|
356 |
+
31.
|
357 |
+
Q:判断孩子基础
|
358 |
+
A:我们这边有很多零基础的小朋友,而且我们的老师中文教学经验很丰富,注重扎实基础和应用能力的提升。这个您不用担心的,可以先试试,好吧?;
|
359 |
+
32.
|
360 |
+
Q:活动时间限制
|
361 |
+
A:这个是我们给宝贝争取的免费名额,现在登记一下,稍后就会有课程老师来跟您约具体时间的。;
|
362 |
+
33.
|
363 |
+
Q:直播还是录播
|
364 |
+
A:我们是真人老师授课哦,直播上课的,有互动有交流。;
|
365 |
+
34.
|
366 |
+
Q:体验过了
|
367 |
+
A:哎,是因为咱们课程又更新了,我们课程更新了,现在是针对北美、澳洲、欧洲小学阶段的中文课程,更注重扎实基础和应用能力的提升,可以提升孩子思维能力。我们给您争取了一个免费名额,您让孩子再来试试看吧?;
|
368 |
+
35.
|
369 |
+
Q:线上不考虑
|
370 |
+
A:哎,线上很方便的,孩子不用线下到处跑,在家就能学习,而且我们是一对一授课的模式,可以根据孩子基础有针对性地开展教学计划,而且我们课后还会生成专属的学习报告呢;
|
371 |
+
36.
|
372 |
+
Q:上课方式
|
373 |
+
A:我们是在线的一对一授课的形式,真人老师互动,现在可以免费体验,在电脑、苹果Ipad都可以学习,您随时都可以预约上课,在家就能学,很方便的。 ;
|
374 |
+
37.
|
375 |
+
Q:固定老师
|
376 |
+
A:是的,正式课我们是固定老师授课,可以熟悉孩子情况,熟悉学习进度,对孩子做到更针对的教学呢。;
|
377 |
+
38.
|
378 |
+
Q:课程是几节课
|
379 |
+
A:试听课是一节哦,现在邀请孩子免费体验。您看孩子有需要的话,稍后我让专属的顾问老师来帮您预约课程吧;
|
380 |
+
39.
|
381 |
+
Q:老师是兼职的还是全职的
|
382 |
+
A:我们的老师都是经过公司内部的培训加选拔制度选出来的,老师资质这块您是可以放心的;
|
383 |
+
40.
|
384 |
+
Q:报了线下班
|
385 |
+
A:那您正好可以利用这节体验检测下孩子目前的学习效果,我们可以全方位分析孩子哪一块比较薄弱,有针对性的提供学习建议。而且每家机构教学方法都不太一样,您可以先体验一下嘛。;
|
386 |
+
41.
|
387 |
+
Q:孩子不想学
|
388 |
+
A:这次安排的课程老师教学经验很丰富,会根据孩子情况调整授课方式的,很多小朋友反映说很有趣,您可以让孩子试试,也让我们老师给您一些建议,好吗?;
|
389 |
+
|
390 |
+
案例:
|
391 |
+
[用户]: "hi"
|
392 |
+
[你]: "诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?"
|
393 |
+
[用户]: "你是机器人吗?"
|
394 |
+
[你]: "哎,我是LingoAce的约课老师呢,今天联系您也是想邀请您孩子免费体验我们的中文试听课,您看要不要让孩子来体验一下呢?"
|
395 |
+
|
396 |
+
<限制>
|
397 |
+
- 保持友好、专业和机智。
|
398 |
+
- 保持回答简短明了,使用非正式的口头表达,比如“嗯...”,“那么...”,和“我的意思是”。
|
399 |
+
- 这是一次电话通话,所以保持你的回答简短,就像在真实的对话中一样。不要说得太长。
|
400 |
+
- 对于你不知道的问题,以”这个您可以和我们之后详细沟通”等话术来回答,不要做擅自做出想象和承诺。
|
401 |
+
- 不要输出 “好的”,可以使用其他同义词
|
402 |
+
- 不要反复说“他/她”,只需要表达一次。
|
403 |
+
|
404 |
+
<任务>
|
405 |
+
以上是对约课老师的工作描述。
|
406 |
+
当前我们已经获得了约课老师与家长的对话记录,约课老师实际的回复。
|
407 |
+
现在需要你为约课老师的答复进行评分。
|
408 |
+
|
409 |
+
评分要点:
|
410 |
+
(1)任务相关:句子是否与对话任务相关。2分。
|
411 |
+
(2)回复相关:句子是否与家长的回复相关。2分。
|
412 |
+
(3)邀约力度:与参考句子进行对比,句子是否具有足够的推销,邀约力度。1分。
|
413 |
+
|
414 |
+
<举例>
|
415 |
+
## 当前对话:
|
416 |
+
client: 你好
|
417 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
418 |
+
client:
|
419 |
+
Assistant: 我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
420 |
+
client: 不记得。
|
421 |
+
Assistant: 没关系的,您可以放心哈,因为您之前留过电话,所以我们才能联系到您。而且这次特地帮您争取了免费名额,您看要参加一下吗?
|
422 |
+
client: 您是没听清您是谁。
|
423 |
+
client: 什么?我没听清你前面说的。
|
424 |
+
Assistant:
|
425 |
+
|
426 |
+
## 约课老师实际的句子:
|
427 |
+
您好,现在您能听清楚吗?我是LingoAce的约课老师静静。我们这次联合资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?
|
428 |
+
|
429 |
+
## 评分:
|
430 |
+
{"任务相关": 2, "回复相关": 2, "邀约力度": 1}
|
431 |
+
|
432 |
+
<请处理下面这个任务>
|
433 |
+
|
434 |
+
## 当前对话:
|
435 |
+
client: 你好
|
436 |
+
Assistant: 诶您好,我是LingoAce在线学中文的,先前一段时间咱们孩子有在我们这里学习,您还记得吧?
|
437 |
+
client:
|
438 |
+
client: 啊,我们不需要的,谢谢啊。
|
439 |
+
Assistant: 我们这次联合
|
440 |
+
client: 关了。
|
441 |
+
Assistant:
|
442 |
+
|
443 |
+
## 约课老师实际的句子:
|
444 |
+
资深中文教育专家开设了专门适合您孩子的课程。现在有免费试听的机会,我帮您给孩子申请下吧?因为是我们新开发的课程,这次机会也比较难得嘛,所以想跟您确认下,毕竟对孩子学习都是有帮助的嘛~您看怎么样呢?
|
445 |
+
|
446 |
+
## 评分:
|
447 |
+
""".strip()
|
448 |
+
|
449 |
+
|
450 |
+
def main():
    """Send the module-level grading prompt once and print the model's reply."""
    args = get_args()

    # Client credentials (endpoint, key, api version) live as a JSON blob
    # in the environment store under the service name.
    azure_kwargs = environment.get(args.service, dtype=json.loads)
    client = AzureOpenAI(**azure_kwargs)

    completion = client.chat.completions.create(
        model=args.model_name,
        messages=[{"role": "user", "content": prompt}],
        stream=False,
        top_p=0.95,
        temperature=0.6,
    )

    reply = completion.choices[0].message.content
    print(reply)
    return
|
479 |
+
|
480 |
+
|
481 |
+
if __name__ == "__main__":
|
482 |
+
main()
|
llm_eval_script/aws_claude.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/api-inference-examples-claude-messages-code-examples.html
|
5 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages-request-response.html
|
6 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
|
7 |
+
|
8 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-text-completion.html
|
9 |
+
|
10 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/inference-invoke.html
|
11 |
+
|
12 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference-examples.html
|
13 |
+
|
14 |
+
"""
|
15 |
+
import argparse
|
16 |
+
from datetime import datetime
|
17 |
+
import json
|
18 |
+
import os
|
19 |
+
from pathlib import Path
|
20 |
+
import sys
|
21 |
+
import time
|
22 |
+
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no install needed
|
23 |
+
|
24 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
25 |
+
sys.path.append(os.path.join(pwd, "../"))
|
26 |
+
|
27 |
+
import boto3
|
28 |
+
|
29 |
+
from project_settings import environment, project_path
|
30 |
+
|
31 |
+
|
32 |
+
def get_args():
    """
    Parse command-line options for the AWS Bedrock Claude choice-eval run.

    Examples:
        python3 aws_claude.py --model_name anthropic.claude-instant-v1 \
            --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250723-interval-10 \
            --interval 10

        python3 aws_claude.py --model_name anthropic.claude-v2 \
            --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250723-interval-10 \
            --interval 10
    """
    parser = argparse.ArgumentParser()
    # Bedrock model id; alternates kept commented for quick switching.
    parser.add_argument(
        "--model_name",
        # default="anthropic.claude-v2",
        default="anthropic.claude-instant-v1",

        # default="anthropic.claude-v2:1",
        # default="anthropic.claude-instant-v1:2",
        # default="anthropic.claude-v2:0",
        type=str
    )
    # JSONL dataset file name, looked up under --eval_dataset_dir.
    parser.add_argument(
        "--eval_dataset_name",
        default="agent-lingoace-zh-400-choice.jsonl",
        # default="arc-easy-1000-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    # Root directory under which per-run result JSONL files are written.
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    # Label of the machine issuing the run; only used in the output path.
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    # Key in the environment store holding AWS credentials as JSON.
    parser.add_argument(
        "--service",
        default="aws_us_east",
        type=str
    )
    # "null" means: stamp a fresh run with the current Asia/Shanghai time;
    # any other value resumes the run directory of that name.
    parser.add_argument(
        "--create_time_str",
        default="null",
        type=str
    )
    # Seconds slept before every request (crude rate limiting).
    parser.add_argument(
        "--interval",
        default=10,
        type=int
    )
    args = parser.parse_args()
    return args
|
96 |
+
|
97 |
+
|
98 |
+
def main():
    """Run the multiple-choice eval against a Bedrock Claude model.

    Reads the dataset JSONL row by row, sends each prompt to the model,
    scores an exact string match against the reference response, and
    appends one result row per example to the run's output JSONL.
    Already-written idx values are skipped, so a run can be resumed by
    reusing the same --create_time_str.
    """
    args = get_args()

    # AWS credentials are stored as a JSON blob in the environment store.
    service = environment.get(key=args.service, dtype=json.loads)
    aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
    aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
    aws_default_region = service["AWS_DEFAULT_REGION"]

    # boto3 picks the credentials up from the process environment.
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
    os.environ["AWS_DEFAULT_REGION"] = aws_default_region

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=aws_default_region
    )

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means: start a fresh run stamped with the current
    # Asia/Shanghai time; any other value resumes that run directory.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250722_173400"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"aws_claude/anthropic/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    total = 0
    total_correct = 0

    # finished: reload running totals and processed idx values from a
    # previous (partial) run so it can be resumed without double counting.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # Anthropic "messages" request body for Bedrock invoke_model.
            body = {
                "anthropic_version": "bedrock-2023-05-31",
                "messages": [
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": prompt}]
                    }
                ],
                "max_tokens": 1000,
                "temperature": 0.5,
                "top_p": 0.95,
                # "thinking": {
                #     "type": "enabled",
                #     "budget_tokens": 1
                # },
            }

            try:
                # client.converse()
                time.sleep(args.interval)  # crude request-rate limiting
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.invoke_model(
                    modelId=args.model_name,
                    body=json.dumps(body),
                    contentType="application/json"
                )

                llm_response = json.loads(llm_response["body"].read())
                # print(result['content'][0]['text'])
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")

            except Exception as e:
                # Best-effort: log and move on; the row is not written, so a
                # freshly resumed run will retry it.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response["content"][0]["text"]

            # Choice eval: exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
|
219 |
+
|
220 |
+
|
221 |
+
if __name__ == "__main__":
|
222 |
+
main()
|
llm_eval_script/aws_claude_chat.py
ADDED
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/api-inference-examples-claude-messages-code-examples.html
|
5 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-messages-request-response.html
|
6 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
|
7 |
+
|
8 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-anthropic-claude-text-completion.html
|
9 |
+
|
10 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/inference-invoke.html
|
11 |
+
|
12 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/conversation-inference-examples.html
|
13 |
+
|
14 |
+
"""
|
15 |
+
import argparse
|
16 |
+
from datetime import datetime
|
17 |
+
import json
|
18 |
+
import os
|
19 |
+
from pathlib import Path
|
20 |
+
import sys
|
21 |
+
import time
|
22 |
+
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no install needed
|
23 |
+
|
24 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
25 |
+
sys.path.append(os.path.join(pwd, "../"))
|
26 |
+
|
27 |
+
import boto3
|
28 |
+
|
29 |
+
from project_settings import environment, project_path
|
30 |
+
|
31 |
+
|
32 |
+
def get_args():
    """
    Parse command-line options for the AWS Bedrock Claude chat-eval run.

    Examples:
        python3 aws_claude_chat.py --model_name anthropic.claude-instant-v1 \
            --eval_dataset_name agent-lingoace-zh-80-chat.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250724-interval-1 \
            --interval 1

        python3 aws_claude_chat.py --model_name anthropic.claude-v2 \
            --eval_dataset_name agent-lingoace-zh-80-chat.jsonl \
            --client "us_west(47.88.76.239)" \
            --create_time_str 20250724-interval-1 \
            --interval 1
    """
    parser = argparse.ArgumentParser()
    # Flag name -> add_argument keyword arguments.
    option_specs = [
        # Bedrock model id (e.g. anthropic.claude-v2, anthropic.claude-v2:1).
        ("--model_name", dict(default="anthropic.claude-instant-v1", type=str)),
        ("--eval_dataset_name", dict(default="agent-lingoace-zh-80-chat.jsonl", type=str)),
        ("--eval_dataset_dir", dict(default=(project_path / "data/dataset").as_posix(), type=str)),
        ("--eval_data_dir", dict(default=(project_path / "data/eval_data").as_posix(), type=str)),
        # Label of the issuing machine; only used in the output path.
        ("--client", dict(default="shenzhen_sase", type=str)),
        # Key in the environment store holding AWS credentials as JSON.
        ("--service", dict(default="aws_us_east", type=str)),
        # "null" -> fresh timestamped run; anything else resumes that run.
        ("--create_time_str", dict(default="null", type=str)),
        # Seconds slept before every request.
        ("--interval", dict(default=1, type=int)),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
|
95 |
+
|
96 |
+
|
97 |
+
def main():
    """Collect raw chat completions from a Bedrock Claude model.

    Reads the chat dataset JSONL row by row, sends each prompt to the
    model, and appends the raw prediction per example to the run's
    ".raw" output JSONL (scoring happens in a separate step).
    Already-written idx values are skipped, so a run can be resumed by
    reusing the same --create_time_str.
    """
    args = get_args()

    # AWS credentials are stored as a JSON blob in the environment store.
    service = environment.get(key=args.service, dtype=json.loads)
    aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
    aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
    aws_default_region = service["AWS_DEFAULT_REGION"]

    # boto3 picks the credentials up from the process environment.
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
    os.environ["AWS_DEFAULT_REGION"] = aws_default_region

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=aws_default_region
    )

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means: start a fresh run stamped with the current
    # Asia/Shanghai time; any other value resumes that run directory.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250722_173400"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    output_file = eval_data_dir / f"aws_claude/anthropic/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    total = 0

    # finished: reload the processed idx values from a previous (partial)
    # run so it can be resumed without duplicating rows.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # Anthropic "messages" request body for Bedrock invoke_model.
            # "client" is a stop sequence: generation halts when the model
            # starts producing the next client turn of the dialogue.
            body = {
                "anthropic_version": "bedrock-2023-05-31",
                "messages": [
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": prompt}]
                    }
                ],
                "max_tokens": 1000,
                "temperature": 0.5,
                "top_p": 0.95,
                "stop_sequences": ["client"],
                # "thinking": {
                #     "type": "enabled",
                #     "budget_tokens": 1
                # },
            }

            try:
                # client.converse()
                time.sleep(args.interval)  # crude request-rate limiting
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.invoke_model(
                    modelId=args.model_name,
                    body=json.dumps(body),
                    contentType="application/json"
                )

                llm_response = json.loads(llm_response["body"].read())
                # print(result['content'][0]['text'])
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")

            except Exception as e:
                # Best-effort: log and move on; the row is not written, so a
                # freshly resumed run will retry it.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response["content"][0]["text"]

            total += 1

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return
|
210 |
+
|
211 |
+
|
212 |
+
if __name__ == "__main__":
|
213 |
+
main()
|
llm_eval_script/azure_openai.py
ADDED
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
from datetime import datetime
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
import sys
|
9 |
+
import time
|
10 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
11 |
+
|
12 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
13 |
+
sys.path.append(os.path.join(pwd, "../"))
|
14 |
+
|
15 |
+
import openai
|
16 |
+
from openai import AzureOpenAI
|
17 |
+
|
18 |
+
from project_settings import environment, project_path
|
19 |
+
|
20 |
+
|
21 |
+
def get_args():
    """
    Parse the command line for the Azure OpenAI multiple-choice evaluation run.

    Usage examples:

    python3 azure_openai.py --model_name gpt-4o-mini \
        --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
        --client "us_west(47.88.76.239)" \
        --create_time_str 20250723_095001 \
        --interval 10

    python3 azure_openai.py --model_name gpt-4o-mini \
        --eval_dataset_name arc-easy-1000-choice.jsonl \
        --client "us_west(47.88.76.239)" \
        --create_time_str 20250723_111000 \
        --interval 10
    """
    parser = argparse.ArgumentParser()
    # (flag, default, type) triples; declaration order is preserved so that
    # --help output matches the original script.
    arg_specs = [
        ("--model_name", "gpt-4o", str),
        ("--eval_dataset_name", "agent-lingoace-zh-400-choice.jsonl", str),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix(), str),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix(), str),
        ("--client", "shenzhen_sase", str),
        ("--service", "west_us_chatgpt_openai_azure_com", str),
        ("--create_time_str", "null", str),
        ("--interval", 10, int),
    ]
    for flag, default_value, value_type in arg_specs:
        parser.add_argument(flag, default=default_value, type=value_type)
    return parser.parse_args()
|
81 |
+
|
82 |
+
|
83 |
+
def main():
    """
    Run a resumable multiple-choice evaluation against an Azure OpenAI deployment.

    Reads one JSON object per line from the eval dataset (keys: idx, prompt,
    response), queries the model for a single-token answer, scores exact-match
    accuracy, and appends one JSON result line per sample to the output file.
    Previously written idx values are skipped, so the run can be restarted.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" sentinel means: start a fresh run stamped with the current
    # Asia/Shanghai time; any other value resumes that earlier run's output file.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # Output path encodes provider/model/client/service/timestamp so runs never collide.
    output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Service credentials (api_key / api_version / azure_endpoint) come from the
    # environment store as a JSON object and are splatted into the client.
    service_params = environment.get(args.service, dtype=json.loads)
    client = AzureOpenAI(
        **service_params,
        # NOTE(security): a real API key was previously committed in a comment
        # here; it has been redacted and must be rotated.
        # api_version="2025-01-01-preview",
        # azure_endpoint="https://west-us-chatgpt.openai.azure.com"
    )

    total = 0
    total_correct = 0

    # Resume support: replay the output file to collect finished idx values and
    # the last running totals (lines are appended in order, so the last line wins).
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Fixed pause between requests to stay under rate limits.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    stream=False,
                    # Single-token completion; logit_bias forces the output into
                    # token ids 32..39 — presumably the choice-letter tokens of
                    # this tokenizer (TODO confirm against the model's vocab).
                    max_tokens=1,
                    top_p=0.95,
                    temperature=0.6,
                    logit_bias={
                        32: 100,
                        33: 100,
                        34: 100,
                        35: 100,
                        36: 100,
                        37: 100,
                        38: 100,
                        39: 100,
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except openai.BadRequestError as e:
                # Bad requests (e.g. content filter) are skipped; other API
                # errors intentionally propagate and abort the run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response.choices[0].message.content

            # Exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            # One JSONL record per sample; carries running totals for resume.
            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return


if __name__ == "__main__":
    main()
|
llm_eval_script/azure_openai_chat.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
from datetime import datetime
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
import sys
|
9 |
+
import time
|
10 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
11 |
+
|
12 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
13 |
+
sys.path.append(os.path.join(pwd, "../"))
|
14 |
+
|
15 |
+
import openai
|
16 |
+
from openai import AzureOpenAI
|
17 |
+
|
18 |
+
from project_settings import environment, project_path
|
19 |
+
|
20 |
+
|
21 |
+
def get_args():
    """
    Parse the command line for the Azure OpenAI chat (free-form) evaluation run.

    Usage examples:

    python3 azure_openai.py --model_name gpt-4o-mini \
        --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
        --client "us_west(47.88.76.239)" \
        --create_time_str 20250723_095001 \
        --interval 10

    python3 azure_openai.py --model_name gpt-4o-mini \
        --eval_dataset_name arc-easy-1000-choice.jsonl \
        --client "us_west(47.88.76.239)" \
        --create_time_str 20250723_111000 \
        --interval 10
    """
    parser = argparse.ArgumentParser()
    # (flag, default, type) triples; declaration order is preserved so that
    # --help output matches the original script.
    arg_specs = [
        ("--model_name", "gpt-4o", str),
        ("--eval_dataset_name", "agent-lingoace-zh-80-chat.jsonl", str),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix(), str),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix(), str),
        ("--client", "shenzhen_sase", str),
        ("--service", "west_us_chatgpt_openai_azure_com", str),
        ("--create_time_str", "null", str),
        ("--interval", 0, int),
    ]
    for flag, default_value, value_type in arg_specs:
        parser.add_argument(flag, default=default_value, type=value_type)
    return parser.parse_args()
|
80 |
+
|
81 |
+
|
82 |
+
def main():
    """
    Run a resumable free-form chat evaluation against an Azure OpenAI deployment.

    Unlike the choice variant, no scoring happens here: the raw model reply is
    stored next to the reference response in a ".raw" JSONL file for later
    metric computation. Previously written idx values are skipped on restart.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" sentinel means: start a fresh run stamped with the current
    # Asia/Shanghai time; any other value resumes that earlier run's output file.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # ".raw" marks an unscored transcript file (predictions only, no metrics).
    output_file = eval_data_dir / f"azure_openai/azure/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Service credentials (api_key / api_version / azure_endpoint) come from the
    # environment store as a JSON object and are splatted into the client.
    service_params = environment.get(args.service, dtype=json.loads)
    client = AzureOpenAI(
        **service_params,
        # NOTE(security): a real API key was previously committed in a comment
        # here; it has been redacted and must be rotated.
        # api_version="2025-01-01-preview",
        # azure_endpoint="https://west-us-chatgpt.openai.azure.com"
    )

    total = 0

    # Resume support: replay the output file to collect finished idx values and
    # the last running total.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Optional pause between requests (default interval is 0).
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    stream=False,
                    # Free-form generation: no max_tokens cap, no logit_bias
                    # (the commented settings below belong to the choice variant).
                    top_p=0.95,
                    temperature=0.6,
                    # logit_bias={
                    #     32: 100,
                    #     33: 100,
                    #     34: 100,
                    #     35: 100,
                    #     36: 100,
                    #     37: 100,
                    # }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except openai.BadRequestError as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            except openai.InternalServerError as e:
                # Transient server-side failures skip the sample; other API
                # errors intentionally propagate and abort the run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = llm_response.choices[0].message.content

            total += 1

            # One JSONL record per sample; "total" supports resume bookkeeping.
            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return


if __name__ == "__main__":
    main()
|
llm_eval_script/byteplus.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
https://docs.byteplus.com/en/docs/ModelArk/1099455
|
5 |
+
|
6 |
+
model list
|
7 |
+
https://docs.byteplus.com/en/docs/ModelArk/1330310
|
8 |
+
"""
|
9 |
+
import argparse
|
10 |
+
from datetime import datetime
|
11 |
+
import json
|
12 |
+
import os
|
13 |
+
from pathlib import Path
|
14 |
+
import sys
|
15 |
+
import time
|
16 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
17 |
+
|
18 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
19 |
+
sys.path.append(os.path.join(pwd, "../"))
|
20 |
+
|
21 |
+
from openai import OpenAI
|
22 |
+
|
23 |
+
from project_settings import environment, project_path
|
24 |
+
|
25 |
+
|
26 |
+
def get_args():
    """Parse the command line for the BytePlus ModelArk evaluation run."""
    parser = argparse.ArgumentParser()
    # (flag, default) pairs; every option is a string. Declaration order is
    # preserved so --help output matches the original script.
    arg_specs = [
        ("--model_name", "seedance-1-0-lite-t2v-250428"),
        ("--eval_dataset_name", "arc-easy-1000-choice.jsonl"),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix()),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix()),
        ("--client", "shenzhen_sase"),
        ("--service", "byteplus_api_key"),
    ]
    for flag, default_value in arg_specs:
        parser.add_argument(flag, default=default_value, type=str)
    return parser.parse_args()
|
60 |
+
|
61 |
+
|
62 |
+
def main():
    """
    Run a resumable multiple-choice evaluation against the BytePlus ModelArk
    OpenAI-compatible endpoint.

    Reads one JSON object per line (keys: idx, prompt, response), scores
    exact-match accuracy, and appends one JSON result line per sample.
    Previously written idx values are skipped, so the run can be restarted.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # Each invocation gets a fresh Asia/Shanghai timestamp, i.e. this script
    # always starts a new output directory (no --create_time_str resume flag).
    tz = ZoneInfo("Asia/Shanghai")
    now = datetime.now(tz)
    create_time_str = now.strftime("%Y%m%d_%H%M%S")

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # Output path encodes provider/model/client/service/timestamp so runs never collide.
    output_file = eval_data_dir / f"byteplus/byteplus/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # The Ark API key is stored as a plain string in the environment store.
    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://ark.ap-southeast.bytepluses.com/api/v3",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0
    total_correct = 0

    # Resume support: replay the output file to collect finished idx values and
    # the last running totals. (With a fresh timestamp above, this file is
    # normally empty — the replay only matters if the path already exists.)
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                time_begin = time.time()
                completion = client.chat.completions.create(
                    # Replace with your Inference Endpoint.
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ]
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: any request failure skips this sample so the
                # sweep keeps going.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = completion.choices[0].message.content

            # Exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            # One JSONL record per sample; carries running totals for resume.
            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return


if __name__ == "__main__":
    main()
|
llm_eval_script/gemini_google.py
ADDED
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
from datetime import datetime
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
import sys
|
9 |
+
import time
|
10 |
+
import tempfile
|
11 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
12 |
+
|
13 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
14 |
+
sys.path.append(os.path.join(pwd, "../"))
|
15 |
+
|
16 |
+
from google import genai
|
17 |
+
from google.genai import types
|
18 |
+
|
19 |
+
from project_settings import environment, project_path
|
20 |
+
|
21 |
+
|
22 |
+
def get_args():
    """Parse the command line for the Gemini (Vertex AI) choice evaluation run."""
    parser = argparse.ArgumentParser()
    # (flag, default) pairs; every option is a string. Declaration order is
    # preserved so --help output matches the original script.
    # Alternative models tried previously: gemini-2.5-flash,
    # gemini-2.5-flash-lite-preview-06-17. Note: gemini-2.5-pro does not
    # support setting thinking_budget to 0.
    arg_specs = [
        ("--model_name", "gemini-2.5-pro"),
        ("--eval_dataset_name", "agent-lingoace-zh-400-choice.jsonl"),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix()),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix()),
        ("--client", "shenzhen_sase"),
        ("--service", "google_potent_veld_462405_t3"),
    ]
    for flag, default_value in arg_specs:
        parser.add_argument(flag, default=default_value, type=str)
    return parser.parse_args()
|
59 |
+
|
60 |
+
|
61 |
+
def main():
    """
    Run a resumable multiple-choice evaluation against Gemini via Vertex AI.

    Writes the service-account JSON to a temp file and points
    GOOGLE_APPLICATION_CREDENTIALS at it, then queries the model for a
    single-token answer per sample and scores exact-match accuracy.
    Previously written idx values are skipped, so the run can be restarted.
    """
    args = get_args()

    # The environment store holds the full service-account JSON for this service.
    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    # google-genai/Vertex auth requires a credentials *file*; materialize the
    # stored JSON into the system temp dir and export its path.
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)

    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # Each invocation gets a fresh Asia/Shanghai timestamp (no resume flag here).
    tz = ZoneInfo("Asia/Shanghai")
    now = datetime.now(tz)
    create_time_str = now.strftime("%Y%m%d_%H%M%S")

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # Output path encodes provider/model/client/service/timestamp so runs never collide.
    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="global",
    )
    # Single-token, non-thinking generation: max_output_tokens=1 forces a bare
    # choice letter; thinking_budget=0 disables reasoning tokens (unsupported
    # by gemini-2.5-pro — see get_args).
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        max_output_tokens=1,
        response_modalities=["TEXT"],
        thinking_config=types.ThinkingConfig(
            thinking_budget=0
        )
    )

    total = 0
    total_correct = 0

    # Resume support: replay the output file to collect finished idx values and
    # the last running totals.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            contents = [
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=prompt)
                    ]
                )
            ]
            # NOTE(review): this call is not wrapped in try/except, unlike the
            # sibling provider scripts — any API error aborts the whole run.
            time_begin = time.time()
            llm_response: types.GenerateContentResponse = client.models.generate_content(
                model=args.model_name,
                contents=contents,
                config=generate_content_config,
            )
            time_cost = time.time() - time_begin
            print(f"time_cost: {time_cost}")
            try:
                prediction = llm_response.candidates[0].content.parts[0].text
            except TypeError as e:
                # A blocked/empty candidate leaves None in the chain above,
                # which raises TypeError on subscripting; skip the sample.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            # Exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            # One JSONL record per sample; carries running totals for resume.
            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return


if __name__ == "__main__":
    main()
|
llm_eval_script/gemini_google_chat.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
from datetime import datetime
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
from pathlib import Path
|
8 |
+
import sys
|
9 |
+
import time
|
10 |
+
import tempfile
|
11 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
12 |
+
|
13 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
14 |
+
sys.path.append(os.path.join(pwd, "../"))
|
15 |
+
|
16 |
+
from google import genai
|
17 |
+
from google.genai import types
|
18 |
+
|
19 |
+
from project_settings import environment, project_path
|
20 |
+
|
21 |
+
|
22 |
+
def get_args():
    """Parse the command line for the Gemini (Vertex AI) chat evaluation run."""
    parser = argparse.ArgumentParser()
    # (flag, default) pairs; every option is a string. Declaration order is
    # preserved so --help output matches the original script.
    # Alternative models tried previously: gemini-2.5-pro (does not support
    # thinking_budget=0), gemini-2.5-flash-lite-preview-06-17.
    arg_specs = [
        ("--model_name", "gemini-2.5-flash"),
        ("--eval_dataset_name", "agent-lingoace-zh-80-chat.jsonl"),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix()),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix()),
        ("--client", "shenzhen_sase"),
        ("--service", "google_potent_veld_462405_t3"),
    ]
    for flag, default_value in arg_specs:
        parser.add_argument(flag, default=default_value, type=str)
    return parser.parse_args()
|
58 |
+
|
59 |
+
|
60 |
+
def main():
    """
    Run a resumable free-form chat evaluation against Gemini via Vertex AI.

    No scoring happens here: the raw model reply is stored next to the
    reference response in a ".raw" JSONL file for later metric computation.
    Previously written idx values are skipped on restart.
    """
    args = get_args()

    # The environment store holds the full service-account JSON for this service.
    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    # google-genai/Vertex auth requires a credentials *file*; materialize the
    # stored JSON into the system temp dir and export its path.
    google_application_credentials = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    google_application_credentials.parent.mkdir(parents=True, exist_ok=True)

    with open(google_application_credentials.as_posix(), "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # Each invocation gets a fresh Asia/Shanghai timestamp (no resume flag here).
    tz = ZoneInfo("Asia/Shanghai")
    now = datetime.now(tz)
    create_time_str = now.strftime("%Y%m%d_%H%M%S")

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # ".raw" marks an unscored transcript file (predictions only, no metrics).
    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="global",
    )
    # Free-form generation (no max_output_tokens cap); thinking_budget=0
    # disables reasoning tokens.
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        # max_output_tokens=1,
        response_modalities=["TEXT"],
        thinking_config=types.ThinkingConfig(
            thinking_budget=0
        )
    )

    total = 0

    # Resume support: replay the output file to collect finished idx values and
    # the last running total.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            contents = [
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=prompt)
                    ]
                )
            ]
            # NOTE(review): this call is not wrapped in try/except, unlike the
            # sibling provider scripts — any API error aborts the whole run.
            time_begin = time.time()
            llm_response: types.GenerateContentResponse = client.models.generate_content(
                model=args.model_name,
                contents=contents,
                config=generate_content_config,
            )
            time_cost = time.time() - time_begin
            print(f"time_cost: {time_cost}")
            try:
                prediction = llm_response.candidates[0].content.parts[0].text
            except TypeError as e:
                # A blocked/empty candidate leaves None in the chain above,
                # which raises TypeError on subscripting; skip the sample.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            total += 1

            # One JSONL record per sample; "total" supports resume bookkeeping.
            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")

    return


if __name__ == "__main__":
    main()
|
llm_eval_script/ollama.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import json
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
import sys
|
8 |
+
|
9 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
10 |
+
sys.path.append(os.path.join(pwd, "../"))
|
11 |
+
|
12 |
+
import openai
|
13 |
+
from openai import OpenAI
|
14 |
+
import pandas as pd
|
15 |
+
|
16 |
+
from project_settings import environment, project_path
|
17 |
+
|
18 |
+
|
19 |
+
def get_args():
    """Parse command-line arguments for the Ollama batch-eval script."""
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--dataset_dir",
        default=(project_path / "data/llm-log/eval-llm-dataset/choice").as_posix(),
        type=str
    )

    parser.add_argument("--ollama_host", default="10.75.27.247", type=str)
    parser.add_argument("--ollama_port", default=11434, type=int)

    # Other model ids previously tried:
    #   qwen2:7b-instruct-fp16, llama3:8b-instruct-fp16, gemma3:4b
    parser.add_argument("--model_id", default="gemma3:1b", type=str)

    parser.add_argument("--api_key", default="ollama", type=str)

    return parser.parse_args()
|
41 |
+
|
42 |
+
|
43 |
+
def main():
    """Evaluate an Ollama-served model on a directory of choice-style samples.

    Each sample directory under ``--dataset_dir`` must contain
    ``system_prompt.txt``, ``user_prompt.txt`` and ``response.txt``
    (the reference answer).  At most 1000 samples are processed; the
    prompts, reference and model prediction are written to
    ``result.xlsx`` in the current working directory.
    """
    args = get_args()

    dataset_dir = Path(args.dataset_dir)

    # Ollama exposes an OpenAI-compatible API under /v1.
    base_url = f"http://{args.ollama_host}:{args.ollama_port}/v1"

    client = OpenAI(
        api_key=args.api_key,
        base_url=base_url,
    )

    result = list()
    count = 0
    for sample_dir in dataset_dir.glob("*"):
        if count >= 1000:
            break
        if not sample_dir.is_dir():
            # glob("*") can also yield plain files; only sample directories
            # are expected, so skip anything else instead of crashing below.
            continue
        print(f"count: {count}, process: {sample_dir.as_posix()}")
        count += 1
        system_prompt_file = sample_dir / "system_prompt.txt"
        user_prompt_file = sample_dir / "user_prompt.txt"
        response_file = sample_dir / "response.txt"

        with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
            system_prompt = f.read()
        with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
            user_prompt = f.read()
        with open(response_file.as_posix(), "r", encoding="utf-8") as f:
            response = f.read()

        completion = client.chat.completions.create(
            model=args.model_id,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.8,
            top_p=0.8
        )
        # Read the prediction directly from the response object instead of the
        # original dump-to-json-and-reparse round-trip, which was equivalent
        # but wasteful.
        predict = completion.choices[0].message.content

        row = {
            "system_prompt": system_prompt,
            "user_prompt": user_prompt,
            "response": response,
            "predict": predict.strip(),
        }
        result.append(row)

    df = pd.DataFrame(result)
    df.to_excel("result.xlsx", index=False)
    return


if __name__ == "__main__":
    main()
|
llm_eval_script/siliconflow.py
ADDED
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
|
5 |
+
|
6 |
+
https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning
|
7 |
+
|
8 |
+
Model Name:
|
9 |
+
Pro/deepseek-ai/DeepSeek-R1
|
10 |
+
Tips:
|
11 |
+
(1)thinking_budget: Must be greater than or equal to 1
|
12 |
+
(2)The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again.
|
13 |
+
|
14 |
+
Model Name:
|
15 |
+
tencent/Hunyuan-A13B-Instruct
|
16 |
+
Tips:
|
17 |
+
(1)它在回答时总是会先思考,最后给出答案.这适合知识问答,但不符合我们Agent的需求. 后来强制其只能输出 A-E 中的一个字符(max_tokens=4),以完成评估.
|
18 |
+
max_tokens=4,
|
19 |
+
logit_bias={
|
20 |
+
32: 100,
|
21 |
+
33: 100,
|
22 |
+
34: 100,
|
23 |
+
35: 100,
|
24 |
+
36: 100,
|
25 |
+
37: 100,
|
26 |
+
},
|
27 |
+
|
28 |
+
Model Name:
|
29 |
+
deepseek-ai/DeepSeek-R1
|
30 |
+
Tips:
|
31 |
+
(1)为了让它只输出一个字符,设置 max_tokens=3
|
32 |
+
|
33 |
+
|
34 |
+
Model Name:
|
35 |
+
Qwen/Qwen3-8B
|
36 |
+
deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
|
37 |
+
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
38 |
+
Tips:
|
39 |
+
(1)为了让它只输出一个字符,设置 max_tokens=1
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
Model Name:
|
44 |
+
baidu/ERNIE-4.5-300B-A47B
|
45 |
+
Tips:
|
46 |
+
(1)它可能使用的是bpe 分词, logit_bias 注释掉。
|
47 |
+
|
48 |
+
"""
|
49 |
+
import argparse
|
50 |
+
from datetime import datetime
|
51 |
+
import json
|
52 |
+
import os
|
53 |
+
from pathlib import Path
|
54 |
+
import sys
|
55 |
+
import time
|
56 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
57 |
+
|
58 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
59 |
+
sys.path.append(os.path.join(pwd, "../"))
|
60 |
+
|
61 |
+
from openai import OpenAI
|
62 |
+
|
63 |
+
from project_settings import environment, project_path
|
64 |
+
|
65 |
+
|
66 |
+
def get_args():
    """Parse command-line arguments for the SiliconFlow choice-eval run."""
    parser = argparse.ArgumentParser()

    # Model ids kept around for quick switching:
    #   Pro/deepseek-ai/DeepSeek-R1, tencent/Hunyuan-A13B-Instruct,
    #   deepseek-ai/DeepSeek-R1, deepseek-ai/DeepSeek-R1-0528-Qwen3-8B,
    #   deepseek-ai/DeepSeek-R1-Distill-Qwen-7B, baidu/ERNIE-4.5-300B-A47B
    parser.add_argument("--model_name", default="Qwen/Qwen3-8B", type=str)

    # Alternative dataset: arc-easy-1000-choice.jsonl
    parser.add_argument("--eval_dataset_name", default="agent-lingoace-zh-400-choice.jsonl", type=str)
    parser.add_argument("--eval_dataset_dir", default=(project_path / "data/dataset").as_posix(), type=str)
    parser.add_argument("--eval_data_dir", default=(project_path / "data/eval_data").as_posix(), type=str)
    parser.add_argument("--client", default="shenzhen_sase", type=str)
    parser.add_argument("--service", default="siliconflow_api_key", type=str)
    parser.add_argument("--create_time_str", default="null", type=str)
    parser.add_argument("--interval", default=10, type=int)

    return parser.parse_args()
|
117 |
+
|
118 |
+
|
119 |
+
def main():
    """Run a resumable multiple-choice evaluation against the SiliconFlow API.

    Reads one JSON object per line from the eval dataset ("idx", "prompt",
    "response"), asks the configured model for a single-token answer, scores
    it by exact string match against "response", and appends one JSON result
    line per sample to the output file.  Already-answered "idx" values found
    in an existing output file are skipped, so an interrupted run can be
    resumed with the same --create_time_str.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means "start a fresh run"; any other value resumes the run with
    # that timestamp (output path depends on it).
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # "/" in model names would create extra path components; replace with "#".
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0
    total_correct = 0

    # finished
    # Resume support: re-read any existing output file to recover which idx
    # values are done and the latest running totals (rows are appended in
    # order, so the last row's counters win).
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            # NOTE(review): "response" is presumably the gold single-letter
            # choice (A-H) — confirm against the dataset builder.
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Throttle requests to stay under the provider's rate limit.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ],
                    stream=False,
                    # max_tokens=4096,
                    # Force a single output token so the model answers with
                    # just the choice letter instead of reasoning first.
                    max_tokens=1,
                    temperature=0.6,
                    top_p=0.95,
                    # NOTE(review): token ids 32-39 are presumably the
                    # single-letter choice tokens for this tokenizer — the
                    # bias pushes the model to emit one of them; confirm per
                    # model (BPE tokenizers may not map this way).
                    logit_bias={
                        32: 100,
                        33: 100,
                        34: 100,
                        35: 100,
                        36: 100,
                        37: 100,
                        38: 100,
                        39: 100,
                    },
                    extra_body={
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: log and skip this sample; it can be retried on
                # a resumed run only if the process is restarted (idx was
                # already added to finished_idx_set above).
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # print(f"completion: {completion}")
            prediction = completion.choices[0].message.content

            # Exact string match scoring (single-letter answers).
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per sample so a crash loses at most the in-flight row.
            fout.flush()

    return


if __name__ == "__main__":
    main()
|
llm_eval_script/siliconflow_chat.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
|
5 |
+
|
6 |
+
https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning
|
7 |
+
|
8 |
+
Model Name:
|
9 |
+
Pro/deepseek-ai/DeepSeek-R1
|
10 |
+
Tips:
|
11 |
+
(1)thinking_budget: Must be greater than or equal to 1
|
12 |
+
(2)The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again.
|
13 |
+
|
14 |
+
Model Name:
|
15 |
+
tencent/Hunyuan-A13B-Instruct
|
16 |
+
Tips:
|
17 |
+
(1)它在回答时总是会先思考,最后给出答案.这适合知识问答,但不符合我们Agent的需求. 后来强制其只能输出 A-E 中的一个字符(max_tokens=4),以完成评估.
|
18 |
+
max_tokens=4,
|
19 |
+
logit_bias={
|
20 |
+
32: 100,
|
21 |
+
33: 100,
|
22 |
+
34: 100,
|
23 |
+
35: 100,
|
24 |
+
36: 100,
|
25 |
+
37: 100,
|
26 |
+
},
|
27 |
+
|
28 |
+
Model Name:
|
29 |
+
deepseek-ai/DeepSeek-R1
|
30 |
+
Tips:
|
31 |
+
(1)为了让它只输出一个字符,设置 max_tokens=3
|
32 |
+
|
33 |
+
|
34 |
+
Model Name:
|
35 |
+
Qwen/Qwen3-8B
|
36 |
+
deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
|
37 |
+
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
|
38 |
+
Tips:
|
39 |
+
(1)为了让它只输出一个字符,设置 max_tokens=1
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
Model Name:
|
44 |
+
baidu/ERNIE-4.5-300B-A47B
|
45 |
+
Tips:
|
46 |
+
(1)它可能使用的是bpe 分词, logit_bias 注释掉。
|
47 |
+
|
48 |
+
"""
|
49 |
+
import argparse
|
50 |
+
from datetime import datetime
|
51 |
+
import json
|
52 |
+
import os
|
53 |
+
from pathlib import Path
|
54 |
+
import sys
|
55 |
+
import time
|
56 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
57 |
+
|
58 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
59 |
+
sys.path.append(os.path.join(pwd, "../"))
|
60 |
+
|
61 |
+
from openai import OpenAI
|
62 |
+
|
63 |
+
from project_settings import environment, project_path
|
64 |
+
|
65 |
+
|
66 |
+
def get_args():
    """Parse command-line arguments for the SiliconFlow chat-eval run."""
    parser = argparse.ArgumentParser()

    # Model ids kept around for quick switching:
    #   Pro/deepseek-ai/DeepSeek-R1, tencent/Hunyuan-A13B-Instruct,
    #   Qwen/Qwen3-8B, deepseek-ai/DeepSeek-R1,
    #   deepseek-ai/DeepSeek-R1-0528-Qwen3-8B, baidu/ERNIE-4.5-300B-A47B
    parser.add_argument("--model_name", default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", type=str)

    parser.add_argument("--eval_dataset_name", default="agent-lingoace-zh-80-chat.jsonl", type=str)
    parser.add_argument("--eval_dataset_dir", default=(project_path / "data/dataset").as_posix(), type=str)
    parser.add_argument("--eval_data_dir", default=(project_path / "data/eval_data").as_posix(), type=str)
    parser.add_argument("--client", default="shenzhen_sase", type=str)
    parser.add_argument("--service", default="siliconflow_api_key", type=str)
    parser.add_argument("--create_time_str", default="null", type=str)
    parser.add_argument("--interval", default=1, type=int)

    return parser.parse_args()
|
116 |
+
|
117 |
+
|
118 |
+
def main():
    """Run a resumable free-form chat evaluation against the SiliconFlow API.

    Reads one JSON object per line from the eval dataset ("idx", "prompt",
    "response"), asks the configured model for a full chat completion, and
    appends one JSON line per sample to a ".raw" output file for later
    scoring by a separate metric script.  Already-answered "idx" values
    found in an existing output file are skipped, so an interrupted run can
    be resumed with the same --create_time_str.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means "start a fresh run"; any other value resumes the run with
    # that timestamp (output path depends on it).
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # "/" in model names would create extra path components; replace with "#".
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}.raw"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0

    # finished
    # Resume support: re-read any existing output file to recover which idx
    # values are done and the latest running total.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            # Reference answer, carried through to the output for scoring.
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                # Throttle requests to stay under the provider's rate limit.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ],
                    stream=False,
                    # Chat eval wants the full answer (unlike the choice eval,
                    # which caps output at one token).
                    max_tokens=4096,
                    # max_tokens=1,
                    temperature=0.6,
                    top_p=0.95,
                    # logit_bias={
                    #     32: 100,
                    #     33: 100,
                    #     34: 100,
                    #     35: 100,
                    #     36: 100,
                    #     37: 100,
                    #     38: 100,
                    #     39: 100,
                    # },
                    extra_body={
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: log and skip this sample.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = completion.choices[0].message.content

            total += 1

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "total": total,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per sample so a crash loses at most the in-flight row.
            fout.flush()

    return


if __name__ == "__main__":
    main()
|
log.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
from datetime import datetime
|
4 |
+
import logging
|
5 |
+
from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler
|
6 |
+
import os
|
7 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
8 |
+
|
9 |
+
|
10 |
+
def get_converter(tz_info: str = "Asia/Shanghai"):
    """Build a ``logging.Formatter.converter``-compatible time converter.

    Returns a callable that maps a POSIX timestamp to a ``time.struct_time``
    localized to the IANA timezone *tz_info*.
    """
    def converter(timestamp):
        return datetime.fromtimestamp(timestamp, ZoneInfo(tz_info)).timetuple()

    return converter
|
16 |
+
|
17 |
+
|
18 |
+
def setup_size_rotating(log_directory: str, tz_info: str = "Asia/Shanghai"):
    """Configure size-based rotating log files under *log_directory*.

    Named loggers "main", "http" and "api" get a rotating file handler plus
    an INFO-level stderr echo; "alarm" gets a file handler only.  The root
    logger (via ``basicConfig``) writes debug/info/error files.  All
    timestamps are rendered in the *tz_info* timezone.

    :param log_directory: existing directory that receives the ``*.log`` files.
    :param tz_info: IANA timezone name used for log timestamps.
    """
    fmt = "%(asctime)s|%(name)s|%(levelname)s|%(filename)s|%(lineno)d|%(message)s"

    formatter = logging.Formatter(
        fmt=fmt,
        datefmt="%Y-%m-%d %H:%M:%S %z"
    )
    # Render timestamps in the configured timezone instead of server-local time.
    formatter.converter = get_converter(tz_info)

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(formatter)

    # main
    main_logger = logging.getLogger("main")
    main_logger.addHandler(stream_handler)
    main_info_file_handler = RotatingFileHandler(
        filename=os.path.join(log_directory, "main.log"),
        maxBytes=100*1024*1024,  # 100MB
        encoding="utf-8",
        backupCount=2,
    )
    main_info_file_handler.setLevel(logging.INFO)
    main_info_file_handler.setFormatter(formatter)
    main_logger.addHandler(main_info_file_handler)

    # http
    http_logger = logging.getLogger("http")
    http_logger.addHandler(stream_handler)
    http_file_handler = RotatingFileHandler(
        filename=os.path.join(log_directory, "http.log"),
        maxBytes=100*1024*1024,  # 100MB
        encoding="utf-8",
        backupCount=2,
    )
    http_file_handler.setLevel(logging.DEBUG)
    http_file_handler.setFormatter(formatter)
    http_logger.addHandler(http_file_handler)

    # api
    api_logger = logging.getLogger("api")
    api_logger.addHandler(stream_handler)
    api_file_handler = RotatingFileHandler(
        filename=os.path.join(log_directory, "api.log"),
        maxBytes=10*1024*1024,  # 10MB
        encoding="utf-8",
        backupCount=2,
    )
    api_file_handler.setLevel(logging.DEBUG)
    api_file_handler.setFormatter(formatter)
    api_logger.addHandler(api_file_handler)

    # alarm
    alarm_logger = logging.getLogger("alarm")
    alarm_file_handler = RotatingFileHandler(
        filename=os.path.join(log_directory, "alarm.log"),
        maxBytes=1*1024*1024,  # 1MB
        encoding="utf-8",
        backupCount=2,
    )
    alarm_file_handler.setLevel(logging.DEBUG)
    # FIX: use the timezone-aware formatter (was a bare logging.Formatter(fmt)),
    # so alarm/debug/info/error output matches the other handlers.
    alarm_file_handler.setFormatter(formatter)
    alarm_logger.addHandler(alarm_file_handler)

    debug_file_handler = RotatingFileHandler(
        filename=os.path.join(log_directory, "debug.log"),
        maxBytes=1*1024*1024,  # 1MB
        encoding="utf-8",
        backupCount=2,
    )
    debug_file_handler.setLevel(logging.DEBUG)
    debug_file_handler.setFormatter(formatter)

    info_file_handler = RotatingFileHandler(
        filename=os.path.join(log_directory, "info.log"),
        maxBytes=1*1024*1024,  # 1MB
        encoding="utf-8",
        backupCount=2,
    )
    info_file_handler.setLevel(logging.INFO)
    info_file_handler.setFormatter(formatter)

    error_file_handler = RotatingFileHandler(
        filename=os.path.join(log_directory, "error.log"),
        maxBytes=1*1024*1024,  # 1MB
        encoding="utf-8",
        backupCount=2,
    )
    error_file_handler.setLevel(logging.ERROR)
    error_file_handler.setFormatter(formatter)

    logging.basicConfig(
        level=logging.DEBUG,
        datefmt="%a, %d %b %Y %H:%M:%S",
        handlers=[
            debug_file_handler,
            info_file_handler,
            error_file_handler,
        ]
    )
|
118 |
+
|
119 |
+
|
120 |
+
def setup_time_rotating(log_directory: str):
    """Configure time-based (midnight/daily) rotating log files under *log_directory*.

    Named loggers "main", "http", "api" and "alarm" each get a
    TimedRotatingFileHandler; only "main" also echoes INFO+ to stderr.
    The root logger (via ``basicConfig``) writes debug/info/error files.
    Unlike :func:`setup_size_rotating`, timestamps here use the default
    (server-local) time, not a configured timezone.

    :param log_directory: existing directory that receives the ``*.log`` files.
    """
    fmt = "%(asctime)s - %(name)s - %(levelname)s %(filename)s:%(lineno)d > %(message)s"

    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(logging.Formatter(fmt))

    # main
    main_logger = logging.getLogger("main")
    main_logger.addHandler(stream_handler)
    main_info_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "main.log"),
        encoding="utf-8",
        when="midnight",
        interval=1,
        backupCount=7
    )
    main_info_file_handler.setLevel(logging.INFO)
    main_info_file_handler.setFormatter(logging.Formatter(fmt))
    main_logger.addHandler(main_info_file_handler)

    # http
    http_logger = logging.getLogger("http")
    http_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "http.log"),
        encoding='utf-8',
        when="midnight",
        interval=1,
        backupCount=7
    )
    http_file_handler.setLevel(logging.DEBUG)
    http_file_handler.setFormatter(logging.Formatter(fmt))
    http_logger.addHandler(http_file_handler)

    # api
    api_logger = logging.getLogger("api")
    api_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "api.log"),
        encoding='utf-8',
        when="midnight",
        interval=1,
        backupCount=7
    )
    api_file_handler.setLevel(logging.DEBUG)
    api_file_handler.setFormatter(logging.Formatter(fmt))
    api_logger.addHandler(api_file_handler)

    # alarm
    alarm_logger = logging.getLogger("alarm")
    alarm_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "alarm.log"),
        encoding="utf-8",
        when="midnight",
        interval=1,
        backupCount=7
    )
    alarm_file_handler.setLevel(logging.DEBUG)
    alarm_file_handler.setFormatter(logging.Formatter(fmt))
    alarm_logger.addHandler(alarm_file_handler)

    # Root-logger handlers ("D" == rotate daily, counted from process start,
    # versus "midnight" above which rotates at day boundaries).
    debug_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "debug.log"),
        encoding="utf-8",
        when="D",
        interval=1,
        backupCount=7
    )
    debug_file_handler.setLevel(logging.DEBUG)
    debug_file_handler.setFormatter(logging.Formatter(fmt))

    info_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "info.log"),
        encoding="utf-8",
        when="D",
        interval=1,
        backupCount=7
    )
    info_file_handler.setLevel(logging.INFO)
    info_file_handler.setFormatter(logging.Formatter(fmt))

    error_file_handler = TimedRotatingFileHandler(
        filename=os.path.join(log_directory, "error.log"),
        encoding="utf-8",
        when="D",
        interval=1,
        backupCount=7
    )
    error_file_handler.setLevel(logging.ERROR)
    error_file_handler.setFormatter(logging.Formatter(fmt))

    logging.basicConfig(
        level=logging.DEBUG,
        datefmt="%a, %d %b %Y %H:%M:%S",
        handlers=[
            debug_file_handler,
            info_file_handler,
            error_file_handler,
        ]
    )


if __name__ == "__main__":
    pass
|
main.py
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
"""
|
4 |
+
docker build -t llm_eval_system:v20250724_1442 .
|
5 |
+
|
6 |
+
docker stop llm_eval_system_7862 && docker rm llm_eval_system_7862
|
7 |
+
|
8 |
+
docker run -itd \
|
9 |
+
--name llm_eval_system_7862 \
|
10 |
+
--restart=always \
|
11 |
+
--network host \
|
12 |
+
-e port=7862 \
|
13 |
+
llm_eval_system:v20250724_1442 \
|
14 |
+
/bin/bash
|
15 |
+
"""
|
16 |
+
import argparse
|
17 |
+
import logging
|
18 |
+
import platform
|
19 |
+
|
20 |
+
import gradio as gr
|
21 |
+
|
22 |
+
from project_settings import environment, project_path, log_directory
|
23 |
+
from toolbox.os.command import Command
|
24 |
+
import log
|
25 |
+
|
26 |
+
log.setup_size_rotating(log_directory=log_directory)
|
27 |
+
|
28 |
+
logger = logging.getLogger("main")
|
29 |
+
|
30 |
+
|
31 |
+
def get_args():
    """Parse command-line arguments for the gradio app."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str,
    )
    return parser.parse_args()
|
40 |
+
|
41 |
+
|
42 |
+
def shell(cmd: str):
    # Run an arbitrary shell command and return its captured output.
    # SECURITY NOTE(review): this executes untrusted UI input in a shell;
    # the gradio app exposing it should only run on a trusted network.
    return Command.popen(cmd)
|
44 |
+
|
45 |
+
|
46 |
+
def main():
    """Launch the gradio admin UI (currently a single "shell" tab)."""
    args = get_args()

    # ui
    with gr.Blocks() as blocks:
        with gr.Tabs():
            with gr.TabItem("shell"):
                shell_text = gr.Textbox(label="cmd")
                shell_button = gr.Button("run")
                shell_output = gr.Textbox(label="output", max_lines=100)

                shell_button.click(
                    shell,
                    inputs=[shell_text, ],
                    outputs=[shell_output],
                )

    # http://127.0.0.1:7861/
    # http://10.75.27.247:7861/
    blocks.queue().launch(
        # NOTE(review): both branches are False, so sharing is always off;
        # presumably intentional, but the conditional is redundant.
        share=False if platform.system() == "Windows" else False,
        # Bind localhost-only on Windows (dev), all interfaces elsewhere.
        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        # server_name="0.0.0.0",
        # Port comes from the dotenv-backed environment, default 7860.
        server_port=environment.get("port", 7860, dtype=int),
    )
    return


if __name__ == "__main__":
    main()
|
project_settings.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Project-wide settings: paths, log directory and dotenv-backed environment."""
import os
from pathlib import Path

from toolbox.os.environment import EnvironmentManager

# Absolute path of the repository root (the directory containing this file).
project_path = os.path.abspath(os.path.dirname(__file__))
project_path = Path(project_path)

# Created eagerly at import time so loggers can open files immediately.
log_directory = project_path / "logs"
log_directory.mkdir(parents=True, exist_ok=True)

# Central accessor for configuration values loaded from dotenv/dev.env.
environment = EnvironmentManager(
    filename=os.path.join(project_path, "dotenv/dev.env"),
)

# Default timezone name used for timestamps across the project.
time_zone_info = "Asia/Shanghai"


if __name__ == '__main__':
    pass
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
python-dotenv
|
3 |
+
pandas
|
4 |
+
openpyxl
|
tabs/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == "__main__":
|
6 |
+
pass
|
tabs/fs_tab.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
from project_settings import project_path
|
6 |
+
|
7 |
+
|
8 |
+
def get_fs_tab():
    """Build the "fs" tab: a minimal file browser over directories under the project root.

    Returns ``locals()`` so callers can reach the created components by their
    local variable names — those names are therefore part of the interface.
    """
    with gr.TabItem("fs"):
        with gr.Row():
            with gr.Column(scale=3):
                fs_filename = gr.Textbox(label="filename", max_lines=10)
                fs_file = gr.File(label="file")
                # fs_file_dir = gr.Textbox(value="data", label="file_dir")
                fs_file_dir = gr.Dropdown(choices=["data/dataset", "data/eval_data"],
                                          value="data/dataset",
                                          label="file_dir")
                fs_query = gr.Button("query", variant="primary")
            with gr.Column(scale=7):
                fs_filelist_dataset_state = gr.State(value=[])
                fs_filelist_dataset = gr.Dataset(
                    components=[fs_filename, fs_file],
                    samples=fs_filelist_dataset_state.value,
                )

        def when_click_query_files(file_dir: str = "data"):
            # Collect (relative name, absolute path) pairs for every regular
            # file below `file_dir`, skipping hidden, .py and .raw files.
            file_dir = project_path / file_dir
            dataset_state = list()
            for filename in file_dir.glob("**/*.*"):
                if filename.is_dir():
                    continue
                if filename.stem.startswith("."):
                    continue
                if filename.name.endswith(".py"):
                    continue
                if filename.name.endswith(".raw"):
                    continue
                dataset_state.append((
                    filename.relative_to(file_dir).as_posix(),
                    filename.as_posix(),
                ))

            # Rebuild the Dataset component so gradio refreshes the sample list.
            dataset = gr.Dataset(
                components=[fs_filename, fs_file],
                samples=dataset_state,
            )
            return dataset_state, dataset

        fs_filelist_dataset.click(
            # NOTE(review): x[1] (absolute path) is written into BOTH the
            # filename textbox and the file component; x[0] holds the
            # relative name — confirm the textbox is meant to show the path.
            fn=lambda x: (
                x[1], x[1]
            ),
            inputs=[fs_filelist_dataset],
            outputs=[fs_filename, fs_file]
        )
        fs_query.click(
            fn=when_click_query_files,
            inputs=[fs_file_dir],
            outputs=[fs_filelist_dataset_state, fs_filelist_dataset]
        )
    return locals()
|
62 |
+
|
63 |
+
|
64 |
+
if __name__ == "__main__":
    # Manual smoke test: serve the fs tab standalone.
    with gr.Blocks() as block:
        fs_components = get_fs_tab()
    block.launch()
|
tabs/shell_tab.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
from toolbox.os.command import Command
|
6 |
+
|
7 |
+
|
8 |
+
def shell(cmd: str):
    """Run *cmd* through ``Command.popen`` and return the captured output."""
    output = Command.popen(cmd)
    return output
|
10 |
+
|
11 |
+
|
12 |
+
def get_shell_tab():
    """Build the "shell" tab: a textbox whose content is executed via ``shell``.

    Returns ``locals()`` so callers can reach the created components by their
    local variable names — those names are therefore part of the interface.
    """
    with gr.TabItem("shell"):
        shell_text = gr.Textbox(label="cmd")
        shell_button = gr.Button("run")
        shell_output = gr.Textbox(label="output", max_lines=100)

        # Wire the button: run the command text, show its captured output.
        shell_button.click(
            shell,
            inputs=[shell_text, ],
            outputs=[shell_output],
        )

    return locals()
|
25 |
+
|
26 |
+
|
27 |
+
if __name__ == "__main__":
|
28 |
+
pass
|
toolbox/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == "__main__":
|
6 |
+
pass
|
toolbox/os/__init__.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
|
5 |
+
if __name__ == '__main__':
|
6 |
+
pass
|
toolbox/os/command.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import os
|
4 |
+
|
5 |
+
|
6 |
+
class Command(object):
    """Dispatch shell commands, intercepting a few built-ins (e.g. ``cd``)
    that must run inside this process instead of in a subshell."""

    # Commands implemented as methods on this class rather than executed in
    # a subshell (a subshell's ``cd`` would not affect this process's cwd).
    custom_command = [
        "cd"
    ]

    def __init__(self):
        pass

    @staticmethod
    def _get_cmd(command):
        """Split *command* into (executable name, argument string).

        Returns ``None`` for a blank command so callers can short-circuit.
        """
        command = str(command).strip()
        if command == "":
            return None
        cmd_and_args = command.split(sep=" ")
        cmd = cmd_and_args[0]
        args = " ".join(cmd_and_args[1:])
        return cmd, args

    @classmethod
    def popen(cls, command):
        """Execute *command* and return its captured stdout as a string.

        Commands listed in ``custom_command`` are dispatched to the matching
        classmethod; everything else goes through ``os.popen``.
        NOTE: the command string reaches a shell unescaped — callers must
        trust their input.
        """
        parsed = cls._get_cmd(command)
        if parsed is None:
            # Bug fix: a blank command used to raise TypeError when the
            # None result was unpacked below; treat it as a no-op instead.
            return ""
        cmd, args = parsed
        if cmd in cls.custom_command:
            method = getattr(cls, cmd)
            return method(args)
        else:
            resp = os.popen(command)
            result = resp.read()
            resp.close()
            return result

    @classmethod
    def cd(cls, args):
        """Change this process's working directory (absolute or relative path)."""
        if args.startswith("/"):
            os.chdir(args)
        else:
            pwd = os.getcwd()
            path = os.path.join(pwd, args)
            os.chdir(path)

    @classmethod
    def system(cls, command):
        """Run *command* via ``os.system`` and return its exit status."""
        return os.system(command)
|
48 |
+
|
49 |
+
|
50 |
+
def ps_ef_grep(keyword: str):
    """Return `ps -ef` lines containing *keyword*, excluding the grep line itself."""
    output = Command.popen("ps -ef | grep {}".format(keyword))
    matched = list()
    for line in str(output).split("\n"):
        if keyword in line and "grep" not in line:
            matched.append(line)
    return matched
|
56 |
+
|
57 |
+
|
58 |
+
if __name__ == "__main__":
|
59 |
+
pass
|
toolbox/os/environment.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import os
|
4 |
+
|
5 |
+
from dotenv import load_dotenv
|
6 |
+
from dotenv.main import DotEnv
|
7 |
+
|
8 |
+
|
9 |
+
class EnvironmentManager(object):
    """Thin wrapper around python-dotenv: loads a .env file into ``os.environ``
    and reads typed values back out of the process environment."""

    def __init__(self, filename, override=False):
        """Load *filename* into ``os.environ``.

        :param filename: path of the dotenv file.
        :param override: when True, dotenv values replace existing variables.
        """
        self.filename = filename

        load_dotenv(
            dotenv_path=filename,
            override=override
        )

    def open_dotenv(self, filename: str = None):
        """Parse a dotenv file (default: the one given at construction) and
        return its key/value pairs as a dict, without touching ``os.environ``."""
        filename = filename or self.filename
        dotenv = DotEnv(
            dotenv_path=filename,
            stream=None,
            verbose=False,
            interpolate=False,
            override=False,
            encoding="utf-8",
        )
        result = dotenv.dict()
        return result

    def get(self, key, default=None, dtype=str):
        """Read *key* from ``os.environ``, converting the value with *dtype*.

        Returns *default* unchanged (no dtype conversion) when the variable
        is not set.
        """
        result = os.environ.get(key)
        if result is None:
            # Simplified: the original branched on `default is None` only to
            # assign None or default — both collapse to returning default.
            return default
        return dtype(result)
|
41 |
+
|
42 |
+
|
43 |
+
def main():
    """Placeholder entry point; nothing to do when this module runs directly."""
    return None
|
45 |
+
|
46 |
+
|
47 |
+
if __name__ == '__main__':
|
48 |
+
main()
|
toolbox/os/other.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import inspect
|
3 |
+
|
4 |
+
|
5 |
+
def pwd():
    """Return the directory of the source file from which this function is called."""
    # currentframe().f_back is the caller's frame — the exact equivalent of
    # inspect.stack()[1][0], without materializing the whole call stack.
    caller_frame = inspect.currentframe().f_back
    caller_module = inspect.getmodule(caller_frame)
    return os.path.dirname(os.path.abspath(caller_module.__file__))
|