HoneyTian committed on
Commit
1e55fa2
·
1 Parent(s): adb1e77
Files changed (20) hide show
  1. Dockerfile +1 -1
  2. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-bingoplus-ph-200-chat.jsonl +3 -0
  3. data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-bingoplus-ph-90-choice.jsonl +3 -0
  4. data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-200-chat.jsonl +3 -0
  5. data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-90-choice.jsonl +3 -0
  6. data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-400-choice.jsonl +3 -0
  7. data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-80-chat.jsonl +3 -0
  8. data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/arc-easy-1000-choice.jsonl +3 -0
  9. data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-200-chat.jsonl +3 -0
  10. data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-90-choice.jsonl +3 -0
  11. data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-400-choice.jsonl +3 -0
  12. data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-80-chat.jsonl +3 -0
  13. examples/make_raw_dataset/step_3_filter_by_keywords.py +19 -2
  14. examples/test_metrics/bingoplus_chat_metric.py +2 -2
  15. examples/test_metrics/lingoace_chat_metric.py +2 -2
  16. llm_eval_script/byteplus.py +2 -1
  17. llm_eval_script/byteplus_chat.py +4 -3
  18. llm_eval_script/gemini_google.py +49 -7
  19. llm_eval_script/gemini_google_chat.py +8 -4
  20. main.py +1 -0
Dockerfile CHANGED
@@ -5,7 +5,7 @@ WORKDIR /code
5
  COPY . /code
6
 
7
  RUN apt-get update
8
- RUN apt-get install -y wget unzip ffmpeg build-essential git
9
 
10
  RUN pip install --upgrade pip
11
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
 
5
  COPY . /code
6
 
7
  RUN apt-get update
8
+ RUN apt-get install -y wget unzip ffmpeg build-essential git git-lfs
9
 
10
  RUN pip install --upgrade pip
11
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a574d56126be957ef4d283af06243125886f7544ccaa5bbbe0b01900abe2c62f
3
+ size 2417697
data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:118787cf7fd66a6683864ff4b79fc648c7d17c65b420c25092c14857c75674ed
3
+ size 258515
data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab42fc8b853062a9391db33fe890869e7f61e7f9c118ea2c84e3c3555768ca00
3
+ size 2419510
data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d738dbb5fa0aef7cc3880b0ec50f2a54143ce586b74bb3c1cffe009f53344dc
3
+ size 258673
data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ae30069ee95459c290f53eb50dcb72cb2c11a8a7c3691a96006f4d462dd767b
3
+ size 1211487
data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-80-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ea3a2b7e5c28a98464352433baecdb7f6c011046d6853282709f7b62ca1386c
3
+ size 874387
data/eval_data/gemini_google/google/llama-4-maverick-17b-128e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/arc-easy-1000-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:970ffc784ca83d2ce6e826d3303590d0646f77395bdd832fa809cf09dad46529
3
+ size 720927
data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-200-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f762c204ac2438aebe08f143bbffddd10d2e94701dd787b103506c09c79f1c1b
3
+ size 2471787
data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6963aa07be72dff967b2388cb4d0303ed76624ba7b48f3f5861c9b207c08448
3
+ size 258578
data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b38cb68452d6f237d275aa03a6c589ece653d4f8ecd5e808d41bb0ac729d850
3
+ size 1211826
data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-80-chat.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40aab0bb0dd05948d878e0ffab0cb84eca630530079619bd79744957cf42bef2
3
+ size 874346
examples/make_raw_dataset/step_3_filter_by_keywords.py CHANGED
@@ -50,12 +50,29 @@ def main():
50
 
51
  for key_str in [
52
  # "BingoPlus",
53
- " COD ",
 
 
 
 
 
 
 
 
 
54
  ]:
55
  if system_prompt.__contains__(key_str) or user_prompt.__contains__(key_str):
56
  print(f"process: {sample_dir.as_posix()}")
57
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-bingoplus"
58
- tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-cod"
 
 
 
 
 
 
 
 
59
  tgt_dir.mkdir(parents=True, exist_ok=True)
60
  shutil.move(
61
  sample_dir.as_posix(),
 
50
 
51
  for key_str in [
52
  # "BingoPlus",
53
+ # " COD ",
54
+ # "NXPay",
55
+ # "NX Money",
56
+ # "Exodus Telecom",
57
+ # "Exodus Retail",
58
+ "Exodus Automotive",
59
+ # "kta kilat", "KTA KILAT",
60
+ # "NXCloud",
61
+ # "作为VIP客户",
62
+ "FedEx",
63
  ]:
64
  if system_prompt.__contains__(key_str) or user_prompt.__contains__(key_str):
65
  print(f"process: {sample_dir.as_posix()}")
66
  # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-bingoplus"
67
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-cod"
68
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxpay"
69
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxmoney"
70
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-exodus-retail"
71
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-exodus-automotive"
72
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-kta"
73
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-nxcloud"
74
+ # tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-vip"
75
+ tgt_dir = dataset_dir / f"{data_dir.parts[-1]}-fedex"
76
  tgt_dir.mkdir(parents=True, exist_ok=True)
77
  shutil.move(
78
  sample_dir.as_posix(),
examples/test_metrics/bingoplus_chat_metric.py CHANGED
@@ -38,12 +38,12 @@ python3 azure_openai.py --model_name gpt-4o-mini \
38
  )
39
  parser.add_argument(
40
  "--eval_data_file",
41
- default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash-lite-preview-06-17/shenzhen_sase/google_potent_veld_462405_t3/20250729_161543/agent-bingoplus-ph-200-chat.jsonl.raw").as_posix(),
42
  type=str
43
  )
44
  parser.add_argument(
45
  "--output_file",
46
- default=(project_path / "data/eval_data/gemini_google/google/gemini-2.5-flash-lite-preview-06-17/shenzhen_sase/google_potent_veld_462405_t3/20250729_161543/agent-bingoplus-ph-200-chat.jsonl").as_posix(),
47
  type=str
48
  )
49
  parser.add_argument(
 
38
  )
39
  parser.add_argument(
40
  "--eval_data_file",
41
+ default=(project_path / "data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-200-chat.jsonl.raw").as_posix(),
42
  type=str
43
  )
44
  parser.add_argument(
45
  "--output_file",
46
+ default=(project_path / "data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-bingoplus-ph-200-chat.jsonl").as_posix(),
47
  type=str
48
  )
49
  parser.add_argument(
examples/test_metrics/lingoace_chat_metric.py CHANGED
@@ -43,12 +43,12 @@ python3 azure_openai.py --model_name gpt-4o-mini \
43
  )
44
  parser.add_argument(
45
  "--eval_data_file",
46
- default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl.raw").as_posix(),
47
  type=str
48
  )
49
  parser.add_argument(
50
  "--output_file",
51
- default=(project_path / "data/eval_data/byteplus/byteplus/seed-1-6-flash-250615/shenzhen_sase/byteplus_api_key/20250728_113641/agent-lingoace-zh-80-chat.jsonl").as_posix(),
52
  type=str
53
  )
54
  parser.add_argument(
 
43
  )
44
  parser.add_argument(
45
  "--eval_data_file",
46
+ default=(project_path / "data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-80-chat.jsonl.raw").as_posix(),
47
  type=str
48
  )
49
  parser.add_argument(
50
  "--output_file",
51
+ default=(project_path / "data/eval_data/gemini_google/google/llama-4-scout-17b-16e-instruct-maas/shenzhen_sase/google_potent_veld_462405_t3/20250731_162116/agent-lingoace-zh-80-chat.jsonl").as_posix(),
52
  type=str
53
  )
54
  parser.add_argument(
llm_eval_script/byteplus.py CHANGED
@@ -49,8 +49,9 @@ def get_args():
49
  )
50
  parser.add_argument(
51
  "--eval_dataset_name",
 
52
  # default="agent-lingoace-zh-400-choice.jsonl",
53
- default="arc-easy-1000-choice.jsonl",
54
  type=str
55
  )
56
  parser.add_argument(
 
49
  )
50
  parser.add_argument(
51
  "--eval_dataset_name",
52
+ default="agent-bingoplus-ph-90-choice.jsonl",
53
  # default="agent-lingoace-zh-400-choice.jsonl",
54
+ # default="arc-easy-1000-choice.jsonl",
55
  type=str
56
  )
57
  parser.add_argument(
llm_eval_script/byteplus_chat.py CHANGED
@@ -42,14 +42,15 @@ def get_args():
42
  parser = argparse.ArgumentParser()
43
  parser.add_argument(
44
  "--model_name",
45
- default="seed-1-6-250615",
46
- # default="seed-1-6-flash-250615",
47
  # default="deepseek-v3-250324",
48
  type=str
49
  )
50
  parser.add_argument(
51
  "--eval_dataset_name",
52
- default="agent-lingoace-zh-80-chat.jsonl",
 
53
  type=str
54
  )
55
  parser.add_argument(
 
42
  parser = argparse.ArgumentParser()
43
  parser.add_argument(
44
  "--model_name",
45
+ # default="seed-1-6-250615",
46
+ default="seed-1-6-flash-250615",
47
  # default="deepseek-v3-250324",
48
  type=str
49
  )
50
  parser.add_argument(
51
  "--eval_dataset_name",
52
+ # default="agent-lingoace-zh-80-chat.jsonl",
53
+ default="agent-bingoplus-ph-200-chat.jsonl",
54
  type=str
55
  )
56
  parser.add_argument(
llm_eval_script/gemini_google.py CHANGED
@@ -1,5 +1,25 @@
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import argparse
4
  from datetime import datetime
5
  import json
@@ -25,13 +45,17 @@ def get_args():
25
  "--model_name",
26
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
  # default="gemini-2.5-flash",
28
- default="gemini-2.5-flash-lite-preview-06-17",
 
 
 
 
29
  type=str
30
  )
31
  parser.add_argument(
32
  "--eval_dataset_name",
33
- default="agent-bingoplus-ph-90-choice.jsonl",
34
- # default="agent-lingoace-zh-400-choice.jsonl",
35
  # default="arc-easy-1000-choice.jsonl",
36
  type=str
37
  )
@@ -55,6 +79,17 @@ def get_args():
55
  default="google_potent_veld_462405_t3",
56
  type=str
57
  )
 
 
 
 
 
 
 
 
 
 
 
58
  args = parser.parse_args()
59
  return args
60
 
@@ -79,9 +114,13 @@ def main():
79
  eval_data_dir = Path(args.eval_data_dir)
80
  eval_data_dir.mkdir(parents=True, exist_ok=True)
81
 
82
- tz = ZoneInfo("Asia/Shanghai")
83
- now = datetime.now(tz)
84
- create_time_str = now.strftime("%Y%m%d_%H%M%S")
 
 
 
 
85
 
86
  eval_dataset = eval_dataset_dir / args.eval_dataset_name
87
 
@@ -91,7 +130,8 @@ def main():
91
  client = genai.Client(
92
  vertexai=True,
93
  project=project_id,
94
- location="global",
 
95
  )
96
  generate_content_config = types.GenerateContentConfig(
97
  top_p=0.95,
@@ -137,6 +177,8 @@ def main():
137
  ]
138
  )
139
  ]
 
 
140
  time_begin = time.time()
141
  llm_response: types.GenerateContentResponse = client.models.generate_content(
142
  model=args.model_name,
 
1
  #!/usr/bin/python3
2
  # -*- coding: utf-8 -*-
3
+ """
4
+ https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude?hl=zh-cn
5
+ https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude/use-claude?hl=zh-cn
6
+
7
+
8
+ Llama
9
+
10
+ https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn
11
+ https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn#regions-quotas
12
+
13
+ Model Name
14
+ llama-4-maverick-17b-128e-instruct-maas
15
+ llama-4-scout-17b-16e-instruct-maas
16
+
17
+ 区域选择 us-east5
18
+
19
+
20
+
21
+
22
+ """
23
  import argparse
24
  from datetime import datetime
25
  import json
 
45
  "--model_name",
46
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
47
  # default="gemini-2.5-flash",
48
+ # default="gemini-2.5-flash-lite-preview-06-17",
49
+ # default="claude-opus-4@20250514",
50
+ # default="claude-sonnet-4@20250514",
51
+ # default="llama-4-maverick-17b-128e-instruct-maas",
52
+ default="llama-4-scout-17b-16e-instruct-maas",
53
  type=str
54
  )
55
  parser.add_argument(
56
  "--eval_dataset_name",
57
+ # default="agent-bingoplus-ph-90-choice.jsonl",
58
+ default="agent-lingoace-zh-400-choice.jsonl",
59
  # default="arc-easy-1000-choice.jsonl",
60
  type=str
61
  )
 
79
  default="google_potent_veld_462405_t3",
80
  type=str
81
  )
82
+ parser.add_argument(
83
+ "--create_time_str",
84
+ # default="null",
85
+ default="20250731_162116",
86
+ type=str
87
+ )
88
+ parser.add_argument(
89
+ "--interval",
90
+ default=1,
91
+ type=int
92
+ )
93
  args = parser.parse_args()
94
  return args
95
 
 
114
  eval_data_dir = Path(args.eval_data_dir)
115
  eval_data_dir.mkdir(parents=True, exist_ok=True)
116
 
117
+ if args.create_time_str == "null":
118
+ tz = ZoneInfo("Asia/Shanghai")
119
+ now = datetime.now(tz)
120
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
121
+ # create_time_str = "20250729-interval-5"
122
+ else:
123
+ create_time_str = args.create_time_str
124
 
125
  eval_dataset = eval_dataset_dir / args.eval_dataset_name
126
 
 
130
  client = genai.Client(
131
  vertexai=True,
132
  project=project_id,
133
+ # location="global",
134
+ location="us-east5",
135
  )
136
  generate_content_config = types.GenerateContentConfig(
137
  top_p=0.95,
 
177
  ]
178
  )
179
  ]
180
+ time.sleep(args.interval)
181
+ print(f"sleep: {args.interval}")
182
  time_begin = time.time()
183
  llm_response: types.GenerateContentResponse = client.models.generate_content(
184
  model=args.model_name,
llm_eval_script/gemini_google_chat.py CHANGED
@@ -25,7 +25,9 @@ def get_args():
25
  "--model_name",
26
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
  # default="gemini-2.5-flash",
28
- default="gemini-2.5-flash-lite-preview-06-17",
 
 
29
  type=str
30
  )
31
  parser.add_argument(
@@ -57,12 +59,12 @@ def get_args():
57
  parser.add_argument(
58
  "--create_time_str",
59
  # default="null",
60
- default="20250729_161543",
61
  type=str
62
  )
63
  parser.add_argument(
64
  "--interval",
65
- default=5,
66
  type=int
67
  )
68
  args = parser.parse_args()
@@ -105,7 +107,9 @@ def main():
105
  client = genai.Client(
106
  vertexai=True,
107
  project=project_id,
108
- location="global",
 
 
109
  )
110
  generate_content_config = types.GenerateContentConfig(
111
  top_p=0.95,
 
25
  "--model_name",
26
  # default="gemini-2.5-pro", # The model does not support setting thinking_budget to 0.
27
  # default="gemini-2.5-flash",
28
+ # default="gemini-2.5-flash-lite-preview-06-17",
29
+ # default="llama-4-maverick-17b-128e-instruct-maas",
30
+ default="llama-4-scout-17b-16e-instruct-maas",
31
  type=str
32
  )
33
  parser.add_argument(
 
59
  parser.add_argument(
60
  "--create_time_str",
61
  # default="null",
62
+ default="20250731_162116",
63
  type=str
64
  )
65
  parser.add_argument(
66
  "--interval",
67
+ default=1,
68
  type=int
69
  )
70
  args = parser.parse_args()
 
107
  client = genai.Client(
108
  vertexai=True,
109
  project=project_id,
110
+ # location="global",
111
+ location="us-east5",
112
+
113
  )
114
  generate_content_config = types.GenerateContentConfig(
115
  top_p=0.95,
main.py CHANGED
@@ -17,6 +17,7 @@ docker run -itd \
17
  --name llm_eval_system_7862 \
18
  --restart=always \
19
  --network host \
 
20
  python:3.12 \
21
  /bin/bash
22
 
 
17
  --name llm_eval_system_7862 \
18
  --restart=always \
19
  --network host \
20
+ -v /data/tianxing/PycharmProjects/llm_eval_system:/data/tianxing/PycharmProjects/llm_eval_system \
21
  python:3.12 \
22
  /bin/bash
23