Ahmed Ahmed committed
Commit 70ea05e · 1 Parent(s): 46cc1f1

initial commit

app.py CHANGED
@@ -27,6 +27,7 @@ from src.display.utils import (
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
+from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval


 def restart_space():
@@ -89,6 +90,19 @@ def init_leaderboard(dataframe):
     )


+def run_perplexity_test(model_name, revision, precision):
+    """Run perplexity evaluation on demand."""
+    if not model_name:
+        return "Please enter a model name."
+
+    success, result = run_dynamic_perplexity_eval(model_name, revision, precision)
+
+    if success:
+        return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\nResults have been saved and will appear in the leaderboard shortly."
+    else:
+        return f"❌ Evaluation failed: {result}"
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -188,6 +202,29 @@ with demo:
                 submission_result,
             )

+        with gr.TabItem("🧪 Dynamic Testing", elem_id="dynamic-testing-tab", id=4):
+            gr.Markdown("## Run Perplexity Evaluation")
+
+            with gr.Row():
+                with gr.Column():
+                    dynamic_model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
+                    dynamic_revision = gr.Textbox(label="Revision", placeholder="main", value="main")
+                    dynamic_precision = gr.Dropdown(
+                        choices=["float16", "bfloat16"],
+                        label="Precision",
+                        value="float16"
+                    )
+
+                with gr.Column():
+                    dynamic_test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
+                    dynamic_result = gr.Markdown()
+
+            dynamic_test_button.click(
+                run_perplexity_test,
+                [dynamic_model_name, dynamic_revision, dynamic_precision],
+                dynamic_result
+            )
+
     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
             citation_button = gr.Textbox(
explain.md ADDED
@@ -0,0 +1,292 @@
# Model Trace - Hugging Face Space Explanation

## Overview

This repository hosts a **Hugging Face Space** that creates a dynamic leaderboard for evaluating language models. The space provides a web interface where users can submit models for evaluation and view results in a ranked leaderboard format.

## How It Works

### Architecture

The system consists of several key components:

1. **Frontend Interface** (`app.py`): A Gradio web application with three main tabs:
   - **🏅 LLM Benchmark**: Displays the main leaderboard
   - **📝 About**: Shows information about the evaluation process
   - **🚀 Submit here!**: Allows users to submit models for evaluation

2. **Data Storage**: Uses Hugging Face datasets to store:
   - **Evaluation Requests**: Models waiting to be evaluated
   - **Evaluation Results**: Completed evaluation results

3. **Evaluation Queue System**: Models go through different states:
   - **PENDING**: Submitted but not yet evaluated
   - **RUNNING**: Currently being evaluated
   - **FINISHED**: Evaluation completed

### Data Flow

1. **Model Submission**: Users submit models through the web interface
2. **Validation**: The system checks that the model exists on the Hugging Face Hub and has proper metadata
3. **Queue Management**: Valid models are added to the evaluation queue (a request-record sketch is shown after this list)
4. **Evaluation**: An external evaluation system processes the models (not included in this repo)
5. **Results Display**: Completed evaluations appear in the leaderboard
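
For illustration only, an entry in the requests dataset is a small JSON record per model. The exact schema is defined by the template; the sketch below is indicative, not the verbatim format.

```python
# Indicative shape of a pending evaluation request stored in the requests dataset.
pending_request = {
    "model": "org/model-name",
    "revision": "main",
    "precision": "float16",
    "status": "PENDING",  # later updated to "RUNNING", then "FINISHED"
    "submitted_time": "2024-01-01T00:00:00Z",
}
```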

### Configuration

The main configuration files are:

- **`src/envs.py`**: Repository settings and API tokens (a representative sketch follows this list)
- **`src/about.py`**: Task definitions and leaderboard metadata
- **`src/display/utils.py`**: Column definitions and display settings
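
As a rough sketch (the names come from the imports in `app.py`; the concrete values depend on your deployment), `src/envs.py` ties these settings together:

```python
# Illustrative sketch of src/envs.py, not the verbatim file.
import os

from huggingface_hub import HfApi

TOKEN = os.environ.get("HF_TOKEN")   # write token provided as a Space secret
OWNER = "your-org-name"              # org that owns the Space and datasets
REPO_ID = f"{OWNER}/leaderboard"     # this Space
QUEUE_REPO = f"{OWNER}/requests"     # dataset holding evaluation requests
RESULTS_REPO = f"{OWNER}/results"    # dataset holding evaluation results

EVAL_REQUESTS_PATH = "./eval-queue"    # local copy of the requests dataset
EVAL_RESULTS_PATH = "./eval-results"   # local copy of the results dataset

API = HfApi(token=TOKEN)
```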

## Current Evaluation Tasks

The system is currently configured to evaluate models on:

- **ANLI** (Adversarial NLI) - accuracy metric
- **LogiQA** - normalized accuracy metric

## Adding Dynamic Perplexity Testing

To add perplexity evaluation as a dynamic test, you'll need to make several modifications:

### 1. Update Task Configuration

First, modify `src/about.py` to add perplexity as a new task:

```python
class Tasks(Enum):
    # Existing tasks
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")
    # Add perplexity task
    task2 = Task("perplexity", "perplexity", "Perplexity")
```
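
In the standard leaderboard template, nothing else needs to change for the new task to show up: the display columns are generated by iterating over `Tasks` in `src/display/utils.py`. A sketch of that pattern (assuming the template's layout):

```python
# Illustrative: the template derives its benchmark columns from the Tasks enum,
# so adding task2 above is enough for a "Perplexity" column to appear.
from src.about import Tasks

BENCHMARK_COLS = [task.value.col_name for task in Tasks]
# -> ['ANLI', 'LogiQA', 'Perplexity']
```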

### 2. Create Perplexity Evaluation Script

Create a new file `src/evaluation/perplexity_eval.py`:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

def evaluate_perplexity(model_name, revision="main", test_text=None):
    """
    Evaluate perplexity on a fixed piece of text.

    Args:
        model_name: Hugging Face model identifier
        revision: Model revision/commit hash
        test_text: Text to evaluate perplexity on (default if None)

    Returns:
        float: Perplexity score (lower is better)
    """

    # Default test text if none provided
    if test_text is None:
        test_text = """The quick brown fox jumps over the lazy dog. This is a standard test sentence that contains all the letters of the English alphabet. It is commonly used for testing fonts and keyboards."""

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        revision=revision,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)

    # Tokenize the text
    inputs = tokenizer(test_text, return_tensors="pt")

    # Move to same device as model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Calculate loss
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss

    # Calculate perplexity
    perplexity = torch.exp(loss).item()

    return perplexity

def create_perplexity_result(model_name, revision, precision, perplexity_score):
    """
    Create a result file in the expected format.
    """
    return {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": revision,
        },
        "results": {
            "perplexity": {
                "perplexity": perplexity_score,
            }
        }
    }
```
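
For reference, `outputs.loss` returned by the model is the mean token-level cross-entropy (negative log-likelihood) over the shifted labels, so `torch.exp(loss)` is the standard perplexity:

$$
\mathrm{PPL}(x_{1:N}) = \exp\!\left(-\frac{1}{N}\sum_{i=1}^{N} \log p_\theta\left(x_i \mid x_{<i}\right)\right)
$$

Lower values mean the model assigns higher probability to the test text.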

### 3. Add Dynamic Evaluation Endpoint

Create a new file `src/evaluation/dynamic_eval.py`:

```python
import json
import os
from datetime import datetime
from src.evaluation.perplexity_eval import evaluate_perplexity, create_perplexity_result
from src.envs import EVAL_RESULTS_PATH, API, RESULTS_REPO

def run_dynamic_perplexity_eval(model_name, revision="main", precision="float16"):
    """
    Run perplexity evaluation and save results.
    """
    try:
        # Run evaluation
        perplexity_score = evaluate_perplexity(model_name, revision)

        # Create result structure
        result = create_perplexity_result(model_name, revision, precision, perplexity_score)

        # Save result file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        result_filename = f"results_{model_name.replace('/', '_')}_{timestamp}.json"

        # Create directory structure
        org, model = model_name.split("/") if "/" in model_name else ("", model_name)
        result_dir = os.path.join(EVAL_RESULTS_PATH, org) if org else EVAL_RESULTS_PATH
        os.makedirs(result_dir, exist_ok=True)

        result_path = os.path.join(result_dir, result_filename)

        with open(result_path, "w") as f:
            json.dump(result, f, indent=2)

        # Upload to Hugging Face dataset
        API.upload_file(
            path_or_fileobj=result_path,
            path_in_repo=result_path.split("eval-results/")[1],
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add perplexity results for {model_name}",
        )

        return True, perplexity_score

    except Exception as e:
        return False, str(e)
```

### 4. Add Dynamic Testing Interface

Modify `app.py` to add a new tab for dynamic testing:

```python
# Add this import
from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval


# Add this function
def run_perplexity_test(model_name, revision, precision):
    """Run perplexity evaluation on demand."""
    if not model_name:
        return "Please enter a model name."

    success, result = run_dynamic_perplexity_eval(model_name, revision, precision)

    if success:
        return f"✅ Perplexity evaluation completed!\nPerplexity: {result:.4f}\n\nResults have been saved and will appear in the leaderboard shortly."
    else:
        return f"❌ Evaluation failed: {result}"


# Add this to the demo interface (inside the gr.Blocks)
with gr.TabItem("🧪 Dynamic Testing", elem_id="dynamic-testing-tab", id=4):
    gr.Markdown("## Run Perplexity Evaluation")

    with gr.Row():
        with gr.Column():
            dynamic_model_name = gr.Textbox(label="Model name", placeholder="org/model-name")
            dynamic_revision = gr.Textbox(label="Revision", placeholder="main", value="main")
            dynamic_precision = gr.Dropdown(
                choices=["float16", "bfloat16"],
                label="Precision",
                value="float16"
            )

        with gr.Column():
            dynamic_test_button = gr.Button("🚀 Run Perplexity Test", variant="primary")
            dynamic_result = gr.Markdown()

    dynamic_test_button.click(
        run_perplexity_test,
        [dynamic_model_name, dynamic_revision, dynamic_precision],
        dynamic_result
    )
```

### 5. Update Requirements

Add any additional dependencies to `requirements.txt`:

```txt
# Add if not already present
torch
transformers
accelerate
```

### 6. Configure Environment

Update `src/envs.py` to point to your repositories:

```python
OWNER = "your-org-name"  # Change this
```

You'll need to create two Hugging Face datasets (see the `create_repo` sketch below):
- `your-org-name/requests` - for evaluation requests
- `your-org-name/results` - for evaluation results
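
If you want to create those datasets programmatically, a minimal sketch with `huggingface_hub` (the repo names are the placeholders above):

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # token with write access

# Create the two dataset repos the leaderboard reads from and writes to.
api.create_repo(repo_id="your-org-name/requests", repo_type="dataset", exist_ok=True)
api.create_repo(repo_id="your-org-name/results", repo_type="dataset", exist_ok=True)
```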

## How to Use the Dynamic Testing

1. **Deploy the Space**: Push your changes to a Hugging Face Space (one scripted option is sketched after this list)
2. **Set Environment Variables**: Add `HF_TOKEN` with write permissions
3. **Test Models**: Use the "Dynamic Testing" tab to evaluate models on demand
4. **View Results**: Results will appear in the main leaderboard
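
If you prefer to push from a script rather than `git`, one option (illustrative; the Space id is a placeholder):

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # token with write access to the Space

# Upload the working directory to the Space repository.
api.upload_folder(
    folder_path=".",
    repo_id="your-org-name/leaderboard",
    repo_type="space",
    commit_message="Add dynamic perplexity testing",
)
```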

## Key Features of Dynamic Testing

- **On-Demand Evaluation**: Test models immediately, without waiting in the queue
- **Fixed Text**: Uses a consistent test text for fair comparison
- **Automatic Ranking**: Lower perplexity scores rank higher
- **Real-time Results**: See results immediately after evaluation
- **Integration**: Results automatically appear in the main leaderboard

## Customization Options

You can customize the perplexity evaluation by:

1. **Changing Test Text**: Modify the default text in `perplexity_eval.py`
2. **Adding Multiple Texts**: Evaluate on multiple texts and average the results (a sketch follows this list)
3. **Different Metrics**: Add other metrics like BLEU, ROUGE, etc.
4. **Model Loading Options**: Customize model loading parameters
5. **Batch Processing**: Process multiple models in sequence
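
A minimal sketch of option 2, reusing `evaluate_perplexity` from above; aggregating in log space gives the geometric mean of the per-text perplexities (one reasonable choice, not the only one):

```python
import math

from src.evaluation.perplexity_eval import evaluate_perplexity


def evaluate_perplexity_multi(model_name, texts, revision="main"):
    """Aggregate perplexity over several texts via a geometric mean."""
    # Note: evaluate_perplexity reloads the model on every call; fine for a
    # sketch, wasteful if you evaluate many texts.
    log_ppls = [math.log(evaluate_perplexity(model_name, revision, text)) for text in texts]
    return math.exp(sum(log_ppls) / len(log_ppls))
```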

## Security Considerations

- Models must be public on Hugging Face Hub
- Evaluation runs in the Space's environment
- Results are publicly visible
- Consider rate limiting for dynamic testing (one option is sketched below)
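
One lightweight way to throttle the dynamic test is Gradio's built-in queue (a sketch; parameter names vary between Gradio versions, this assumes 4.x):

```python
# Run at most one perplexity evaluation at a time and cap the waiting queue.
demo.queue(default_concurrency_limit=1, max_size=10)
demo.launch()
```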

This setup provides a complete dynamic testing system that integrates seamlessly with the existing leaderboard infrastructure.

# MODELS TO TEST (a smoke-test loop is sketched below):
- 'openai-community/gpt2'
- 'EleutherAI/gpt-neo-1.3B'
- 'openai-community/gpt2-large'
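
For a quick smoke test of the pipeline with those models (illustrative; run inside the Space environment so that `src.envs` resolves):

```python
from src.evaluation.dynamic_eval import run_dynamic_perplexity_eval

for model in ["openai-community/gpt2", "EleutherAI/gpt-neo-1.3B", "openai-community/gpt2-large"]:
    ok, value = run_dynamic_perplexity_eval(model, revision="main", precision="float16")
    # value is the perplexity on success, or an error message on failure
    print(model, "->", f"{value:.2f}" if ok else f"failed: {value}")
```
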
requirements.txt CHANGED
@@ -11,6 +11,8 @@ numpy
 pandas
 python-dateutil
 tqdm
-transformers
+transformers>=4.30.0
 tokenizers>=0.15.0
-sentencepiece
+sentencepiece
+torch>=2.0.0
+accelerate>=0.20.0
src/about.py CHANGED
@@ -14,6 +14,7 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("anli_r1", "acc", "ANLI")
     task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task2 = Task("perplexity", "perplexity", "Perplexity")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
src/evaluation/dynamic_eval.py ADDED
@@ -0,0 +1,44 @@
```python
import json
import os
from datetime import datetime
from src.evaluation.perplexity_eval import evaluate_perplexity, create_perplexity_result
from src.envs import EVAL_RESULTS_PATH, API, RESULTS_REPO

def run_dynamic_perplexity_eval(model_name, revision="main", precision="float16"):
    """
    Run perplexity evaluation and save results.
    """
    try:
        # Run evaluation
        perplexity_score = evaluate_perplexity(model_name, revision)

        # Create result structure
        result = create_perplexity_result(model_name, revision, precision, perplexity_score)

        # Save result file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        result_filename = f"results_{model_name.replace('/', '_')}_{timestamp}.json"

        # Create directory structure
        org, model = model_name.split("/") if "/" in model_name else ("", model_name)
        result_dir = os.path.join(EVAL_RESULTS_PATH, org) if org else EVAL_RESULTS_PATH
        os.makedirs(result_dir, exist_ok=True)

        result_path = os.path.join(result_dir, result_filename)

        with open(result_path, "w") as f:
            json.dump(result, f, indent=2)

        # Upload to Hugging Face dataset
        API.upload_file(
            path_or_fileobj=result_path,
            path_in_repo=result_path.split("eval-results/")[1],
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            commit_message=f"Add perplexity results for {model_name}",
        )

        return True, perplexity_score

    except Exception as e:
        return False, str(e)
```
src/evaluation/perplexity_eval.py ADDED
@@ -0,0 +1,66 @@
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

def evaluate_perplexity(model_name, revision="main", test_text=None):
    """
    Evaluate perplexity on a fixed piece of text.

    Args:
        model_name: Hugging Face model identifier
        revision: Model revision/commit hash
        test_text: Text to evaluate perplexity on (default if None)

    Returns:
        float: Perplexity score (lower is better)
    """

    # Default test text if none provided
    if test_text is None:
        test_text = """Artificial intelligence has transformed the way we live and work, bringing both opportunities and challenges.
        From autonomous vehicles to language models that can engage in human-like conversation, AI technologies are becoming increasingly
        sophisticated. However, with this advancement comes the responsibility to ensure these systems are developed and deployed ethically,
        with careful consideration for privacy, fairness, and transparency. The future of AI will likely depend on how well we balance innovation
        with these important social considerations."""

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        revision=revision,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, revision=revision)

    # Tokenize the text
    inputs = tokenizer(test_text, return_tensors="pt")

    # Move to same device as model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Calculate loss
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss

    # Calculate perplexity
    perplexity = torch.exp(loss).item()

    return perplexity

def create_perplexity_result(model_name, revision, precision, perplexity_score):
    """
    Create a result file in the expected format.
    """
    return {
        "config": {
            "model_dtype": f"torch.{precision}",
            "model_name": model_name,
            "model_sha": revision,
        },
        "results": {
            "perplexity": {
                "perplexity": perplexity_score,
            }
        }
    }
```