tasal9 committed

Commit 2e0bc05 · 1 Parent(s): a7bc86c

feat: add smoke_test and generation metrics (latency/token counts)

Files changed (2):
  1. app.py +36 -2
  2. smoke_test.py +21 -0
app.py CHANGED

@@ -11,6 +11,7 @@ from transformers.pipelines import pipeline
 from transformers import AutoTokenizer
 import torch
 import importlib
+import time
 
 
 # ---------------- Configuration ----------------
@@ -44,6 +45,15 @@ LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
 logging.basicConfig(level=LOG_LEVEL, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger("zamai-app")
 
+# Metrics storage for last real generation
+LAST_METRICS = {
+    "latency_sec": None,
+    "input_tokens": None,
+    "output_tokens": None,
+    "num_sequences": None,
+    "mode": None,
+}
+
 
 # ---------------- Utilities ----------------
 SAMPLE_INSTRUCTIONS = [
@@ -165,6 +175,7 @@ def predict(instruction: str,
 
     allowed_keys = {"max_new_tokens", "num_beams", "do_sample", "temperature", "top_p", "num_return_sequences"}
 
+    start = time.time()
     try:
         gen = get_generator()
         raw_kwargs = {
@@ -189,9 +200,28 @@ def predict(instruction: str,
             if text:
                 texts.append(text)
         if not texts:
+            LAST_METRICS.update({
+                "latency_sec": round(time.time() - start, 3),
+                "input_tokens": None,
+                "output_tokens": 0,
+                "num_sequences": 0,
+                "mode": active_mode,
+            })
             return f"### Prompt\n\n````\n{prompt}\n````\n\n### Output\n\n⚠️ No response generated."
         joined = "\n\n---\n\n".join(texts)
-        return f"### Prompt\n\n````\n{prompt}\n````\n\n### Output\n\n{joined}"
+
+        # Basic token counting via whitespace split (approximate)
+        input_tokens = len(prompt.split())
+        output_tokens = sum(len(t.split()) for t in texts)
+        LAST_METRICS.update({
+            "latency_sec": round(time.time() - start, 3),
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "num_sequences": len(texts),
+            "mode": active_mode,
+        })
+        metrics_md = f"\n\n### Metrics\n- Latency: {LAST_METRICS['latency_sec']}s\n- Input tokens (approx): {input_tokens}\n- Output tokens (approx): {output_tokens}\n- Sequences: {len(texts)}"
+        return f"### Prompt\n\n````\n{prompt}\n````\n\n### Output\n\n{joined}{metrics_md}"
     except Exception as e:
         logger.exception("Generation failed: %s", e)
         return f"⚠️ Generation failed: {e}"
@@ -245,7 +275,11 @@ def build_ui():
     instruction_dropdown.change(lambda x: x, inputs=instruction_dropdown, outputs=instruction_textbox)
 
     def refresh():
-        return f"**Device:** {'GPU' if _detect_device() != -1 else 'CPU'} | **Offline:** {os.getenv('HF_HUB_OFFLINE','0')} | **Env Mode:** {ECHO_MODE}"
+        base = f"**Device:** {'GPU' if _detect_device() != -1 else 'CPU'} | **Offline:** {os.getenv('HF_HUB_OFFLINE','0')} | **Env Mode:** {ECHO_MODE}"
+        if LAST_METRICS.get('latency_sec') is not None:
+            base += (f"<br>**Last Gen:** latency={LAST_METRICS['latency_sec']}s, "
+                     f"in≈{LAST_METRICS['input_tokens']}, out≈{LAST_METRICS['output_tokens']}, seqs={LAST_METRICS['num_sequences']}")
+        return base
 
     refresh_status.click(fn=refresh, inputs=None, outputs=status_box)
 
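The commit counts tokens with whitespace splits, which works in every mode without loading a tokenizer but only approximates real token usage. Below is a minimal sketch of an exact count using the `AutoTokenizer` that app.py already imports; `MODEL_ID` and `count_tokens` are placeholders for illustration, not names from this repo.

```python
# Sketch only: exact token counts via the model's tokenizer instead of str.split().
from transformers import AutoTokenizer

MODEL_ID = "your-org/your-model"  # hypothetical id; substitute the model app.py actually loads

def count_tokens(tokenizer, text: str) -> int:
    # add_special_tokens=False so the count covers only the text itself
    return len(tokenizer(text, add_special_tokens=False)["input_ids"])

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained(MODEL_ID)
    print(count_tokens(tok, "Translate to Pashto: hello"))
```

The exact count is mainly useful when comparing output length against `max_new_tokens`; the whitespace approximation needs no tokenizer at all, which keeps the echo and useless modes lightweight.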
smoke_test.py ADDED
@@ -0,0 +1,21 @@
+import time
+from app import predict
+
+# Basic smoke tests for each mode.
+# Note: Real mode will load the model weights; keep max_new_tokens small.
+
+def run():
+    instruction = "ازموینه"  # Pashto for "test"
+    print("=== Echo Mode ===")
+    print(predict(instruction, "", 8, 2, True, 1.0, 0.9, 1, "echo"))
+    print("\n=== Useless Mode ===")
+    print(predict(instruction, "", 8, 2, True, 1.0, 0.9, 1, "useless"))
+    print("\n=== Real Mode (off) ===")
+    t0 = time.time()
+    out = predict(instruction, "", 8, 2, True, 1.0, 0.9, 1, "off")
+    dt = time.time() - t0
+    print(out)
+    print(f"\n[Latency real mode: {dt:.2f}s]")
+
+if __name__ == "__main__":
+    run()
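The smoke test runs all three modes with `python smoke_test.py` and prints the raw Markdown returned by `predict`. For CI it may be more convenient to assert on the output instead of reading it; here is a minimal sketch, assuming `predict` keeps the signature above and the `### Output` / ⚠️ markers used in app.py (the test name is illustrative).

```python
# Sketch: assertion-based variant of the echo-mode smoke test.
# Assumes predict() returns Markdown containing "### Output" on success and
# uses the "⚠️" marker only for errors or empty generations, as in app.py above.
from app import predict

def test_echo_mode_produces_output():
    result = predict("ازموینه", "", 8, 2, True, 1.0, 0.9, 1, "echo")
    assert "### Output" in result
    assert "⚠️" not in result

if __name__ == "__main__":
    test_echo_mode_produces_output()
    print("echo-mode smoke test passed")
```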