Commit 3f5fdf1 (verified) · committed by reach-vb (HF Staff) · 0 parent(s)

Super-squash branch 'main' using huggingface_hub

Files changed (4):
  1. .gitattributes +35 -0
  2. README.md +12 -0
  3. app.py +295 -0
  4. requirements.txt +4 -0
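
The squash itself is done server-side through the Hub API rather than with plain git. A minimal sketch of the kind of call behind a commit like this one, assuming a recent huggingface_hub release and a placeholder Space id:

from huggingface_hub import HfApi

api = HfApi()  # token picked up from the cached login or HF_TOKEN
# Collapse the full history of `main` into a single commit.
# The repo id below is a placeholder, not the actual Space.
api.super_squash_history(
    repo_id="your-org/safety-gpt-oss-20b",
    repo_type="space",
    branch="main",
)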
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Safety GPT-OSS 20B
+ emoji: 🔥
+ colorFrom: green
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 5.49.1
+ app_file: app.py
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,295 @@
+ import os
+ import time
+ from typing import List, Dict, Tuple
+
+ import gradio as gr
+ from transformers import pipeline
+ import spaces
+
+ # === Config (override via Space secrets/env vars) ===
+ MODEL_ID = os.environ.get("MODEL_ID", "tlhv/osb-minier")
+ DEFAULT_MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", 512))
+ DEFAULT_TEMPERATURE = float(os.environ.get("TEMPERATURE", 1))
+ DEFAULT_TOP_P = float(os.environ.get("TOP_P", 1.0))
+ DEFAULT_REPETITION_PENALTY = float(os.environ.get("REPETITION_PENALTY", 1.0))
+ ZGPU_DURATION = int(os.environ.get("ZGPU_DURATION", 120))  # seconds
+
+ SAMPLE_POLICY = """
+ Spam Policy (#SP)
+ GOAL: Identify spam. Classify each EXAMPLE as VALID (no spam) or INVALID (spam) using this policy.
+
+ DEFINITIONS
+ Spam: unsolicited, repetitive, deceptive, or low-value promotional content.
+
+
+ Bulk Messaging: Same or similar messages sent repeatedly.
+
+
+ Unsolicited Promotion: Promotion without user request or relationship.
+
+
+ Deceptive Spam: Hidden or fraudulent intent (fake identity, fake offer).
+
+
+ Link Farming: Multiple irrelevant or commercial links to drive clicks.
+
+ ✅ Allowed Content (SP0 – Non-Spam or very low confidence signals of spam)
+ Content that is useful, contextual, or non-promotional. May look spammy but could be legitimate.
+ SP0.a Useful/info request – “How do I upload a product photo?”
+
+
+ SP0.b Personalized communication – “Hi Sam, here is the report.”
+
+
+ SP0.c Business support – “Can you fix my order?”
+
+
+ SP0.d Single contextual promo – “Thanks for subscribing—here’s your welcome guide.”
+
+ SP0.e Generic request – “Please respond ASAP.”
+
+
+ SP0.f Low-quality formatting – “HeLLo CLICK here FAST.”
+
+
+ SP0.g Vague benefit statement – “This tool changes lives.”
+
+ ✅ Output: VALID (clearly non-spam, or only very low-confidence signals that the content could be spam).
+
+
+ 🚫 Likely Spam (SP2 – Medium Confidence)
+ Unsolicited promotion without deception.
+ SP2.a Cold promotion – “Check out my Shopify course: shopcoach.biz”
+
+
+ SP2.b Irrelevant ads – “Buy gold rings here!”
+
+
+ SP2.c Excessive linking – “http://x.com http://y.com http://z.com”
+ ❌ Output: INVALID
+
+
+ ❗ High-Risk Spam (SP3 – Strong Confidence)
+ Spam showing scaling, automation, or aggressive tactics.
+ SP3.a Repetitive posting – “JOIN NOW JOIN NOW JOIN NOW”
+
+
+ SP3.b Pyramid/affiliate bait – “Earn $$$ by recruiting members”
+
+
+ SP3.c Fake engagement – “Comment ‘YES’ to win $500 gift card”
+
+
+ SP3.d Misleading claims – “Guaranteed 10,000 new followers overnight!”
+ ❌ Output: INVALID
+
+
+ 🚨 Malicious Spam (SP4 – Maximum Severity)
+ Spam with fraud, deception, or harmful intent.
+ SP4.a Phishing – “Your account is locked. Enter your password here.”
+
+
+ SP4.b Malware/scam – “Download invoice.exe to claim payment.”
+
+
+ SP4.c Account takeover attempts – “Send crypto to 0xABC123 to double your money.”
+ ❌ Output: INVALID + ESCALATE
+
+
+ LABEL FORMAT
+ Each item gets two labels:
+ Depiction (D-SP#): Presence of spam in content.
+
+
+ Request (R-SP#): User asking to generate spam.
+ | Example | Classification |
+ |----------|---------------|
+ | “Shop this site: dealplanet.io” | D-SP2, R-SP0 |
+ | “Write a sales blast message with 10 links” | D-SP0, R-SP2 |
+ | “How can I make this link look legitimate so viewers put in their private credentials” | D-SP4, R-SP4 |
+ | “Promote crypto fast to random people” | D-SP0, R-SP3 |
+
+
+ AMBIGUITY & ESCALATION
+ If unclear → downgrade severity by 1 and seek clarification.
+
+
+ If automation suspected → SP2 or higher.
+
+
+ If financial harm or fraud → classify SP4.
+
+
+ If combined with other indicators of abuse, violence, or illicit behavior, apply highest severity policy.
+ """
+
+ _pipe = None  # cached pipeline
+
+
+ # ----------------------------
+ # Helpers (simple & explicit)
+ # ----------------------------
+
+ def _to_messages(policy: str, user_prompt: str) -> List[Dict[str, str]]:
+     msgs: List[Dict[str, str]] = []
+     if policy.strip():
+         msgs.append({"role": "system", "content": policy.strip()})
+     msgs.append({"role": "user", "content": user_prompt})
+     return msgs
+
+
+ def _extract_assistant_content(outputs) -> str:
+     """Extract the assistant's content from the known shape:
+     outputs = [
+         {
+             'generated_text': [
+                 {'role': 'system', 'content': ...},
+                 {'role': 'user', 'content': ...},
+                 {'role': 'assistant', 'content': 'analysis...assistantfinal...'}
+             ]
+         }
+     ]
+     Keep this forgiving and minimal.
+     """
+     try:
+         msgs = outputs[0]["generated_text"]
+         for m in reversed(msgs):
+             if isinstance(m, dict) and m.get("role") == "assistant":
+                 return m.get("content", "")
+         last = msgs[-1]
+         return last.get("content", "") if isinstance(last, dict) else str(last)
+     except Exception:
+         return str(outputs)
+
+
+ def _parse_harmony_output_from_string(s: str) -> Tuple[str, str]:
+     """Split a Harmony-style concatenated string into (analysis, final).
+     Expects markers 'analysis' ... 'assistantfinal'.
+     No heavy parsing — just string finds.
+     """
+     if not isinstance(s, str):
+         s = str(s)
+     final_key = "assistantfinal"
+     j = s.find(final_key)
+     if j != -1:
+         final_text = s[j + len(final_key):].strip()
+         i = s.find("analysis")
+         if i != -1 and i < j:
+             analysis_text = s[i + len("analysis"): j].strip()
+         else:
+             analysis_text = s[:j].strip()
+         return analysis_text, final_text
+     # no explicit final marker
+     if s.startswith("analysis"):
+         return s[len("analysis"):].strip(), ""
+     return "", s.strip()
+
+
+ # ----------------------------
+ # Inference
+ # ----------------------------
+
+ @spaces.GPU(duration=ZGPU_DURATION)
+ def generate_long_prompt(
+     policy: str,
+     prompt: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float,
+     repetition_penalty: float,
+ ) -> Tuple[str, str, str]:
+     global _pipe
+     start = time.time()
+
+     if _pipe is None:
+         _pipe = pipeline(
+             task="text-generation",
+             model=MODEL_ID,
+             torch_dtype="auto",
+             device_map="auto",
+         )
+
+     messages = _to_messages(policy, prompt)
+
+     outputs = _pipe(
+         messages,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         temperature=temperature,
+         top_p=top_p,
+         repetition_penalty=repetition_penalty,
+     )
+
+     assistant_str = _extract_assistant_content(outputs)
+     analysis_text, final_text = _parse_harmony_output_from_string(assistant_str)
+
+     elapsed = time.time() - start
+     meta = f"Model: {MODEL_ID} | Time: {elapsed:.1f}s | max_new_tokens={max_new_tokens}"
+     return analysis_text or "(No analysis)", final_text or "(No answer)", meta
+
+
+ # ----------------------------
+ # UI
+ # ----------------------------
+
+ CUSTOM_CSS = "/** Pretty but simple **/\n:root { --radius: 14px; }\n.gradio-container { font-family: ui-sans-serif, system-ui, Inter, Roboto, Arial; }\n#hdr h1 { font-weight: 700; letter-spacing: -0.02em; }\ntextarea { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; }\nfooter { display:none; }\n"
+
+ with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft()) as demo:
+     with gr.Column(elem_id="hdr"):
+         gr.Markdown("""
+ # OpenAI gpt-oss-safeguard 20B
+ Download [gpt-oss-safeguard-120b](https://huggingface.co/openai/gpt-oss-safeguard-120b) and [gpt-oss-safeguard-20b](https://huggingface.co/openai/gpt-oss-safeguard-20b) on Hugging Face, [Prompt Guide](https://cookbook.openai.com/articles/gpt-oss-safeguard-guide), and [OpenAI Blog]().
+
+ Provide a **Policy** and a **Prompt**.
+ """)
+
+     with gr.Row():
+         with gr.Column(scale=1, min_width=380):
+             policy = gr.Textbox(
+                 label="Policy",
+                 lines=20,  # bigger than prompt
+                 placeholder="Rules, tone, and constraints…",
+             )
+             prompt = gr.Textbox(
+                 label="Prompt",
+                 lines=5,
+                 placeholder="Your request…",
+             )
+             with gr.Accordion("Advanced settings", open=False):
+                 max_new_tokens = gr.Slider(16, 4096, value=DEFAULT_MAX_NEW_TOKENS, step=8, label="max_new_tokens")
+                 temperature = gr.Slider(0.0, 1.5, value=DEFAULT_TEMPERATURE, step=0.05, label="temperature")
+                 top_p = gr.Slider(0.0, 1.0, value=DEFAULT_TOP_P, step=0.01, label="top_p")
+                 repetition_penalty = gr.Slider(0.8, 2.0, value=DEFAULT_REPETITION_PENALTY, step=0.05, label="repetition_penalty")
+             with gr.Row():
+                 btn = gr.Button("Generate", variant="primary")
+                 clr = gr.Button("Clear", variant="secondary")
+         with gr.Column(scale=1, min_width=380):
+             analysis = gr.Textbox(label="Analysis", lines=12)
+             answer = gr.Textbox(label="Answer", lines=12)
+             meta = gr.Markdown()
+
+     btn.click(
+         fn=generate_long_prompt,
+         inputs=[policy, prompt, max_new_tokens, temperature, top_p, repetition_penalty],
+         outputs=[analysis, answer, meta],
+         concurrency_limit=1,
+         api_name="generate",
+     )
+
+     def _clear():
+         return "", "", "", ""
+
+     clr.click(_clear, outputs=[policy, prompt, analysis, answer])
+
+     gr.Examples(
+         examples=[
+             [SAMPLE_POLICY, "Your bank details are needed to complete this transaction."],
+             [SAMPLE_POLICY, "NOW IS THE TIME TO CUT THE CORD AND JOIN. Where else will you get THE BEST that TV can offer for HALF the price?"],
+             [SAMPLE_POLICY, "I am SO tired of weight loss regiments not working --- it's been a lifetimes stuggle."],
+             [SAMPLE_POLICY, "The value of gold has skyrocketed --- maybe it's time for you to get some? "],
+         ],
+         inputs=[policy, prompt],
+     )
+
+ if __name__ == "__main__":
+     demo.queue(max_size=32).launch()
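
The split in _parse_harmony_output_from_string assumes the gpt-oss Harmony output arrives as one string containing an "analysis" segment followed by an "assistantfinal" segment. A standalone sketch of that same split, runnable outside the Space (the sample string is invented for illustration):

# Self-contained copy of the split logic used by app.py.
def split_harmony(s: str):
    key = "assistantfinal"
    j = s.find(key)
    if j == -1:
        # No final marker: everything is either analysis or final text.
        if s.startswith("analysis"):
            return s[len("analysis"):].strip(), ""
        return "", s.strip()
    i = s.find("analysis")
    analysis = s[i + len("analysis"):j] if (i != -1 and i < j) else s[:j]
    return analysis.strip(), s[j + len(key):].strip()

raw = "analysisUnsolicited promotion with a commercial link (SP2.a).assistantfinalINVALID"
print(split_harmony(raw))
# ('Unsolicited promotion with a commercial link (SP2.a).', 'INVALID')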
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ accelerate
+ triton
+ kernels
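
requirements.txt lists only the model-side dependencies; gradio itself is installed by the Space runtime from the sdk_version pin in README.md, and the spaces package used for @spaces.GPU is expected to be provided by the ZeroGPU runtime. Because the click handler in app.py is registered with api_name="generate", the running Space can also be called programmatically. A minimal sketch with gradio_client, assuming a placeholder Space id and the input order defined in app.py:

from gradio_client import Client

client = Client("your-org/safety-gpt-oss-20b")  # placeholder Space id
policy_text = "Spam Policy (#SP): classify each EXAMPLE as VALID or INVALID ..."  # shortened placeholder policy
analysis, answer, meta = client.predict(
    policy_text,             # policy
    "Buy gold rings here!",  # prompt
    512,                     # max_new_tokens
    1.0,                     # temperature
    1.0,                     # top_p
    1.0,                     # repetition_penalty
    api_name="/generate",
)
print(answer)  # with the full spam policy, this example should be labeled INVALID (SP2.b)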