Commit
·
5a92002
1
Parent(s):
bab6cf5
add highlight text
Browse files- app.py +104 -9
- checker.py +32 -8
app.py
CHANGED
@@ -1,9 +1,7 @@
|
|
1 |
-
|
2 |
-
# app.py
|
3 |
-
|
4 |
import os
|
5 |
import json
|
6 |
import importlib
|
|
|
7 |
import gradio as gr
|
8 |
|
9 |
import checker
|
@@ -11,10 +9,61 @@ import checker
|
|
11 |
APP_TITLE = "Fair Housing Image + Text Compliance Checker"
|
12 |
APP_DESC = (
|
13 |
"Upload an image (flyer) and/or paste text. The tool OCRs the image (if provided), "
|
14 |
-
"
|
15 |
-
"
|
16 |
)
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def _parse_company_phones(s: str):
|
19 |
if not s:
|
20 |
return checker.COMPANY_PHONES_DEFAULT
|
@@ -27,7 +76,6 @@ def _parse_company_phones(s: str):
|
|
27 |
parts = [p.strip() for p in s.replace("\n", ",").split(",") if p.strip()]
|
28 |
return parts or checker.COMPANY_PHONES_DEFAULT
|
29 |
|
30 |
-
|
31 |
def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
|
32 |
req_disc_non_social: bool, phrases_path: str):
|
33 |
need_reload = False
|
@@ -51,11 +99,24 @@ def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
|
|
51 |
importlib.reload(checker)
|
52 |
return need_reload
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
def on_run(image, ptxt, social, agent_name, agent_phone,
|
56 |
company_name, company_phones_json, disclaimer,
|
57 |
enable_ml, hf_repo, hf_thresh,
|
58 |
req_disc_non_social, phrases_path):
|
|
|
59 |
reloaded = _ensure_checker_reloaded(
|
60 |
enable_ml=bool(enable_ml),
|
61 |
hf_repo=(hf_repo or checker.HF_REPO).strip(),
|
@@ -63,9 +124,11 @@ def on_run(image, ptxt, social, agent_name, agent_phone,
|
|
63 |
req_disc_non_social=bool(req_disc_non_social),
|
64 |
phrases_path=(phrases_path or str(checker.PHRASES_PATH)).strip(),
|
65 |
)
|
|
|
66 |
company_name = (company_name or checker.COMPANY_NAME_DEFAULT).strip()
|
67 |
company_phones = _parse_company_phones(company_phones_json)
|
68 |
disclaimer = (disclaimer or checker.DISCLAIMER_DEFAULT).strip()
|
|
|
69 |
results = checker.run_check(
|
70 |
image=image,
|
71 |
ptxt=ptxt or "",
|
@@ -77,20 +140,41 @@ def on_run(image, ptxt, social, agent_name, agent_phone,
|
|
77 |
disclaimer=disclaimer,
|
78 |
require_disclaimer_on_non_social=None,
|
79 |
)
|
|
|
80 |
fh_ok = results.get("Fair_Housing", {}).get("compliant", True)
|
81 |
img_ok = results.get("img", {}).get("compliant", True)
|
82 |
ptxt_ok = results.get("Ptxt", {}).get("compliant", True)
|
|
|
83 |
summary = []
|
84 |
summary.append(f"Fair Housing (rules+ML): {'OK' if fh_ok else 'Needs review'}")
|
85 |
summary.append(f"Image text balance/disclaimer: {'OK' if img_ok else 'Needs review'}")
|
86 |
summary.append(f"Post text balance/disclaimer: {'OK' if ptxt_ok else 'Needs review'}")
|
|
|
87 |
diag = results.get("Diagnostics", {})
|
88 |
badge = (
|
89 |
f"Tiny ML: {diag.get('USE_TINY_ML')} | Repo: {diag.get('HF_REPO')} | "
|
90 |
f"Thresh: {diag.get('HF_THRESH')} | Phrases: {diag.get('PhrasesLoaded')} | "
|
91 |
f"DisclaimerOnNonSocial: {diag.get('DisclaimerRequiredOnNonSocial')}"
|
92 |
)
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
with gr.Blocks(title=APP_TITLE) as demo:
|
96 |
gr.Markdown(f"# {APP_TITLE}\n{APP_DESC}")
|
@@ -129,6 +213,17 @@ with gr.Blocks(title=APP_TITLE) as demo:
|
|
129 |
gr.Markdown(
|
130 |
"If you change any of these, the backend module will hot‑reload. On Spaces, ensure requirements include transformers + pyyaml."
|
131 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
132 |
run_btn = gr.Button("Run Compliance Check", variant="primary")
|
133 |
run_btn.click(
|
134 |
fn=on_run,
|
@@ -137,9 +232,9 @@ with gr.Blocks(title=APP_TITLE) as demo:
|
|
137 |
company_name, company_phones_json, disclaimer,
|
138 |
enable_ml, hf_repo, hf_thresh, req_disc_non_social, phrases_path,
|
139 |
],
|
140 |
-
outputs=[summary_out, results_json, diag_badge, reloaded_flag],
|
141 |
)
|
142 |
|
143 |
if __name__ == "__main__":
|
144 |
-
demo.launch()
|
145 |
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import json
|
3 |
import importlib
|
4 |
+
import html
|
5 |
import gradio as gr
|
6 |
|
7 |
import checker
|
|
|
9 |
# Title shown in the browser tab and as the page heading.
APP_TITLE = "Fair Housing Image + Text Compliance Checker"

# Short description rendered under the heading; sentences are joined with a
# single space.
APP_DESC = " ".join(
    [
        "Upload an image (flyer) and/or paste text. The tool OCRs the image (if provided),",
        "highlights potential Fair Housing risks, and verifies brand/agent balance and disclaimer requirements.",
        "This is not legal advice.",
    ]
)
|
15 |
|
16 |
+
# Base highlight color (hex RGB) for each rule category. Callers fall back to
# a neutral grey ("#bdbdbd") for categories that are not listed here.
CATEGORY_COLORS = dict(
    [
        ("Familial status", "#e57373"),
        ("Religion", "#64b5f6"),
        ("Disability", "#81c784"),
        ("Sex", "#ba68c8"),
        ("Race or color", "#4db6ac"),
        ("National origin", "#ffd54f"),
        ("Other preference", "#90a4ae"),
    ]
)
|
25 |
+
|
26 |
+
# Inline stylesheet prepended to every highlighted-text HTML fragment.
# One CSS rule per line; the value starts and ends with a newline.
STYLE_BLOCK = (
    "\n<style>\n"
    ".mark { padding: 0.1em 0.25em; border-radius: 0.25rem; }\n"
    ".badge { display: inline-block; padding: 0 0.35em; border-radius: 0.4rem; font-size: 0.8em; margin-left: 0.3em; opacity: 0.9; }\n"
    ".legend { display:flex; flex-wrap:wrap; gap:8px; margin: 0.5rem 0 1rem; }\n"
    ".legend .swatch { width: 12px; height: 12px; border-radius: 3px; display:inline-block; margin-right:6px; }\n"
    ".hl-container { background: #ffffff; color: #000000; padding: 12px; border-radius: 8px; line-height: 1.7; border: 1px solid #eee; }\n"
    ".notice { margin-top: 10px; padding: 8px 10px; border-radius: 8px; background: #ffcccb; }\n"
    "</style>\n"
)
|
36 |
+
|
37 |
+
def _build_legend(categories: set[str]) -> str:
|
38 |
+
parts = ["<div class='legend'>"]
|
39 |
+
for cat in sorted(categories):
|
40 |
+
color = CATEGORY_COLORS.get(cat, "#bdbdbd")
|
41 |
+
parts.append(f"<span><span class='swatch' style='background:{color}'></span>{html.escape(cat)}</span>")
|
42 |
+
parts.append("</div>")
|
43 |
+
return "".join(parts)
|
44 |
+
|
45 |
+
def _highlight_html(text: str, spans: list[tuple[int,int,str]], cats: set[str]) -> str:
    """Render *text* as HTML with each matched span wrapped in a colored mark.

    Args:
        text: Text the span offsets refer to; a falsy value is treated as "".
        spans: ``(start, end, category)`` triples indexing into *text*.
        cats: Category names to show in the legend above the text.

    Returns:
        STYLE_BLOCK followed by an ``hl-container`` div holding the
        HTML-escaped text. When *spans* is non-empty a legend is emitted
        first and every span is wrapped in a ``mark`` element carrying a
        category badge.
    """
    text = text or ""
    if not spans:
        return STYLE_BLOCK + f"<div class='hl-container'>{html.escape(text)}</div>"

    limit = len(text)
    cur = 0
    out = [STYLE_BLOCK, _build_legend(cats), "<div class='hl-container'>"]
    # Earliest start first; on a tie the longest span wins so a shorter match
    # at the same position cannot hide it.
    for s, e, cat in sorted(spans, key=lambda sp: (sp[0], -sp[1])):
        # Different rules can match overlapping text. Clamp each span to the
        # part not yet emitted (and to the text bounds) so no character is
        # rendered twice and the cursor never moves backwards.
        s = max(s, cur)
        e = min(e, limit)
        if e <= s:
            continue
        if s > cur:
            out.append(html.escape(text[cur:s]))
        frag = html.escape(text[s:e])
        color = CATEGORY_COLORS.get(cat, "#bdbdbd")
        # The two-digit suffixes are alpha channels appended to the hex color.
        out.append(
            f"<span class='mark' style='background:{color}1A; outline: 1px solid {color}55'>"
            f"{frag}<span class='badge' style='background:{color}33'>{html.escape(cat)}</span></span>"
        )
        cur = e
    if cur < limit:
        out.append(html.escape(text[cur:]))
    out.append("</div>")
    return "".join(out)
|
66 |
+
|
67 |
def _parse_company_phones(s: str):
|
68 |
if not s:
|
69 |
return checker.COMPANY_PHONES_DEFAULT
|
|
|
76 |
parts = [p.strip() for p in s.replace("\n", ",").split(",") if p.strip()]
|
77 |
return parts or checker.COMPANY_PHONES_DEFAULT
|
78 |
|
|
|
79 |
def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
|
80 |
req_disc_non_social: bool, phrases_path: str):
|
81 |
need_reload = False
|
|
|
99 |
importlib.reload(checker)
|
100 |
return need_reload
|
101 |
|
102 |
+
def _build_report(findings: list[dict]) -> str:
|
103 |
+
if not findings:
|
104 |
+
return "No obvious risk phrases found by the rules engine."
|
105 |
+
rows = []
|
106 |
+
for f in findings:
|
107 |
+
sug = ", ".join(f.get("suggestions") or []) if f.get("suggestions") else "N/A"
|
108 |
+
rows.append(
|
109 |
+
f"- **{f['category']}** → “{f['match']}”\n"
|
110 |
+
f" \n _Context_: …{f['context']}…\n"
|
111 |
+
f" \n _Suggestions_: {sug}\n"
|
112 |
+
)
|
113 |
+
return "### Potential issues\n" + "\n".join(rows)
|
114 |
|
115 |
def on_run(image, ptxt, social, agent_name, agent_phone,
|
116 |
company_name, company_phones_json, disclaimer,
|
117 |
enable_ml, hf_repo, hf_thresh,
|
118 |
req_disc_non_social, phrases_path):
|
119 |
+
|
120 |
reloaded = _ensure_checker_reloaded(
|
121 |
enable_ml=bool(enable_ml),
|
122 |
hf_repo=(hf_repo or checker.HF_REPO).strip(),
|
|
|
124 |
req_disc_non_social=bool(req_disc_non_social),
|
125 |
phrases_path=(phrases_path or str(checker.PHRASES_PATH)).strip(),
|
126 |
)
|
127 |
+
|
128 |
company_name = (company_name or checker.COMPANY_NAME_DEFAULT).strip()
|
129 |
company_phones = _parse_company_phones(company_phones_json)
|
130 |
disclaimer = (disclaimer or checker.DISCLAIMER_DEFAULT).strip()
|
131 |
+
|
132 |
results = checker.run_check(
|
133 |
image=image,
|
134 |
ptxt=ptxt or "",
|
|
|
140 |
disclaimer=disclaimer,
|
141 |
require_disclaimer_on_non_social=None,
|
142 |
)
|
143 |
+
|
144 |
fh_ok = results.get("Fair_Housing", {}).get("compliant", True)
|
145 |
img_ok = results.get("img", {}).get("compliant", True)
|
146 |
ptxt_ok = results.get("Ptxt", {}).get("compliant", True)
|
147 |
+
|
148 |
summary = []
|
149 |
summary.append(f"Fair Housing (rules+ML): {'OK' if fh_ok else 'Needs review'}")
|
150 |
summary.append(f"Image text balance/disclaimer: {'OK' if img_ok else 'Needs review'}")
|
151 |
summary.append(f"Post text balance/disclaimer: {'OK' if ptxt_ok else 'Needs review'}")
|
152 |
+
|
153 |
diag = results.get("Diagnostics", {})
|
154 |
badge = (
|
155 |
f"Tiny ML: {diag.get('USE_TINY_ML')} | Repo: {diag.get('HF_REPO')} | "
|
156 |
f"Thresh: {diag.get('HF_THRESH')} | Phrases: {diag.get('PhrasesLoaded')} | "
|
157 |
f"DisclaimerOnNonSocial: {diag.get('DisclaimerRequiredOnNonSocial')}"
|
158 |
)
|
159 |
+
|
160 |
+
rm = results.get("RuleMatches", {})
|
161 |
+
ptxt_findings = (rm.get("ptxt") or {}).get("findings") or []
|
162 |
+
ptxt_spans = (rm.get("ptxt") or {}).get("spans") or []
|
163 |
+
ptxt_cats = {f["category"] for f in ptxt_findings}
|
164 |
+
marked_html_ptxt = _highlight_html(ptxt or "", ptxt_spans, ptxt_cats)
|
165 |
+
report_ptxt = _build_report(ptxt_findings)
|
166 |
+
|
167 |
+
img_findings = (rm.get("img") or {}).get("findings") or []
|
168 |
+
img_spans = (rm.get("img") or {}).get("spans") or []
|
169 |
+
# The span offsets in RuleMatches["img"] index into the OCR text of the image, so that text is needed here to render the highlights.
|
170 |
+
# run_check already OCRs the image but (as far as this function can see) does not include the raw text in its results.
|
171 |
+
# We therefore call checker.ocr_image(image) again; NOTE: this repeats the OCR pass, so returning the text from run_check would avoid the duplicate work.
|
172 |
+
ocr_text = checker.ocr_image(image) if image is not None else ""
|
173 |
+
img_cats = {f["category"] for f in img_findings}
|
174 |
+
marked_html_img = _highlight_html(ocr_text, img_spans, img_cats)
|
175 |
+
report_img = _build_report(img_findings)
|
176 |
+
|
177 |
+
return "\n".join(summary), json.dumps(results, indent=2), badge, reloaded, marked_html_ptxt, report_ptxt, marked_html_img, report_img
|
178 |
|
179 |
with gr.Blocks(title=APP_TITLE) as demo:
|
180 |
gr.Markdown(f"# {APP_TITLE}\n{APP_DESC}")
|
|
|
213 |
gr.Markdown(
|
214 |
"If you change any of these, the backend module will hot‑reload. On Spaces, ensure requirements include transformers + pyyaml."
|
215 |
)
|
216 |
+
|
217 |
+
with gr.Row():
|
218 |
+
marked_html_ptxt = gr.HTML(label="Highlighted text (Post)")
|
219 |
+
with gr.Row():
|
220 |
+
report_ptxt = gr.Markdown(label="Report (Post)")
|
221 |
+
|
222 |
+
with gr.Row():
|
223 |
+
marked_html_img = gr.HTML(label="Highlighted text (OCR Image)")
|
224 |
+
with gr.Row():
|
225 |
+
report_img = gr.Markdown(label="Report (OCR Image)")
|
226 |
+
|
227 |
run_btn = gr.Button("Run Compliance Check", variant="primary")
|
228 |
run_btn.click(
|
229 |
fn=on_run,
|
|
|
232 |
company_name, company_phones_json, disclaimer,
|
233 |
enable_ml, hf_repo, hf_thresh, req_disc_non_social, phrases_path,
|
234 |
],
|
235 |
+
outputs=[summary_out, results_json, diag_badge, reloaded_flag, marked_html_ptxt, report_ptxt, marked_html_img, report_img],
|
236 |
)
|
237 |
|
238 |
if __name__ == "__main__":
    # Enable the request queue (at most 16 pending jobs) before serving.
    queued_app = demo.queue(max_size=16)
    queued_app.launch()
|
240 |
|
checker.py
CHANGED
@@ -1,15 +1,9 @@
|
|
1 |
-
# checker.py
|
2 |
-
|
3 |
"""
|
4 |
checker.py — core logic for Image + Text Compliance Check
|
5 |
-
|
6 |
-
This module is UI-agnostic (no FastAPI/Gradio). Import its functions from
|
7 |
-
app.py (Gradio) or an API layer. CPU-only; optional tiny HF classifier via env.
|
8 |
"""
|
9 |
-
|
10 |
from __future__ import annotations
|
11 |
from pathlib import Path
|
12 |
-
from typing import List, Optional, Dict, Any, Iterable, Union
|
13 |
import os
|
14 |
import re
|
15 |
import json
|
@@ -99,12 +93,14 @@ def contains_disclaimer(text: str, disclaimer: str) -> bool:
|
|
99 |
|
100 |
return squeeze(disclaimer) in squeeze(text)
|
101 |
|
|
|
102 |
@dataclass
|
103 |
class Rule:
|
104 |
regex: re.Pattern
|
105 |
category: str
|
106 |
suggests: list[str]
|
107 |
|
|
|
108 |
PHRASE_RULES: list[Rule] = []
|
109 |
PHRASES_ERROR: Optional[str] = None
|
110 |
|
@@ -260,6 +256,26 @@ def ocr_image(image: Union["Image.Image", bytes, None]) -> str:
|
|
260 |
return ""
|
261 |
|
262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
def send_email_notification(results: Dict[str, Any]):
|
264 |
if not EMAIL_ON_FAILURE or not SMTP_SERVER or not EMAIL_RECIPIENT:
|
265 |
return
|
@@ -335,10 +351,16 @@ def run_check(
|
|
335 |
disclaimer=disclaimer,
|
336 |
require_disclaimer_on_non_social=require_disclaimer_on_non_social,
|
337 |
)
|
|
|
|
|
338 |
results = {
|
339 |
"Fair_Housing": fair_housing_block,
|
340 |
"img": img_block,
|
341 |
"Ptxt": ptxt_block,
|
|
|
|
|
|
|
|
|
342 |
"Diagnostics": {
|
343 |
"USE_TINY_ML": USE_TINY_ML,
|
344 |
"HF_REPO": HF_REPO,
|
@@ -354,6 +376,7 @@ def run_check(
|
|
354 |
send_email_notification(results)
|
355 |
return results
|
356 |
|
|
|
357 |
__all__ = [
|
358 |
"COMPANY_NAME_DEFAULT",
|
359 |
"COMPANY_PHONES_DEFAULT",
|
@@ -369,6 +392,7 @@ __all__ = [
|
|
369 |
"fair_housing_flags",
|
370 |
"evaluate_section",
|
371 |
"ocr_image",
|
|
|
372 |
"run_check",
|
373 |
"send_email_notification",
|
374 |
-
]
|
|
|
|
|
|
|
1 |
"""
|
2 |
checker.py — core logic for Image + Text Compliance Check
|
|
|
|
|
|
|
3 |
"""
|
|
|
4 |
from __future__ import annotations
|
5 |
from pathlib import Path
|
6 |
+
from typing import List, Optional, Dict, Any, Iterable, Union, Tuple
|
7 |
import os
|
8 |
import re
|
9 |
import json
|
|
|
93 |
|
94 |
return squeeze(disclaimer) in squeeze(text)
|
95 |
|
96 |
+
|
97 |
@dataclass
class Rule:
    """One phrase rule: a compiled pattern plus the metadata reported on a hit."""

    # Compiled pattern scanned against the text (via ``finditer``).
    regex: re.Pattern
    # Category label attached to every match of this rule.
    category: str
    # Alternative wordings offered to the user for a match.
    suggests: list[str]
|
102 |
|
103 |
+
|
104 |
PHRASE_RULES: list[Rule] = []
|
105 |
PHRASES_ERROR: Optional[str] = None
|
106 |
|
|
|
256 |
return ""
|
257 |
|
258 |
|
259 |
+
def find_rule_matches(text: str) -> Tuple[List[Dict[str, Any]], List[Tuple[int, int, str]]]:
    """Scan *text* with every rule in PHRASE_RULES and collect the hits.

    Returns:
        A ``(findings, spans)`` pair. Each finding is a dict with the rule
        category, the matched text, its start/end offsets, up to 40
        characters of context on either side, and at most three suggestions.
        ``spans`` holds the matching ``(start, end, category)`` triples in
        the same order.
    """
    haystack = text or ""
    total = len(haystack)
    findings: List[Dict[str, Any]] = []
    spans: List[Tuple[int, int, str]] = []
    for rule in PHRASE_RULES:
        for hit in rule.regex.finditer(haystack):
            start, end = hit.start(), hit.end()
            # Context window: 40 characters each side, clipped to the text.
            lo = max(0, start - 40)
            hi = min(total, end + 40)
            findings.append(
                dict(
                    category=rule.category,
                    match=hit.group(0),
                    start=start,
                    end=end,
                    context=haystack[lo:hi],
                    suggestions=(rule.suggests or [])[:3],
                )
            )
            spans.append((start, end, rule.category))
    return findings, spans
|
277 |
+
|
278 |
+
|
279 |
def send_email_notification(results: Dict[str, Any]):
|
280 |
if not EMAIL_ON_FAILURE or not SMTP_SERVER or not EMAIL_RECIPIENT:
|
281 |
return
|
|
|
351 |
disclaimer=disclaimer,
|
352 |
require_disclaimer_on_non_social=require_disclaimer_on_non_social,
|
353 |
)
|
354 |
+
img_findings, img_spans = find_rule_matches(itxt)
|
355 |
+
ptxt_findings, ptxt_spans = find_rule_matches(ptxt)
|
356 |
results = {
|
357 |
"Fair_Housing": fair_housing_block,
|
358 |
"img": img_block,
|
359 |
"Ptxt": ptxt_block,
|
360 |
+
"RuleMatches": {
|
361 |
+
"img": {"findings": img_findings, "spans": img_spans},
|
362 |
+
"ptxt": {"findings": ptxt_findings, "spans": ptxt_spans},
|
363 |
+
},
|
364 |
"Diagnostics": {
|
365 |
"USE_TINY_ML": USE_TINY_ML,
|
366 |
"HF_REPO": HF_REPO,
|
|
|
376 |
send_email_notification(results)
|
377 |
return results
|
378 |
|
379 |
+
|
380 |
__all__ = [
|
381 |
"COMPANY_NAME_DEFAULT",
|
382 |
"COMPANY_PHONES_DEFAULT",
|
|
|
392 |
"fair_housing_flags",
|
393 |
"evaluate_section",
|
394 |
"ocr_image",
|
395 |
+
"find_rule_matches",
|
396 |
"run_check",
|
397 |
"send_email_notification",
|
398 |
+
]
|