tlogandesigns committed on
Commit
5a92002
·
1 Parent(s): bab6cf5

add highlight text

Browse files
Files changed (2) hide show
  1. app.py +104 -9
  2. checker.py +32 -8
app.py CHANGED
@@ -1,9 +1,7 @@
1
-
2
- # app.py
3
-
4
  import os
5
  import json
6
  import importlib
 
7
  import gradio as gr
8
 
9
  import checker
@@ -11,10 +9,61 @@ import checker
11
  APP_TITLE = "Fair Housing Image + Text Compliance Checker"
12
  APP_DESC = (
13
  "Upload an image (flyer) and/or paste text. The tool OCRs the image (if provided), "
14
- "checks Fair Housing phrases (rules + optional tiny ML), and verifies brand/agent balance "
15
- "and disclaimer requirements. CPU-only by default. This is not legal advice."
16
  )
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def _parse_company_phones(s: str):
19
  if not s:
20
  return checker.COMPANY_PHONES_DEFAULT
@@ -27,7 +76,6 @@ def _parse_company_phones(s: str):
27
  parts = [p.strip() for p in s.replace("\n", ",").split(",") if p.strip()]
28
  return parts or checker.COMPANY_PHONES_DEFAULT
29
 
30
-
31
  def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
32
  req_disc_non_social: bool, phrases_path: str):
33
  need_reload = False
@@ -51,11 +99,24 @@ def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
51
  importlib.reload(checker)
52
  return need_reload
53
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  def on_run(image, ptxt, social, agent_name, agent_phone,
56
  company_name, company_phones_json, disclaimer,
57
  enable_ml, hf_repo, hf_thresh,
58
  req_disc_non_social, phrases_path):
 
59
  reloaded = _ensure_checker_reloaded(
60
  enable_ml=bool(enable_ml),
61
  hf_repo=(hf_repo or checker.HF_REPO).strip(),
@@ -63,9 +124,11 @@ def on_run(image, ptxt, social, agent_name, agent_phone,
63
  req_disc_non_social=bool(req_disc_non_social),
64
  phrases_path=(phrases_path or str(checker.PHRASES_PATH)).strip(),
65
  )
 
66
  company_name = (company_name or checker.COMPANY_NAME_DEFAULT).strip()
67
  company_phones = _parse_company_phones(company_phones_json)
68
  disclaimer = (disclaimer or checker.DISCLAIMER_DEFAULT).strip()
 
69
  results = checker.run_check(
70
  image=image,
71
  ptxt=ptxt or "",
@@ -77,20 +140,41 @@ def on_run(image, ptxt, social, agent_name, agent_phone,
77
  disclaimer=disclaimer,
78
  require_disclaimer_on_non_social=None,
79
  )
 
80
  fh_ok = results.get("Fair_Housing", {}).get("compliant", True)
81
  img_ok = results.get("img", {}).get("compliant", True)
82
  ptxt_ok = results.get("Ptxt", {}).get("compliant", True)
 
83
  summary = []
84
  summary.append(f"Fair Housing (rules+ML): {'OK' if fh_ok else 'Needs review'}")
85
  summary.append(f"Image text balance/disclaimer: {'OK' if img_ok else 'Needs review'}")
86
  summary.append(f"Post text balance/disclaimer: {'OK' if ptxt_ok else 'Needs review'}")
 
87
  diag = results.get("Diagnostics", {})
88
  badge = (
89
  f"Tiny ML: {diag.get('USE_TINY_ML')} | Repo: {diag.get('HF_REPO')} | "
90
  f"Thresh: {diag.get('HF_THRESH')} | Phrases: {diag.get('PhrasesLoaded')} | "
91
  f"DisclaimerOnNonSocial: {diag.get('DisclaimerRequiredOnNonSocial')}"
92
  )
93
- return "\n".join(summary), json.dumps(results, indent=2), badge, reloaded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
  with gr.Blocks(title=APP_TITLE) as demo:
96
  gr.Markdown(f"# {APP_TITLE}\n{APP_DESC}")
@@ -129,6 +213,17 @@ with gr.Blocks(title=APP_TITLE) as demo:
129
  gr.Markdown(
130
  "If you change any of these, the backend module will hot‑reload. On Spaces, ensure requirements include transformers + pyyaml."
131
  )
 
 
 
 
 
 
 
 
 
 
 
132
  run_btn = gr.Button("Run Compliance Check", variant="primary")
133
  run_btn.click(
134
  fn=on_run,
@@ -137,9 +232,9 @@ with gr.Blocks(title=APP_TITLE) as demo:
137
  company_name, company_phones_json, disclaimer,
138
  enable_ml, hf_repo, hf_thresh, req_disc_non_social, phrases_path,
139
  ],
140
- outputs=[summary_out, results_json, diag_badge, reloaded_flag],
141
  )
142
 
143
  if __name__ == "__main__":
144
- demo.launch()
145
 
 
 
 
 
1
  import os
2
  import json
3
  import importlib
4
+ import html
5
  import gradio as gr
6
 
7
  import checker
 
9
  APP_TITLE = "Fair Housing Image + Text Compliance Checker"
10
  APP_DESC = (
11
  "Upload an image (flyer) and/or paste text. The tool OCRs the image (if provided), "
12
+ "highlights potential Fair Housing risks, and verifies brand/agent balance and disclaimer requirements. "
13
+ "This is not legal advice."
14
  )
15
 
16
+ CATEGORY_COLORS = {
17
+ "Familial status": "#e57373",
18
+ "Religion": "#64b5f6",
19
+ "Disability": "#81c784",
20
+ "Sex": "#ba68c8",
21
+ "Race or color": "#4db6ac",
22
+ "National origin": "#ffd54f",
23
+ "Other preference": "#90a4ae",
24
+ }
25
+
26
+ STYLE_BLOCK = """
27
+ <style>
28
+ .mark { padding: 0.1em 0.25em; border-radius: 0.25rem; }
29
+ .badge { display: inline-block; padding: 0 0.35em; border-radius: 0.4rem; font-size: 0.8em; margin-left: 0.3em; opacity: 0.9; }
30
+ .legend { display:flex; flex-wrap:wrap; gap:8px; margin: 0.5rem 0 1rem; }
31
+ .legend .swatch { width: 12px; height: 12px; border-radius: 3px; display:inline-block; margin-right:6px; }
32
+ .hl-container { background: #ffffff; color: #000000; padding: 12px; border-radius: 8px; line-height: 1.7; border: 1px solid #eee; }
33
+ .notice { margin-top: 10px; padding: 8px 10px; border-radius: 8px; background: #ffcccb; }
34
+ </style>
35
+ """
36
+
37
+ def _build_legend(categories: set[str]) -> str:
38
+ parts = ["<div class='legend'>"]
39
+ for cat in sorted(categories):
40
+ color = CATEGORY_COLORS.get(cat, "#bdbdbd")
41
+ parts.append(f"<span><span class='swatch' style='background:{color}'></span>{html.escape(cat)}</span>")
42
+ parts.append("</div>")
43
+ return "".join(parts)
44
+
45
def _highlight_html(text: str, spans: list[tuple[int,int,str]], cats: set[str]) -> str:
    """Render *text* as HTML with every flagged span wrapped in a coloured marker.

    Args:
        text: The raw text the span offsets index into (``None`` is treated as "").
        spans: ``(start, end, category)`` triples; ``text[start:end]`` is the
            flagged fragment. Spans may arrive unsorted and may overlap.
        cats: Categories to list in the legend above the highlighted text.

    Returns:
        An HTML string: the style block, a legend (only when there are spans)
        and a container holding the escaped text with highlighted fragments.
    """
    text = text or ""
    if not spans:
        return STYLE_BLOCK + f"<div class='hl-container'>{html.escape(text)}</div>"

    limit = len(text)
    cur = 0  # index of the first character not yet written to the output
    out = [STYLE_BLOCK, _build_legend(cats), "<div class='hl-container'>"]
    # Order by start; for equal starts the longest span comes first so the
    # widest match is the one that gets highlighted.
    for s, e, cat in sorted(spans, key=lambda span: (span[0], -span[1])):
        # Different rules can match overlapping regions. Clip each span to the
        # part that has not been rendered yet so no text is emitted twice.
        start = max(s, cur)
        end = min(e, limit)
        if end <= start:
            continue  # empty, out of range, or fully covered by an earlier span
        if start > cur:
            out.append(html.escape(text[cur:start]))
        frag = html.escape(text[start:end])
        color = CATEGORY_COLORS.get(cat, "#bdbdbd")
        # The hex suffixes appended to the colour are alpha channels (#RRGGBBAA).
        out.append(
            f"<span class='mark' style='background:{color}1A; outline: 1px solid {color}55'>"
            f"{frag}<span class='badge' style='background:{color}33'>{html.escape(cat)}</span></span>"
        )
        cur = end
    if cur < limit:
        out.append(html.escape(text[cur:]))
    out.append("</div>")
    return "".join(out)
66
+
67
  def _parse_company_phones(s: str):
68
  if not s:
69
  return checker.COMPANY_PHONES_DEFAULT
 
76
  parts = [p.strip() for p in s.replace("\n", ",").split(",") if p.strip()]
77
  return parts or checker.COMPANY_PHONES_DEFAULT
78
 
 
79
  def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
80
  req_disc_non_social: bool, phrases_path: str):
81
  need_reload = False
 
99
  importlib.reload(checker)
100
  return need_reload
101
 
102
+ def _build_report(findings: list[dict]) -> str:
103
+ if not findings:
104
+ return "No obvious risk phrases found by the rules engine."
105
+ rows = []
106
+ for f in findings:
107
+ sug = ", ".join(f.get("suggestions") or []) if f.get("suggestions") else "N/A"
108
+ rows.append(
109
+ f"- **{f['category']}** → “{f['match']}”\n"
110
+ f" \n _Context_: …{f['context']}…\n"
111
+ f" \n _Suggestions_: {sug}\n"
112
+ )
113
+ return "### Potential issues\n" + "\n".join(rows)
114
 
115
  def on_run(image, ptxt, social, agent_name, agent_phone,
116
  company_name, company_phones_json, disclaimer,
117
  enable_ml, hf_repo, hf_thresh,
118
  req_disc_non_social, phrases_path):
119
+
120
  reloaded = _ensure_checker_reloaded(
121
  enable_ml=bool(enable_ml),
122
  hf_repo=(hf_repo or checker.HF_REPO).strip(),
 
124
  req_disc_non_social=bool(req_disc_non_social),
125
  phrases_path=(phrases_path or str(checker.PHRASES_PATH)).strip(),
126
  )
127
+
128
  company_name = (company_name or checker.COMPANY_NAME_DEFAULT).strip()
129
  company_phones = _parse_company_phones(company_phones_json)
130
  disclaimer = (disclaimer or checker.DISCLAIMER_DEFAULT).strip()
131
+
132
  results = checker.run_check(
133
  image=image,
134
  ptxt=ptxt or "",
 
140
  disclaimer=disclaimer,
141
  require_disclaimer_on_non_social=None,
142
  )
143
+
144
  fh_ok = results.get("Fair_Housing", {}).get("compliant", True)
145
  img_ok = results.get("img", {}).get("compliant", True)
146
  ptxt_ok = results.get("Ptxt", {}).get("compliant", True)
147
+
148
  summary = []
149
  summary.append(f"Fair Housing (rules+ML): {'OK' if fh_ok else 'Needs review'}")
150
  summary.append(f"Image text balance/disclaimer: {'OK' if img_ok else 'Needs review'}")
151
  summary.append(f"Post text balance/disclaimer: {'OK' if ptxt_ok else 'Needs review'}")
152
+
153
  diag = results.get("Diagnostics", {})
154
  badge = (
155
  f"Tiny ML: {diag.get('USE_TINY_ML')} | Repo: {diag.get('HF_REPO')} | "
156
  f"Thresh: {diag.get('HF_THRESH')} | Phrases: {diag.get('PhrasesLoaded')} | "
157
  f"DisclaimerOnNonSocial: {diag.get('DisclaimerRequiredOnNonSocial')}"
158
  )
159
+
160
+ rm = results.get("RuleMatches", {})
161
+ ptxt_findings = (rm.get("ptxt") or {}).get("findings") or []
162
+ ptxt_spans = (rm.get("ptxt") or {}).get("spans") or []
163
+ ptxt_cats = {f["category"] for f in ptxt_findings}
164
+ marked_html_ptxt = _highlight_html(ptxt or "", ptxt_spans, ptxt_cats)
165
+ report_ptxt = _build_report(ptxt_findings)
166
+
167
+ img_findings = (rm.get("img") or {}).get("findings") or []
168
+ img_spans = (rm.get("img") or {}).get("spans") or []
169
+ # The highlighter needs the raw OCR text that the image spans index into, and that text is not
170
+ # read from `results` here, so OCR is run a second time on the same image to obtain it.
171
+ # NOTE(review): this repeats the OCR work and assumes the second pass yields the same text run_check() matched against — confirm.
172
+ ocr_text = checker.ocr_image(image) if image is not None else ""
173
+ img_cats = {f["category"] for f in img_findings}
174
+ marked_html_img = _highlight_html(ocr_text, img_spans, img_cats)
175
+ report_img = _build_report(img_findings)
176
+
177
+ return "\n".join(summary), json.dumps(results, indent=2), badge, reloaded, marked_html_ptxt, report_ptxt, marked_html_img, report_img
178
 
179
  with gr.Blocks(title=APP_TITLE) as demo:
180
  gr.Markdown(f"# {APP_TITLE}\n{APP_DESC}")
 
213
  gr.Markdown(
214
  "If you change any of these, the backend module will hot‑reload. On Spaces, ensure requirements include transformers + pyyaml."
215
  )
216
+
217
+ with gr.Row():
218
+ marked_html_ptxt = gr.HTML(label="Highlighted text (Post)")
219
+ with gr.Row():
220
+ report_ptxt = gr.Markdown(label="Report (Post)")
221
+
222
+ with gr.Row():
223
+ marked_html_img = gr.HTML(label="Highlighted text (OCR Image)")
224
+ with gr.Row():
225
+ report_img = gr.Markdown(label="Report (OCR Image)")
226
+
227
  run_btn = gr.Button("Run Compliance Check", variant="primary")
228
  run_btn.click(
229
  fn=on_run,
 
232
  company_name, company_phones_json, disclaimer,
233
  enable_ml, hf_repo, hf_thresh, req_disc_non_social, phrases_path,
234
  ],
235
+ outputs=[summary_out, results_json, diag_badge, reloaded_flag, marked_html_ptxt, report_ptxt, marked_html_img, report_img],
236
  )
237
 
238
  if __name__ == "__main__":
239
+ demo.queue(max_size=16).launch()
240
 
checker.py CHANGED
@@ -1,15 +1,9 @@
1
- # checker.py
2
-
3
  """
4
  checker.py — core logic for Image + Text Compliance Check
5
-
6
- This module is UI-agnostic (no FastAPI/Gradio). Import its functions from
7
- app.py (Gradio) or an API layer. CPU-only; optional tiny HF classifier via env.
8
  """
9
-
10
  from __future__ import annotations
11
  from pathlib import Path
12
- from typing import List, Optional, Dict, Any, Iterable, Union
13
  import os
14
  import re
15
  import json
@@ -99,12 +93,14 @@ def contains_disclaimer(text: str, disclaimer: str) -> bool:
99
 
100
  return squeeze(disclaimer) in squeeze(text)
101
 
 
102
  @dataclass
103
  class Rule:
104
  regex: re.Pattern
105
  category: str
106
  suggests: list[str]
107
 
 
108
  PHRASE_RULES: list[Rule] = []
109
  PHRASES_ERROR: Optional[str] = None
110
 
@@ -260,6 +256,26 @@ def ocr_image(image: Union["Image.Image", bytes, None]) -> str:
260
  return ""
261
 
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def send_email_notification(results: Dict[str, Any]):
264
  if not EMAIL_ON_FAILURE or not SMTP_SERVER or not EMAIL_RECIPIENT:
265
  return
@@ -335,10 +351,16 @@ def run_check(
335
  disclaimer=disclaimer,
336
  require_disclaimer_on_non_social=require_disclaimer_on_non_social,
337
  )
 
 
338
  results = {
339
  "Fair_Housing": fair_housing_block,
340
  "img": img_block,
341
  "Ptxt": ptxt_block,
 
 
 
 
342
  "Diagnostics": {
343
  "USE_TINY_ML": USE_TINY_ML,
344
  "HF_REPO": HF_REPO,
@@ -354,6 +376,7 @@ def run_check(
354
  send_email_notification(results)
355
  return results
356
 
 
357
  __all__ = [
358
  "COMPANY_NAME_DEFAULT",
359
  "COMPANY_PHONES_DEFAULT",
@@ -369,6 +392,7 @@ __all__ = [
369
  "fair_housing_flags",
370
  "evaluate_section",
371
  "ocr_image",
 
372
  "run_check",
373
  "send_email_notification",
374
- ]
 
 
 
1
  """
2
  checker.py — core logic for Image + Text Compliance Check
 
 
 
3
  """
 
4
  from __future__ import annotations
5
  from pathlib import Path
6
+ from typing import List, Optional, Dict, Any, Iterable, Union, Tuple
7
  import os
8
  import re
9
  import json
 
93
 
94
  return squeeze(disclaimer) in squeeze(text)
95
 
96
+
97
  @dataclass
98
  class Rule:
99
  regex: re.Pattern
100
  category: str
101
  suggests: list[str]
102
 
103
+
104
  PHRASE_RULES: list[Rule] = []
105
  PHRASES_ERROR: Optional[str] = None
106
 
 
256
  return ""
257
 
258
 
259
def find_rule_matches(text: str) -> Tuple[List[Dict[str, Any]], List[Tuple[int, int, str]]]:
    """Run every rule in PHRASE_RULES over *text* and report each regex match.

    Returns:
        A ``(findings, spans)`` pair. Each finding is a dict with the rule's
        category, the matched text, its start/end offsets, up to 40
        characters of context on either side, and at most three of the
        rule's suggestions. ``spans`` holds the matching
        ``(start, end, category)`` triples in the same order.
    """
    haystack = text or ""
    total = len(haystack)
    findings: List[Dict[str, Any]] = [
        {
            "category": rule.category,
            "match": hit.group(0),
            "start": hit.start(),
            "end": hit.end(),
            # Context window: 40 characters either side, clamped to the text.
            "context": haystack[max(0, hit.start() - 40): min(total, hit.end() + 40)],
            "suggestions": (rule.suggests or [])[:3],
        }
        for rule in PHRASE_RULES
        for hit in rule.regex.finditer(haystack)
    ]
    spans: List[Tuple[int, int, str]] = [
        (item["start"], item["end"], item["category"]) for item in findings
    ]
    return findings, spans
277
+
278
+
279
  def send_email_notification(results: Dict[str, Any]):
280
  if not EMAIL_ON_FAILURE or not SMTP_SERVER or not EMAIL_RECIPIENT:
281
  return
 
351
  disclaimer=disclaimer,
352
  require_disclaimer_on_non_social=require_disclaimer_on_non_social,
353
  )
354
+ img_findings, img_spans = find_rule_matches(itxt)
355
+ ptxt_findings, ptxt_spans = find_rule_matches(ptxt)
356
  results = {
357
  "Fair_Housing": fair_housing_block,
358
  "img": img_block,
359
  "Ptxt": ptxt_block,
360
+ "RuleMatches": {
361
+ "img": {"findings": img_findings, "spans": img_spans},
362
+ "ptxt": {"findings": ptxt_findings, "spans": ptxt_spans},
363
+ },
364
  "Diagnostics": {
365
  "USE_TINY_ML": USE_TINY_ML,
366
  "HF_REPO": HF_REPO,
 
376
  send_email_notification(results)
377
  return results
378
 
379
+
380
  __all__ = [
381
  "COMPANY_NAME_DEFAULT",
382
  "COMPANY_PHONES_DEFAULT",
 
392
  "fair_housing_flags",
393
  "evaluate_section",
394
  "ocr_image",
395
+ "find_rule_matches",
396
  "run_check",
397
  "send_email_notification",
398
+ ]