Spaces:

tlogandesigns
/

image-text-compliance

Sleeping

App Files Files Community

tlogandesigns committed on Aug 19

Commit

5a92002

1 Parent(s): bab6cf5

add highlight text

Browse files

Files changed (2) hide show

app.py +104 -9
checker.py +32 -8

app.py CHANGED Viewed

@@ -1,9 +1,7 @@
-# app.py
 import os
 import json
 import importlib
 import gradio as gr
 import checker
@@ -11,10 +9,61 @@ import checker
 APP_TITLE = "Fair Housing Image + Text Compliance Checker"
 APP_DESC = (
     "Upload an image (flyer) and/or paste text. The tool OCRs the image (if provided), "
-    "checks Fair Housing phrases (rules + optional tiny ML), and verifies brand/agent balance "
-    "and disclaimer requirements. CPU-only by default. This is not legal advice."
 )
 def _parse_company_phones(s: str):
     if not s:
         return checker.COMPANY_PHONES_DEFAULT
@@ -27,7 +76,6 @@ def _parse_company_phones(s: str):
     parts = [p.strip() for p in s.replace("\n", ",").split(",") if p.strip()]
     return parts or checker.COMPANY_PHONES_DEFAULT
 def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
                              req_disc_non_social: bool, phrases_path: str):
     need_reload = False
@@ -51,11 +99,24 @@ def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
         importlib.reload(checker)
     return need_reload
 def on_run(image, ptxt, social, agent_name, agent_phone,
            company_name, company_phones_json, disclaimer,
            enable_ml, hf_repo, hf_thresh,
            req_disc_non_social, phrases_path):
     reloaded = _ensure_checker_reloaded(
         enable_ml=bool(enable_ml),
         hf_repo=(hf_repo or checker.HF_REPO).strip(),
@@ -63,9 +124,11 @@ def on_run(image, ptxt, social, agent_name, agent_phone,
         req_disc_non_social=bool(req_disc_non_social),
         phrases_path=(phrases_path or str(checker.PHRASES_PATH)).strip(),
     )
     company_name = (company_name or checker.COMPANY_NAME_DEFAULT).strip()
     company_phones = _parse_company_phones(company_phones_json)
     disclaimer = (disclaimer or checker.DISCLAIMER_DEFAULT).strip()
     results = checker.run_check(
         image=image,
         ptxt=ptxt or "",
@@ -77,20 +140,41 @@ def on_run(image, ptxt, social, agent_name, agent_phone,
         disclaimer=disclaimer,
         require_disclaimer_on_non_social=None,
     )
     fh_ok = results.get("Fair_Housing", {}).get("compliant", True)
     img_ok = results.get("img", {}).get("compliant", True)
     ptxt_ok = results.get("Ptxt", {}).get("compliant", True)
     summary = []
     summary.append(f"Fair Housing (rules+ML): {'OK' if fh_ok else 'Needs review'}")
     summary.append(f"Image text balance/disclaimer: {'OK' if img_ok else 'Needs review'}")
     summary.append(f"Post text balance/disclaimer: {'OK' if ptxt_ok else 'Needs review'}")
     diag = results.get("Diagnostics", {})
     badge = (
         f"Tiny ML: {diag.get('USE_TINY_ML')} | Repo: {diag.get('HF_REPO')} | "
         f"Thresh: {diag.get('HF_THRESH')} | Phrases: {diag.get('PhrasesLoaded')} | "
         f"DisclaimerOnNonSocial: {diag.get('DisclaimerRequiredOnNonSocial')}"
     )
-    return "\n".join(summary), json.dumps(results, indent=2), badge, reloaded
 with gr.Blocks(title=APP_TITLE) as demo:
     gr.Markdown(f"# {APP_TITLE}\n{APP_DESC}")
@@ -129,6 +213,17 @@ with gr.Blocks(title=APP_TITLE) as demo:
         gr.Markdown(
             "If you change any of these, the backend module will hot‑reload. On Spaces, ensure requirements include transformers + pyyaml."
         )
     run_btn = gr.Button("Run Compliance Check", variant="primary")
     run_btn.click(
         fn=on_run,
@@ -137,9 +232,9 @@ with gr.Blocks(title=APP_TITLE) as demo:
             company_name, company_phones_json, disclaimer,
             enable_ml, hf_repo, hf_thresh, req_disc_non_social, phrases_path,
         ],
-        outputs=[summary_out, results_json, diag_badge, reloaded_flag],
     )
 if __name__ == "__main__":
-    demo.launch()

 import os
 import json
 import importlib
+import html
 import gradio as gr
 import checker
 APP_TITLE = "Fair Housing Image + Text Compliance Checker"
 APP_DESC = (
     "Upload an image (flyer) and/or paste text. The tool OCRs the image (if provided), "
+    "highlights potential Fair Housing risks, and verifies brand/agent balance and disclaimer requirements. "
+    "This is not legal advice."
 )
# Base colour (hex RGB) used to mark each rule category; categories that are
# not listed here fall back to a neutral grey at the point of use.
CATEGORY_COLORS = {
    "Familial status": "#e57373",
    "Religion": "#64b5f6",
    "Disability": "#81c784",
    "Sex": "#ba68c8",
    "Race or color": "#4db6ac",
    "National origin": "#ffd54f",
    "Other preference": "#90a4ae",
}

# Inline stylesheet prepended to every highlighted-text HTML fragment.
STYLE_BLOCK = """
<style>
.mark { padding: 0.1em 0.25em; border-radius: 0.25rem; }
.badge { display: inline-block; padding: 0 0.35em; border-radius: 0.4rem; font-size: 0.8em; margin-left: 0.3em; opacity: 0.9; }
.legend { display:flex; flex-wrap:wrap; gap:8px; margin: 0.5rem 0 1rem; }
.legend .swatch { width: 12px; height: 12px; border-radius: 3px; display:inline-block; margin-right:6px; }
.hl-container { background: #ffffff; color: #000000; padding: 12px; border-radius: 8px; line-height: 1.7; border: 1px solid #eee; }
.notice { margin-top: 10px; padding: 8px 10px; border-radius: 8px; background: #ffcccb; }
</style>
"""


def _build_legend(categories: set[str]) -> str:
    """Return an HTML legend with one colour swatch per category.

    Categories are listed in sorted order so the legend is stable between
    runs; names are HTML-escaped before being embedded.
    """
    parts = ["<div class='legend'>"]
    for cat in sorted(categories):
        color = CATEGORY_COLORS.get(cat, "#bdbdbd")
        parts.append(f"<span><span class='swatch' style='background:{color}'></span>{html.escape(cat)}</span>")
    parts.append("</div>")
    return "".join(parts)


def _highlight_html(text: str, spans: list[tuple[int,int,str]], cats: set[str]) -> str:
    """Render *text* as HTML with each span wrapped in a coloured mark.

    Args:
        text: The raw text to display; ``None`` is treated as empty.
        spans: ``(start, end, category)`` offsets into *text*, in any order.
        cats: Categories to show in the legend above the text.

    Returns:
        An HTML fragment: the stylesheet, a legend (only when there are
        spans), and the escaped text with every span highlighted and tagged
        with its category badge.

    Overlapping spans are clipped so each character of *text* is emitted
    exactly once; a span that is fully covered by earlier ones is dropped.
    """
    text = text or ""
    if not spans:
        return STYLE_BLOCK + f"<div class='hl-container'>{html.escape(text)}</div>"

    out = [STYLE_BLOCK, _build_legend(cats), "<div class='hl-container'>"]
    limit = len(text)
    cur = 0  # offset of the first character not yet written to `out`
    # Earliest span first; at equal starts the widest span wins the region.
    for s, e, cat in sorted(spans, key=lambda span: (span[0], -span[1])):
        plain_end = min(max(s, cur), limit)
        if plain_end > cur:
            out.append(html.escape(text[cur:plain_end]))
            cur = plain_end
        # Clip to the part of the span that has not been rendered yet, so
        # overlapping matches from different rules never duplicate text.
        s = max(s, cur)
        e = min(e, limit)
        if s >= e:
            continue
        frag = html.escape(text[s:e])
        color = CATEGORY_COLORS.get(cat, "#bdbdbd")
        out.append(
            f"<span class='mark' style='background:{color}1A; outline: 1px solid {color}55'>"
            f"{frag}<span class='badge' style='background:{color}33'>{html.escape(cat)}</span></span>"
        )
        cur = e
    if cur < limit:
        out.append(html.escape(text[cur:]))
    out.append("</div>")
    return "".join(out)
 def _parse_company_phones(s: str):
     if not s:
         return checker.COMPANY_PHONES_DEFAULT
     parts = [p.strip() for p in s.replace("\n", ",").split(",") if p.strip()]
     return parts or checker.COMPANY_PHONES_DEFAULT
 def _ensure_checker_reloaded(enable_ml: bool, hf_repo: str, hf_thresh: float,
                              req_disc_non_social: bool, phrases_path: str):
     need_reload = False
         importlib.reload(checker)
     return need_reload
def _build_report(findings: list[dict]) -> str:
    """Render rule-engine findings as a Markdown bullet list.

    Args:
        findings: Dicts with ``category``, ``match`` and ``context`` keys and
            an optional ``suggestions`` list.

    Returns:
        A Markdown string with one bullet per finding, or a fixed
        "nothing found" sentence when *findings* is empty.

    Raises:
        KeyError: If a finding lacks ``category``, ``match`` or ``context``.
    """
    if not findings:
        return "No obvious risk phrases found by the rules engine."

    def _one_line(value) -> str:
        # Matched text and context can contain line breaks (OCR output in
        # particular); a raw newline would end the Markdown bullet early.
        return " ".join(str(value).split())

    rows = []
    for f in findings:
        suggestions = f.get("suggestions")
        sug = ", ".join(suggestions) if suggestions else "N/A"
        rows.append(
            f"- **{f['category']}** → “{_one_line(f['match'])}”\n"
            f"  \n  _Context_: …{_one_line(f['context'])}…\n"
            f"  \n  _Suggestions_: {sug}\n"
        )
    return "### Potential issues\n" + "\n".join(rows)
 def on_run(image, ptxt, social, agent_name, agent_phone,
            company_name, company_phones_json, disclaimer,
            enable_ml, hf_repo, hf_thresh,
            req_disc_non_social, phrases_path):
     reloaded = _ensure_checker_reloaded(
         enable_ml=bool(enable_ml),
         hf_repo=(hf_repo or checker.HF_REPO).strip(),
         req_disc_non_social=bool(req_disc_non_social),
         phrases_path=(phrases_path or str(checker.PHRASES_PATH)).strip(),
     )
     company_name = (company_name or checker.COMPANY_NAME_DEFAULT).strip()
     company_phones = _parse_company_phones(company_phones_json)
     disclaimer = (disclaimer or checker.DISCLAIMER_DEFAULT).strip()
     results = checker.run_check(
         image=image,
         ptxt=ptxt or "",
         disclaimer=disclaimer,
         require_disclaimer_on_non_social=None,
     )
     fh_ok = results.get("Fair_Housing", {}).get("compliant", True)
     img_ok = results.get("img", {}).get("compliant", True)
     ptxt_ok = results.get("Ptxt", {}).get("compliant", True)
     summary = []
     summary.append(f"Fair Housing (rules+ML): {'OK' if fh_ok else 'Needs review'}")
     summary.append(f"Image text balance/disclaimer: {'OK' if img_ok else 'Needs review'}")
     summary.append(f"Post text balance/disclaimer: {'OK' if ptxt_ok else 'Needs review'}")
     diag = results.get("Diagnostics", {})
     badge = (
         f"Tiny ML: {diag.get('USE_TINY_ML')} | Repo: {diag.get('HF_REPO')} | "
         f"Thresh: {diag.get('HF_THRESH')} | Phrases: {diag.get('PhrasesLoaded')} | "
         f"DisclaimerOnNonSocial: {diag.get('DisclaimerRequiredOnNonSocial')}"
     )
+    rm = results.get("RuleMatches", {})
+    ptxt_findings = (rm.get("ptxt") or {}).get("findings") or []
+    ptxt_spans = (rm.get("ptxt") or {}).get("spans") or []
+    ptxt_cats = {f["category"] for f in ptxt_findings}
+    marked_html_ptxt = _highlight_html(ptxt or "", ptxt_spans, ptxt_cats)
+    report_ptxt = _build_report(ptxt_findings)
+    img_findings = (rm.get("img") or {}).get("findings") or []
+    img_spans = (rm.get("img") or {}).get("spans") or []
+    # The image spans are offsets into the OCR text, which is not read back from `results` here,
+    # so OCR is run again on the image to obtain the text to highlight.
+    # NOTE(review): this assumes the second OCR pass yields the same text as the one inside run_check — confirm.
+    ocr_text = checker.ocr_image(image) if image is not None else ""
+    img_cats = {f["category"] for f in img_findings}
+    marked_html_img = _highlight_html(ocr_text, img_spans, img_cats)
+    report_img = _build_report(img_findings)
+    return "\n".join(summary), json.dumps(results, indent=2), badge, reloaded, marked_html_ptxt, report_ptxt, marked_html_img, report_img
 with gr.Blocks(title=APP_TITLE) as demo:
     gr.Markdown(f"# {APP_TITLE}\n{APP_DESC}")
         gr.Markdown(
             "If you change any of these, the backend module will hot‑reload. On Spaces, ensure requirements include transformers + pyyaml."
         )
+    with gr.Row():
+        marked_html_ptxt = gr.HTML(label="Highlighted text (Post)")
+    with gr.Row():
+        report_ptxt = gr.Markdown(label="Report (Post)")
+    with gr.Row():
+        marked_html_img = gr.HTML(label="Highlighted text (OCR Image)")
+    with gr.Row():
+        report_img = gr.Markdown(label="Report (OCR Image)")
     run_btn = gr.Button("Run Compliance Check", variant="primary")
     run_btn.click(
         fn=on_run,
             company_name, company_phones_json, disclaimer,
             enable_ml, hf_repo, hf_thresh, req_disc_non_social, phrases_path,
         ],
+        outputs=[summary_out, results_json, diag_badge, reloaded_flag, marked_html_ptxt, report_ptxt, marked_html_img, report_img],
     )
 if __name__ == "__main__":
+    demo.queue(max_size=16).launch()

checker.py CHANGED Viewed

@@ -1,15 +1,9 @@
-# checker.py
 """
 checker.py — core logic for Image + Text Compliance Check
-This module is UI-agnostic (no FastAPI/Gradio). Import its functions from
-app.py (Gradio) or an API layer. CPU-only; optional tiny HF classifier via env.
 """
 from __future__ import annotations
 from pathlib import Path
-from typing import List, Optional, Dict, Any, Iterable, Union
 import os
 import re
 import json
@@ -99,12 +93,14 @@ def contains_disclaimer(text: str, disclaimer: str) -> bool:
     return squeeze(disclaimer) in squeeze(text)
 @dataclass
 class Rule:
     regex: re.Pattern
     category: str
     suggests: list[str]
 PHRASE_RULES: list[Rule] = []
 PHRASES_ERROR: Optional[str] = None
@@ -260,6 +256,26 @@ def ocr_image(image: Union["Image.Image", bytes, None]) -> str:
         return ""
 def send_email_notification(results: Dict[str, Any]):
     if not EMAIL_ON_FAILURE or not SMTP_SERVER or not EMAIL_RECIPIENT:
         return
@@ -335,10 +351,16 @@ def run_check(
         disclaimer=disclaimer,
         require_disclaimer_on_non_social=require_disclaimer_on_non_social,
     )
     results = {
         "Fair_Housing": fair_housing_block,
         "img": img_block,
         "Ptxt": ptxt_block,
         "Diagnostics": {
             "USE_TINY_ML": USE_TINY_ML,
             "HF_REPO": HF_REPO,
@@ -354,6 +376,7 @@ def run_check(
     send_email_notification(results)
     return results
 __all__ = [
     "COMPANY_NAME_DEFAULT",
     "COMPANY_PHONES_DEFAULT",
@@ -369,6 +392,7 @@ __all__ = [
     "fair_housing_flags",
     "evaluate_section",
     "ocr_image",
     "run_check",
     "send_email_notification",
-]

 """
 checker.py — core logic for Image + Text Compliance Check
 """
 from __future__ import annotations
 from pathlib import Path
+from typing import List, Optional, Dict, Any, Iterable, Union, Tuple
 import os
 import re
 import json
     return squeeze(disclaimer) in squeeze(text)
 @dataclass
 class Rule:
     """One phrase rule: a compiled pattern plus the category and suggestions attached to its matches."""
     # Compiled pattern; find_rule_matches scans text with regex.finditer().
     regex: re.Pattern
     # Category label recorded on every finding and span this rule produces.
     category: str
     # Suggestions for this rule; find_rule_matches reports at most the first three.
     suggests: list[str]
 PHRASE_RULES: list[Rule] = []
 PHRASES_ERROR: Optional[str] = None
         return ""
def find_rule_matches(text: str) -> Tuple[List[Dict[str, Any]], List[Tuple[int, int, str]]]:
    """Scan *text* with every rule in ``PHRASE_RULES`` and collect the matches.

    Args:
        text: Text to scan; ``None`` or empty yields no matches.

    Returns:
        ``(findings, spans)``. Each finding is a dict with ``category``,
        ``match``, ``start``, ``end``, ``context`` (the match plus up to 40
        characters on either side) and ``suggestions`` (at most three).
        Each span is the matching ``(start, end, category)`` tuple. Both
        lists are in rule order, then match order within a rule.
    """
    text = text or ""
    context_chars = 40  # characters of surrounding text kept on each side
    findings: List[Dict[str, Any]] = []
    spans: List[Tuple[int, int, str]] = []
    for rule in PHRASE_RULES:
        for m in rule.regex.finditer(text):
            s, e = m.span()
            if s == e:
                # finditer() also yields zero-width matches; they would show
                # up as empty findings and empty highlight spans.
                continue
            findings.append({
                "category": rule.category,
                "match": m.group(0),
                "start": s,
                "end": e,
                # Slicing clamps the upper bound, so only the lower needs max().
                "context": text[max(0, s - context_chars): e + context_chars],
                "suggestions": (rule.suggests or [])[:3],
            })
            spans.append((s, e, rule.category))
    return findings, spans
 def send_email_notification(results: Dict[str, Any]):
     if not EMAIL_ON_FAILURE or not SMTP_SERVER or not EMAIL_RECIPIENT:
         return
         disclaimer=disclaimer,
         require_disclaimer_on_non_social=require_disclaimer_on_non_social,
     )
+    img_findings, img_spans = find_rule_matches(itxt)
+    ptxt_findings, ptxt_spans = find_rule_matches(ptxt)
     results = {
         "Fair_Housing": fair_housing_block,
         "img": img_block,
         "Ptxt": ptxt_block,
+        "RuleMatches": {
+            "img": {"findings": img_findings, "spans": img_spans},
+            "ptxt": {"findings": ptxt_findings, "spans": ptxt_spans},
+        },
         "Diagnostics": {
             "USE_TINY_ML": USE_TINY_ML,
             "HF_REPO": HF_REPO,
     send_email_notification(results)
     return results
 __all__ = [
     "COMPANY_NAME_DEFAULT",
     "COMPANY_PHONES_DEFAULT",
     "fair_housing_flags",
     "evaluate_section",
     "ocr_image",
+    "find_rule_matches",
     "run_check",
     "send_email_notification",
+]