File size: 9,406 Bytes
9da12e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""
checker.py — core logic for Image + Text Compliance Check

This module is UI-agnostic (no FastAPI/Gradio). Import its functions from
app.py (Gradio) or an API layer. CPU-only; optional tiny HF classifier via env.
"""
from __future__ import annotations

from typing import List, Optional, Dict, Any, Iterable, Union
import os
import re
import json

try:
    from PIL import Image  # type: ignore
except Exception:
    Image = None  # Allows import without PIL when not doing OCR

try:
    import pytesseract  # type: ignore
except Exception:
    pytesseract = None

# -----------------------------
# Config & Constants
# -----------------------------
# Brokerage identity used as the default "company" side of the balance checks.
COMPANY_NAME_DEFAULT = "Berkshire Hathaway HomeServices Beazley, REALTORS"
COMPANY_PHONES_DEFAULT = [
    "7068631775",
    "8032337111",
]
# Default disclaimer text, assembled from three fragments separated by single spaces.
DISCLAIMER_DEFAULT = " ".join(
    (
        "©2025 BHH Affiliates, LLC. An independently owned and operated franchisee of BHH Affiliates, LLC.",
        "Berkshire Hathaway HomeServices and the Berkshire Hathaway HomeServices symbol are registered service marks",
        "of Columbia Insurance Company, a Berkshire Hathaway affiliate. Equal Housing Opportunity.",
    )
)

# Social posts must carry the disclaimer unless the REQUIRE_DISCLAIMER_ON_SOCIAL
# environment variable is set to anything other than "1" (default: "1").
REQUIRE_DISCLAIMER_ON_SOCIAL = os.environ.get("REQUIRE_DISCLAIMER_ON_SOCIAL", "1") == "1"

# Optional tiny Hugging Face classifier: enabled only when USE_TINY_ML is exactly "1".
USE_TINY_ML = os.environ.get("USE_TINY_ML", "0") == "1"
# Model repository id and the minimum score for a model prediction to be flagged.
HF_REPO = os.environ.get("HF_REPO", "tlogandesigns/fairhousing-bert-tiny")
HF_THRESH = float(os.environ.get("HF_THRESH", "0.75"))

# Path of the optional rule-based phrases file; when it exists it supplies the
# regex patterns used for rule flags.
PHRASES_PATH = os.environ.get("PHRASES_PATH", "phrases.yaml")

# -----------------------------
# Utilities
# -----------------------------
# Matches one 10-digit phone number, optionally preceded by a "+1"/"1" country
# prefix, and captures (area code, exchange, line number) as three groups.
#
# - The lookarounds `(?<!\d)` / `(?!\d)` stop the match from starting or ending
#   in the middle of a longer digit run (IDs, prices, 12+ digit numbers).
# - Separators are limited to at most three non-word, non-newline characters
#   (e.g. ") ", " - ", "."), so digits from unrelated numbers in a sentence are
#   not stitched together and cannot swallow the start of a real phone number.
PHONE_RE = re.compile(
    r"(?<!\d)(?:\+?1[^\w\n]{0,3})?"
    r"([2-9]\d{2})[^\w\n]{0,3}(\d{3})[^\w\n]{0,3}(\d{4})(?!\d)"
)


def normalize_phone(s: str) -> str:
    """Return only the digits of *s*, without a leading "1" country code.

    Every non-digit character is removed. If exactly 11 digits remain and the
    first one is "1", that leading digit is dropped. A falsy *s* (``None`` or
    an empty string) yields an empty string.
    """
    only_digits = re.sub(r"\D", "", s or "")
    has_country_code = len(only_digits) == 11 and only_digits[0] == "1"
    return only_digits[1:] if has_country_code else only_digits


def count_phone_instances(text: str, target_numbers: Iterable[str]) -> int:
    """Count the phone numbers found in *text* that equal one of *target_numbers*.

    Each truthy entry of *target_numbers* is normalized with ``normalize_phone``.
    Every ``PHONE_RE`` match in *text* contributes its three captured groups,
    concatenated, and is counted when that string is one of the normalized
    targets. A falsy *text* or *target_numbers* is treated as empty.
    """
    wanted = set()
    for raw in target_numbers or []:
        if raw:
            wanted.add(normalize_phone(raw))
    return sum(
        1
        for hit in PHONE_RE.finditer(text or "")
        if "".join(hit.groups()) in wanted
    )


def escape_name_regex(name: str) -> str:
    r"""Build a regex source string that matches *name* with flexible separators.

    The name is split into tokens on whitespace and on the separator
    punctuation ``-``, ``.`` and ``,``. Each token is escaped, and the tokens
    are joined with ``[\s\-.,]+`` so any run of those separators is accepted
    between them. Punctuation in the configured name is therefore optional in
    the text: "Beazley, REALTORS" also matches "Beazley REALTORS".

    The pattern is anchored with ``(?<!\w)`` / ``(?!\w)`` instead of ``\b``.
    For tokens that start and end with word characters this is equivalent, but
    it does not demand a word character next to the match, so a name written
    with trailing punctuation ("John Smith Jr.") still matches.

    Returns:
        The pattern source (not compiled), or an empty string when *name* is
        empty or whitespace-only.
    """
    separator = r"[\s\-.,]+"
    tokens = [tok for tok in re.split(separator, name or "") if tok]
    if not tokens:
        # Name made only of separator punctuation: keep it as literal tokens
        # rather than returning an empty pattern that would match everywhere.
        tokens = (name or "").split()
    if not tokens:
        return r""  # no name
    body = separator.join(re.escape(tok) for tok in tokens)
    return r"(?<!\w)" + body + r"(?!\w)"


def count_name_instances(text: str, name: str) -> int:
    """Count case-insensitive occurrences of *name* in *text*.

    The search pattern comes from ``escape_name_regex``. A missing, empty or
    whitespace-only *name* counts as zero occurrences; a falsy *text* is
    searched as an empty string.
    """
    cleaned = (name or "").strip()
    if cleaned == "":
        return 0
    matcher = re.compile(escape_name_regex(name), flags=re.IGNORECASE)
    return sum(1 for _ in matcher.finditer(text or ""))


def contains_disclaimer(text: str, disclaimer: str) -> bool:
    """Report whether *disclaimer* appears in *text*, ignoring case and spacing.

    Both strings are compared after collapsing every whitespace run to a single
    space, trimming the ends and lower-casing. A falsy *disclaimer* always
    yields ``False``; a falsy *text* is treated as empty.
    """
    if disclaimer:
        whitespace_run = re.compile(r"\s+")
        needle = whitespace_run.sub(" ", disclaimer).strip().lower()
        haystack = whitespace_run.sub(" ", text or "").strip().lower()
        return needle in haystack
    return False


# -----------------------------
# Fair Housing Classifier (hybrid)
# -----------------------------
try:
    import yaml  # type: ignore
except Exception:
    yaml = None

# Compiled, case-insensitive regexes for the rule-based fair-housing check.
# Filled from the "patterns" list of the YAML file at PHRASES_PATH when PyYAML
# could be imported and the file exists; otherwise it stays empty.
PHRASE_PATTERNS: List[re.Pattern] = []
if yaml and os.path.exists(PHRASES_PATH):
    try:
        # The context manager closes the file even when parsing raises.
        with open(PHRASES_PATH, "r", encoding="utf-8") as fh:
            data = yaml.safe_load(fh) or {}
        # `or []` tolerates a file whose "patterns" key is present but null.
        for rx in data.get("patterns") or []:
            try:
                # compile as case-insensitive
                PHRASE_PATTERNS.append(re.compile(rx, re.IGNORECASE))
            except (re.error, TypeError) as e:
                # One malformed entry must not discard the remaining rules.
                print(f"Skipping invalid phrase pattern {rx!r}:", e)
    except Exception as e:
        print("Failed loading phrases.yaml:", e)

# Optional HF pipeline (disabled by default to keep CPU/lightweight)
# `hf_pipe` stays None unless USE_TINY_ML is true and the text-classification
# pipeline for HF_REPO is created successfully at import time.
hf_pipe = None
if USE_TINY_ML:
    try:
        # Imported lazily so `transformers` is only needed when the model is enabled.
        from transformers import pipeline  # type: ignore

        hf_pipe = pipeline("text-classification", model=HF_REPO)
    except Exception as e:
        # Any failure (import or pipeline construction) is reported and the
        # module falls back to rule-based flags only.
        print("HF model unavailable:", e)
        hf_pipe = None


def fair_housing_flags(text: str) -> List[str]:
    """Collect fair-housing flags for *text* from the rules and the optional model.

    Rule flags come first: one entry per match of each ``PHRASE_PATTERNS``
    regex, quoting the match with up to 30 characters of context on each side.
    When ``hf_pipe`` is set, the first 2000 characters are classified and an
    ``MLFlag`` entry is added if the top label is one of the "violation"
    labels and its score is at least ``HF_THRESH``. An exception raised during
    inference is itself reported as an ``MLFlag`` entry rather than propagated.

    Returns:
        The list of flag strings; empty when nothing was flagged.
    """
    body = text or ""
    found: List[str] = []

    # Rule-based pass.
    for rule in PHRASE_PATTERNS:
        for hit in rule.finditer(body):
            lo = max(0, hit.start() - 30)
            hi = hit.end() + 30
            context = body[lo:hi]
            found.append(
                f"RuleFlag: pattern '{rule.pattern}' matched around: {context!r}"
            )

    # Without a loaded model there is nothing more to check.
    if not hf_pipe:
        return found

    try:
        # Input is truncated to keep inference small.
        top = hf_pipe(body[:2000])[0]
        label = top["label"]
        confidence = float(top["score"])
        # Labels treated as "potential violation"; adjust to the model's label set.
        is_violation_label = label in ("1", "LABEL_1", "violation", "POSITIVE")
        if is_violation_label and confidence >= HF_THRESH:
            found.append(
                f"MLFlag: model={HF_REPO} label={label} score={confidence:.2f}"
            )
    except Exception as err:
        found.append(f"MLFlag: inference error: {err}")

    return found


# -----------------------------
# Core evaluation logic
# -----------------------------

def evaluate_section(
    text: str,
    social: bool,
    company_name: str,
    company_phones: List[str],
    agent_name: str,
    agent_phone: str,
    disclaimer: str,
    require_disclaimer_on_social: bool,
) -> Dict[str, Any]:
    """Evaluate one piece of text against the name, phone and disclaimer rules.

    A section is compliant when the company name appears exactly as often as
    the agent name, the office phone numbers appear exactly as often as the
    agent phone, and — only when *social* and *require_disclaimer_on_social*
    are both truthy — the disclaimer is present in *text*.

    Returns:
        ``{"compliant": bool, "Flags": [str, ...]}`` where ``Flags`` lists, in
        order, the missing-disclaimer, name-imbalance and phone-imbalance
        messages that apply.
    """
    # Occurrence counts for each side of the balance checks.
    company_names = count_name_instances(text, company_name)
    agent_names = count_name_instances(text, agent_name)
    agent_numbers = [agent_phone] if agent_phone else []
    office_phones = count_phone_instances(text, company_phones)
    agent_phones = count_phone_instances(text, agent_numbers)

    issues: List[str] = []

    # The disclaimer is only checked for social content when enforcement is on.
    needs_disclaimer = social and require_disclaimer_on_social
    has_disclaimer = contains_disclaimer(text, disclaimer) if needs_disclaimer else True
    if not has_disclaimer:
        issues.append("Missing disclaimer on social content")

    names_balanced = company_names == agent_names
    if not names_balanced:
        issues.append(
            f"Name imbalance: company={company_names} vs agent={agent_names}"
        )

    phones_balanced = office_phones == agent_phones
    if not phones_balanced:
        issues.append(
            f"Phone imbalance: office={office_phones} vs agent={agent_phones}"
        )

    return {
        "compliant": names_balanced and phones_balanced and has_disclaimer,
        "Flags": issues,
    }


# -----------------------------
# OCR helper (optional)
# -----------------------------

def ocr_image(image: Union["Image.Image", bytes, None]) -> str:
    """Return the text pytesseract extracts from *image*, or "" on any failure.

    *image* may be an image object or raw ``bytes``; bytes are opened through
    PIL and converted to RGB first. An empty string is returned when *image*
    is ``None``, when pytesseract is not importable, when bytes are given but
    PIL is not importable, or when opening/recognition raises.
    """
    unavailable = image is None or pytesseract is None
    if unavailable:
        return ""
    try:
        picture = image
        if isinstance(image, bytes):
            if Image is None:
                return ""
            import io

            picture = Image.open(io.BytesIO(image)).convert("RGB")
        extracted = pytesseract.image_to_string(picture)  # type: ignore[arg-type]
    except Exception:
        # Best-effort helper: OCR problems degrade to "no text".
        return ""
    return extracted


# -----------------------------
# Orchestration (UI-agnostic)
# -----------------------------

def run_check(
    image: Optional["Image.Image"],
    ptxt: str,
    social: bool,
    agent_name: str,
    agent_phone: str,
    *,
    company_name: str = COMPANY_NAME_DEFAULT,
    company_phones: Optional[List[str]] = None,
    disclaimer: str = DISCLAIMER_DEFAULT,
    require_disclaimer_on_social: Optional[bool] = None,
) -> Dict[str, Any]:
    """Run OCR, the fair-housing check and both section checks.

    A falsy *company_phones* falls back to ``COMPANY_PHONES_DEFAULT`` and a
    ``None`` *require_disclaimer_on_social* falls back to
    ``REQUIRE_DISCLAIMER_ON_SOCIAL``.

    Returns:
        A dict with three ``{"compliant": bool, "Flags": [...]}`` entries:

        - ``Fair_Housing``: flags over the OCR text, the post text and a
          literal ``Social=<value>`` marker joined by blank lines.
        - ``img``: section evaluation of the OCR text alone.
        - ``Ptxt``: section evaluation of the post text alone.
    """
    phones = company_phones or COMPANY_PHONES_DEFAULT
    enforce_disclaimer = (
        REQUIRE_DISCLAIMER_ON_SOCIAL
        if require_disclaimer_on_social is None
        else require_disclaimer_on_social
    )

    image_text = ocr_image(image)
    post_text = ptxt or ""

    # Empty parts are dropped; the "Social=..." marker is always non-empty, so
    # the combined text passed to the classifier is never blank.
    pieces = [image_text, post_text, f"Social={social}"]
    combined = "\n\n".join(piece for piece in pieces if piece)
    violations = fair_housing_flags(combined)

    # Arguments shared by the image-text and post-text evaluations.
    shared = dict(
        social=social,
        company_name=company_name,
        company_phones=phones,
        agent_name=agent_name,
        agent_phone=agent_phone,
        disclaimer=disclaimer,
        require_disclaimer_on_social=enforce_disclaimer,
    )

    return {
        "Fair_Housing": {"compliant": not violations, "Flags": violations},
        "img": evaluate_section(text=image_text, **shared),
        "Ptxt": evaluate_section(text=post_text, **shared),
    }


# Public API of this module: the names exported by `from checker import *`.
__all__ = [
    # Configuration constants
    "COMPANY_NAME_DEFAULT",
    "COMPANY_PHONES_DEFAULT",
    "DISCLAIMER_DEFAULT",
    "REQUIRE_DISCLAIMER_ON_SOCIAL",
    "USE_TINY_ML",
    "HF_REPO",
    "HF_THRESH",
    "PHRASES_PATH",
    # Text-matching helpers
    "count_phone_instances",
    "count_name_instances",
    "contains_disclaimer",
    # Checks and orchestration
    "fair_housing_flags",
    "evaluate_section",
    "ocr_image",
    "run_check",
]