Spaces:

tlogandesigns
/

image-text-compliance

Sleeping

App Files Files Community

image-text-compliance / checker.py

tlogandesigns

add init

9da12e6 3 months ago

raw

history blame

9.41 kB

	"""
	checker.py — core logic for Image + Text Compliance Check

	This module is UI-agnostic (no FastAPI/Gradio). Import its functions from
	app.py (Gradio) or an API layer. CPU-only; optional tiny HF classifier via env.
	"""
	from __future__ import annotations

	from typing import List, Optional, Dict, Any, Iterable, Union
	import os
	import re
	import json

	try:
	from PIL import Image # type: ignore
	except Exception:
	Image = None # Allows import without PIL when not doing OCR

	try:
	import pytesseract # type: ignore
	except Exception:
	pytesseract = None

	# -----------------------------
	# Config & Constants
	# -----------------------------
	# Brokerage identity used when the caller does not override it.
	COMPANY_NAME_DEFAULT = "Berkshire Hathaway HomeServices Beazley, REALTORS"
	# Office numbers, stored as bare 10-digit strings.
	COMPANY_PHONES_DEFAULT = ["7068631775", "8032337111"]
	DISCLAIMER_DEFAULT = (
	    "©2025 BHH Affiliates, LLC. An independently owned and operated franchisee of BHH Affiliates, LLC. "
	    "Berkshire Hathaway HomeServices and the Berkshire Hathaway HomeServices symbol are registered service marks "
	    "of Columbia Insurance Company, a Berkshire Hathaway affiliate. Equal Housing Opportunity."
	)


	def _env_flag(name: str, default: bool) -> bool:
	    """Read a boolean switch from the environment.

	    Accepts "1", "true", "yes" and "on" (any case, surrounding whitespace
	    ignored) as True; every other value is False. Returns *default* when the
	    variable is not set at all.
	    """
	    raw = os.getenv(name)
	    if raw is None:
	        return default
	    return raw.strip().lower() in ("1", "true", "yes", "on")


	def _env_float(name: str, default: float) -> float:
	    """Read a float from the environment, falling back to *default*.

	    A missing variable yields *default*; a value that ``float()`` rejects is
	    reported on stdout and also yields *default*, so a typo in the
	    environment cannot make the module fail at import time.
	    """
	    raw = os.getenv(name)
	    if raw is None:
	        return default
	    try:
	        return float(raw)
	    except ValueError:
	        print(f"Ignoring invalid {name}={raw!r}; using {default}")
	        return default


	# Behavior toggle for social posts requiring disclaimer (1/true/yes/on enable it)
	REQUIRE_DISCLAIMER_ON_SOCIAL = _env_flag("REQUIRE_DISCLAIMER_ON_SOCIAL", True)

	# Optional HF classifier (tiny) – set USE_TINY_ML=1 to enable
	USE_TINY_ML = _env_flag("USE_TINY_ML", False)
	HF_REPO = os.getenv("HF_REPO", "tlogandesigns/fairhousing-bert-tiny")
	# Minimum model score for an ML flag to be raised.
	HF_THRESH = _env_float("HF_THRESH", 0.75)

	# Rule-based phrases file (optional). If present, we use it for flags.
	PHRASES_PATH = os.getenv("PHRASES_PATH", "phrases.yaml")

	# -----------------------------
	# Utilities
	# -----------------------------
	# Matches a 10-digit North American phone number and captures it as three
	# groups (area code, exchange, line number) so callers can "".join(groups).
	# Separators are optional and limited to three non-digit characters, which
	# covers "7068631775", "706-863-1775", "(706) 863-1775" and "+1 706.863.1775"
	# without letting unrelated numbers far apart be glued into one match.
	PHONE_RE = re.compile(
	    r"(?<!\d)"             # do not start inside a longer run of digits
	    r"(?:\+?1\D{0,3})?"    # optional country code, e.g. "+1 " or "1-"
	    r"([2-9]\d{2})\D{0,3}"  # area code (never starts with 0 or 1)
	    r"(\d{3})\D{0,3}"       # exchange
	    r"(\d{4})"              # line number
	    r"(?!\d)"              # do not stop inside a longer run of digits
	)


	def normalize_phone(s: str) -> str:
	    """Reduce a phone number to its digits, dropping a leading US country code.

	    Every non-digit character is discarded. If exactly 11 digits remain and
	    the first is "1", that leading "1" is removed. ``None`` or an empty
	    string yields "".
	    """
	    only_digits = "".join(re.findall(r"\d", s or ""))
	    has_country_code = len(only_digits) == 11 and only_digits[0] == "1"
	    return only_digits[1:] if has_country_code else only_digits


	def count_phone_instances(text: str, target_numbers: Iterable[str]) -> int:
	    """Count phone numbers in *text* that equal one of *target_numbers*.

	    Targets are normalized with ``normalize_phone`` (falsy entries are
	    skipped). Each ``PHONE_RE`` match in *text* is reduced to its captured
	    digits and counted when that digit string is among the targets.
	    """
	    wanted = set()
	    for raw in target_numbers or []:
	        if raw:
	            wanted.add(normalize_phone(raw))

	    return sum(
	        1
	        for hit in PHONE_RE.finditer(text or "")
	        if "".join(hit.groups()) in wanted
	    )


	def escape_name_regex(name: str) -> str:
	    """Build a regex source string that finds *name* in free text.

	    The name is split on whitespace, hyphens, periods and commas, so
	    punctuation attached to a word ("Realty," or "Inc.") is not required in
	    the text being searched. Tokens are escaped and joined by one or more of
	    those same separator characters. The match must not be preceded or
	    followed by a word character; lookarounds are used instead of ``\\b`` so
	    the boundary also works when a token begins or ends with punctuation.

	    Returns "" when *name* is empty, ``None`` or whitespace only. The result
	    contains no capturing groups. Case-insensitivity is left to the caller's
	    compile flags.
	    """
	    raw = name or ""
	    # Split on the characters the joiner accepts so they become optional.
	    tokens = [tok for tok in re.split(r"[\s\-.,]+", raw) if tok]
	    if not tokens:
	        # Name made only of separator characters: keep it as literal tokens
	        # rather than returning an empty pattern that would match everywhere.
	        tokens = raw.split()
	    if not tokens:
	        return r""  # no name
	    body = r"[\s\-.,]+".join(re.escape(tok) for tok in tokens)
	    return r"(?<!\w)" + body + r"(?!\w)"


	def count_name_instances(text: str, name: str) -> int:
	    """Count case-insensitive occurrences of *name* in *text*.

	    Returns 0 when *name* is empty, ``None`` or whitespace only. Otherwise
	    the pattern produced by ``escape_name_regex`` is compiled with
	    ``re.IGNORECASE`` and its non-overlapping matches in *text* are counted.
	    """
	    cleaned = (name or "").strip()
	    if not cleaned:
	        return 0

	    matcher = re.compile(escape_name_regex(name), re.IGNORECASE)
	    return sum(1 for _ in matcher.finditer(text or ""))


	def contains_disclaimer(text: str, disclaimer: str) -> bool:
	    """Report whether *disclaimer* appears in *text*, ignoring layout and case.

	    Both strings are lower-cased, stripped, and have every whitespace run
	    collapsed to a single space before a plain substring test. A falsy
	    *disclaimer* always gives False.
	    """
	    if not disclaimer:
	        return False

	    # Normalize both sides identically so line wraps and extra spaces
	    # do not defeat the comparison.
	    needle, haystack = (
	        re.sub(r"\s+", " ", part or "").strip().lower()
	        for part in (disclaimer, text)
	    )
	    return needle in haystack


	# -----------------------------
	# Fair Housing Classifier (hybrid)
	# -----------------------------
	try:
	import yaml # type: ignore
	except Exception:
	yaml = None

	PHRASE_PATTERNS: List[re.Pattern] = []
	if yaml and os.path.exists(PHRASES_PATH):
	try:
	data = yaml.safe_load(open(PHRASES_PATH, "r", encoding="utf-8").read()) or {}
	for rx in data.get("patterns", []):
	# compile as case-insensitive
	PHRASE_PATTERNS.append(re.compile(rx, re.IGNORECASE))
	except Exception as e:
	print("Failed loading phrases.yaml:", e)

	# Optional HF pipeline (disabled by default to keep CPU/lightweight)
	# Text-classification pipeline, or None when the tiny model is switched off
	# or could not be loaded.
	hf_pipe = None
	if USE_TINY_ML:
	    try:
	        from transformers import pipeline  # type: ignore

	        hf_pipe = pipeline("text-classification", model=HF_REPO)
	    except Exception as e:
	        # Import or model loading failed; hf_pipe keeps its initial None.
	        print("HF model unavailable:", e)


	def fair_housing_flags(text: str) -> List[str]:
	    """Collect fair-housing flag messages for *text*.

	    Every match of every pattern in ``PHRASE_PATTERNS`` yields a "RuleFlag"
	    message quoting up to 30 characters of context on each side. When
	    ``hf_pipe`` is set, the first 2000 characters are also classified: a
	    violation-style label with a score of at least ``HF_THRESH`` adds an
	    "MLFlag" message. An exception during inference is itself reported as an
	    "MLFlag" message and is not raised.
	    """
	    body = text or ""
	    findings: List[str] = []

	    # Rule-based pass
	    for rule in PHRASE_PATTERNS:
	        for hit in rule.finditer(body):
	            lo = max(0, hit.start() - 30)
	            hi = hit.end() + 30
	            context = body[lo:hi]
	            findings.append(
	                f"RuleFlag: pattern '{rule.pattern}' matched around: {context!r}"
	            )

	    # Model pass is optional
	    if not hf_pipe:
	        return findings

	    try:
	        # Result is expected to look like [{'label': ..., 'score': ...}]
	        top = hf_pipe(body[:2000])[0]  # truncate to keep inference small
	        label = top["label"]
	        confidence = float(top["score"])
	        # These labels are assumed to mean "potential violation"
	        is_violation_label = label in ("1", "LABEL_1", "violation", "POSITIVE")
	        if is_violation_label and confidence >= HF_THRESH:
	            findings.append(
	                f"MLFlag: model={HF_REPO} label={label} score={confidence:.2f}"
	            )
	    except Exception as e:
	        findings.append(f"MLFlag: inference error: {e}")

	    return findings


	# -----------------------------
	# Core evaluation logic
	# -----------------------------

	def evaluate_section(
	    text: str,
	    social: bool,
	    company_name: str,
	    company_phones: List[str],
	    agent_name: str,
	    agent_phone: str,
	    disclaimer: str,
	    require_disclaimer_on_social: bool,
	) -> Dict[str, Any]:
	    """Check one piece of text against the name, phone and disclaimer rules.

	    The section is compliant when the company name appears as many times as
	    the agent name, the office phones appear as many times as the agent
	    phone, and, for social content with the requirement enabled, the
	    disclaimer is present.

	    Returns a dict with "compliant" (bool) and "Flags" (list of messages,
	    in the order disclaimer, name, phone).
	    """
	    company_names = count_name_instances(text, company_name)
	    agent_names = count_name_instances(text, agent_name)

	    agent_numbers = [agent_phone] if agent_phone else []
	    office_phones = count_phone_instances(text, company_phones)
	    agent_phones = count_phone_instances(text, agent_numbers)

	    issues: List[str] = []

	    # The disclaimer is only looked for when it is actually required.
	    needs_disclaimer = social and require_disclaimer_on_social
	    has_disclaimer = contains_disclaimer(text, disclaimer) if needs_disclaimer else True
	    if not has_disclaimer:
	        issues.append("Missing disclaimer on social content")

	    names_match = company_names == agent_names
	    if not names_match:
	        issues.append(
	            f"Name imbalance: company={company_names} vs agent={agent_names}"
	        )

	    phones_match = office_phones == agent_phones
	    if not phones_match:
	        issues.append(
	            f"Phone imbalance: office={office_phones} vs agent={agent_phones}"
	        )

	    return {
	        "compliant": names_match and phones_match and has_disclaimer,
	        "Flags": issues,
	    }


	# -----------------------------
	# OCR helper (optional)
	# -----------------------------

	def ocr_image(image: Union["Image.Image", bytes, None]) -> str:
	    """Extract text from an image via pytesseract.

	    *image* may be an image object or raw encoded bytes; bytes are opened
	    with PIL and converted to RGB first. Returns "" when *image* is None,
	    when pytesseract is unavailable, when bytes are given but PIL is
	    unavailable, or when any step raises.
	    """
	    if image is None:
	        return ""
	    if pytesseract is None:
	        return ""

	    try:
	        picture = image
	        if isinstance(picture, bytes):
	            if Image is None:
	                return ""
	            from io import BytesIO

	            picture = Image.open(BytesIO(picture)).convert("RGB")
	        return pytesseract.image_to_string(picture)  # type: ignore[arg-type]
	    except Exception:
	        # Best effort: OCR failure is treated as "no text found".
	        return ""


	# -----------------------------
	# Orchestration (UI-agnostic)
	# -----------------------------

	def run_check(
	    image: Optional["Image.Image"],
	    ptxt: str,
	    social: bool,
	    agent_name: str,
	    agent_phone: str,
	    *,
	    company_name: str = COMPANY_NAME_DEFAULT,
	    company_phones: Optional[List[str]] = None,
	    disclaimer: str = DISCLAIMER_DEFAULT,
	    require_disclaimer_on_social: Optional[bool] = None,
	) -> Dict[str, Any]:
	    """
	    Run OCR, the fair-housing scan and both section checks.

	    Returns a payload dict with keys:
	    - Fair_Housing: flags found in the combined image + post text
	    - img: evaluate_section result for the OCR'd image text
	    - Ptxt: evaluate_section result for the post text

	    A falsy company_phones falls back to COMPANY_PHONES_DEFAULT, and
	    require_disclaimer_on_social=None falls back to
	    REQUIRE_DISCLAIMER_ON_SOCIAL.
	    """
	    if not company_phones:
	        company_phones = COMPANY_PHONES_DEFAULT
	    if require_disclaimer_on_social is None:
	        require_disclaimer_on_social = REQUIRE_DISCLAIMER_ON_SOCIAL

	    image_text = ocr_image(image)
	    post_text = ptxt or ""

	    # The fair-housing scan sees all non-empty pieces joined together.
	    pieces = [image_text, post_text, f"Social={social}"]
	    combined = "\n\n".join(piece for piece in pieces if piece)
	    fh_flags = fair_housing_flags(combined)

	    # Both sections are judged with identical settings; only the text differs.
	    shared = dict(
	        social=social,
	        company_name=company_name,
	        company_phones=company_phones,
	        agent_name=agent_name,
	        agent_phone=agent_phone,
	        disclaimer=disclaimer,
	        require_disclaimer_on_social=require_disclaimer_on_social,
	    )

	    return {
	        "Fair_Housing": {"compliant": not fh_flags, "Flags": fh_flags},
	        "img": evaluate_section(text=image_text, **shared),
	        "Ptxt": evaluate_section(text=post_text, **shared),
	    }


	# Names exported by `from checker import *`.
	__all__ = [
	# Configuration constants
	"COMPANY_NAME_DEFAULT",
	"COMPANY_PHONES_DEFAULT",
	"DISCLAIMER_DEFAULT",
	"REQUIRE_DISCLAIMER_ON_SOCIAL",
	"USE_TINY_ML",
	"HF_REPO",
	"HF_THRESH",
	"PHRASES_PATH",
	# Text-matching helpers
	"count_phone_instances",
	"count_name_instances",
	"contains_disclaimer",
	# Checks and orchestration
	"fair_housing_flags",
	"evaluate_section",
	"ocr_image",
	"run_check",
	]