Files
AR-House/scripts/backfill_pa_photos.py
2026-07-03 12:24:58 -04:00

102 lines
3.2 KiB
Python

"""Backfill clerk deal photos via County Property Appraiser sites (GRATIS).
Alternative to backfill_zillow_photos.py: uses Playwright against the PA
sites, at zero Firecrawl cost.
Current coverage: Broward only (~70 deals). Phase 3.5.B: add Duval, etc.
Only processes deals that do NOT have a photo yet. Idempotent.
"""
from __future__ import annotations
import argparse, io, json, sys, time
from pathlib import Path
# Re-wrap stdout as UTF-8, replacing unencodable characters instead of raising.
# Presumably needed so the non-ASCII status markers this script prints do not
# crash on consoles whose default encoding is not UTF-8 -- confirm.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
# Directory one level above the folder containing this script. It is put first
# on sys.path so the project imports that follow (deals_db, data_fetchers) resolve.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from deals_db import init_db, _get_conn
from data_fetchers.pa_photo_lookup import _fetch_broward_batch
# Supported counties: the value accepted by --county mapped to the value
# matched against the `source` column of the `deals` table.
# TODO(Phase 3.5.B): enable "Duval" -> "duval_clerk" and
# "Hillsborough" -> "hillsborough_clerk".
COUNTY_TO_SOURCE: dict[str, str] = {
    "Broward": "broward_clerk",
}
def main():
    """Backfill missing photo URLs for one county's clerk deals.

    Parses ``--county``, ``--limit`` and ``--dry-run`` from the command line,
    selects the deals of that county's source that have a parcel id but no
    photo (``photos_urls`` is NULL, empty, or ``'[]'``), fetches photos for
    their parcel ids in a single batch, and stores the outcome on each deal:
    a one-element JSON list on a hit, ``'[]'`` on a miss. With ``--dry-run``
    the fetch still runs and the summary is printed, but nothing is written.

    Returns:
        Process exit code: 1 when the county is not in ``COUNTY_TO_SOURCE``,
        0 otherwise (including when there is nothing to backfill).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--county", default="Broward", help="County to backfill (default Broward)")
    ap.add_argument("--limit", type=int, default=None)
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    # Reject unsupported counties before initializing or opening the database.
    source = COUNTY_TO_SOURCE.get(args.county)
    if not source:
        print(f"ERROR: county '{args.county}' not yet supported. Available: {list(COUNTY_TO_SOURCE.keys())}")
        return 1

    init_db()
    conn = _get_conn()

    # Find clerk deals WITHOUT photo. Rows previously marked '[]' (a recorded
    # miss) are selected again, so misses are retried on every run.
    q = (
        "SELECT id, parcel_id, address FROM deals "
        "WHERE source = ? "
        "AND parcel_id IS NOT NULL AND parcel_id != '' "
        "AND (photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]') "
        "ORDER BY id"
    )
    params = [source]
    # A limit of 0 (or no --limit) means "no limit", as before; the value is
    # bound as a parameter rather than spliced into the SQL text.
    if args.limit:
        q += " LIMIT ?"
        params.append(args.limit)
    rows = conn.execute(q, tuple(params)).fetchall()
    print(f"Found {len(rows)} {args.county} deals sin foto")
    if not rows:
        return 0

    # Several deals can share a parcel; look each parcel up only once
    # (first-seen order is preserved).
    parcel_ids = list(dict.fromkeys(r["parcel_id"] for r in rows))
    print(f"Starting batch fetch via {args.county} PA (Playwright, gratis)...")
    print(f"Estimated time: ~{len(parcel_ids) * 12}s ({len(parcel_ids) * 12 // 60}m)")
    print()

    # NOTE(review): the fetcher is Broward-specific regardless of --county.
    # That is fine while COUNTY_TO_SOURCE only lists Broward, but it must be
    # dispatched per county once more counties are enabled.
    t0 = time.perf_counter()
    results = _fetch_broward_batch(parcel_ids, timeout_seconds=20)
    elapsed = time.perf_counter() - t0

    hits = 0
    misses = 0
    for r in rows:
        photo = results.get(r["parcel_id"])
        if photo:
            hits += 1
            payload = json.dumps([photo])
            # Only the tail of the URL is shown to keep the line short.
            status = f" ✓ id={r['id']} parcel={r['parcel_id']} {photo[-60:]}"
        else:
            misses += 1
            payload = "[]"
            status = f" ✗ id={r['id']} parcel={r['parcel_id']} no photo found"
        if not args.dry_run:
            conn.execute(
                "UPDATE deals SET photos_urls = ? WHERE id = ?",
                (payload, r["id"]),
            )
        print(status)

    if not args.dry_run:
        # Persist the updates explicitly: without a commit they may be rolled
        # back when the connection closes.
        # NOTE(review): assumes _get_conn() returns a DB-API style connection
        # exposing commit() (harmless if it is already autocommit) -- confirm.
        conn.commit()

    print()
    print("=" * 50)
    print(f"DONE in {elapsed:.0f}s ({elapsed/60:.1f} min)")
    print(f" Hits: {hits}/{len(rows)}")
    print(f" Misses: {misses}/{len(rows)}")
    # len(rows) is non-zero here because the empty case returned early.
    print(f" Hit rate: {hits*100//len(rows)}%")
    print(" Cost: $0 (Playwright gratis)")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())