"""scripts/backfill_zillow_photos.py — Buscar fotos de Zillow para deals sin foto. ESTRATEGIA: 1. Query deals WHERE source IN (clerks) AND photos_urls IS NULL/empty 2. Para cada deal: address + county + 'FL' → fetch_zillow_photos_by_address 3. UPDATE photos_urls (JSON list) 4. Si no encuentra: graba '[]' como sentinel (no re-intentar) RATE LIMIT: 1.2s entre requests (Firecrawl + ser ciudadano de bien con Zillow). BUDGET: 1 credit por request. USO: python scripts/backfill_zillow_photos.py # all clerks pendientes python scripts/backfill_zillow_photos.py --limit 20 # solo 20 deals python scripts/backfill_zillow_photos.py --source duval_clerk # un source """ from __future__ import annotations import argparse, io, json, sys, time from pathlib import Path from collections import Counter sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) from dotenv import load_dotenv load_dotenv(ROOT / ".env") from deals_db import init_db, _get_conn, firecrawl_budget_status, record_firecrawl_usage from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address CLERK_SOURCES = ( "miami_dade_clerk", "duval_clerk", "broward_clerk", "hillsborough_clerk", "orange_clerk", "indian_river_clerk", "st_lucie_clerk", "martin_clerk", "lee_clerk", "sarasota_clerk", "pinellas_clerk", "volusia_clerk", "brevard_clerk", "pasco_clerk", "polk_clerk", "lake_clerk", "osceola_clerk", "seminole_clerk", "manatee_clerk", "marion_clerk", ) RATE_LIMIT_SECONDS = 1.2 def build_zillow_query_address(deal: dict) -> str: """Construye el address string para query Zillow. Format: 'STREET, {COUNTY} COUNTY, {STATE}' Probado en small sample: county-level query da mejor hit rate que sin city (Zillow no encuentra) o con city wrong (mistakeo de ciudad). """ addr = (deal.get("address") or "").strip() county = (deal.get("county") or "").strip() state = (deal.get("state") or "FL").strip() if county: # Strip "County" suffix si ya viene en el campo c = county.replace(" County", "").strip() return f"{addr}, {c} COUNTY, {state}" return f"{addr}, {state}" def main(): ap = argparse.ArgumentParser() ap.add_argument("--limit", type=int, default=None, help="Cap on deals to process") ap.add_argument("--source", type=str, default=None, help="Filter to one source") ap.add_argument("--max-budget-pct", type=int, default=80, help="Stop si Firecrawl usage hits N%% del budget mensual") ap.add_argument("--dry-run", action="store_true", help="No DB writes") args = ap.parse_args() init_db() conn = _get_conn() # Build query: clerks with NO photos source_filter = ( " AND source = ?" if args.source else " AND source IN ({})".format(",".join("?" * len(CLERK_SOURCES))) ) params = [args.source] if args.source else list(CLERK_SOURCES) query = ( "SELECT id, source, address, city, state, county FROM deals " "WHERE (photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]')" + source_filter + " AND address IS NOT NULL AND address != ''" " ORDER BY id" ) if args.limit: query += f" LIMIT {args.limit}" rows = conn.execute(query, params).fetchall() total = len(rows) print(f"Found {total} deals sin foto en clerks") if not total: return 0 # Budget check b = firecrawl_budget_status() print(f"Firecrawl budget: {b['credits_used']}/{b['credits_budget']} " f"used ({b['usage_pct']}%) — pause threshold {b['pause_threshold_pct']}%") if b["usage_pct"] >= args.max_budget_pct: print(f"STOP: budget usage ({b['usage_pct']}%) >= max-budget-pct ({args.max_budget_pct}%)") return 1 stats = Counter() t0 = time.perf_counter() for i, row in enumerate(rows, 1): deal = dict(row) addr_query = build_zillow_query_address(deal) # Re-check budget mid-loop every 20 deals if i % 20 == 0: b = firecrawl_budget_status() if b["usage_pct"] >= args.max_budget_pct: print(f"[{i}/{total}] Budget reached {b['usage_pct']}% — stopping") break # Throttle if i > 1: time.sleep(RATE_LIMIT_SECONDS) print(f"[{i}/{total}] deal #{deal['id']} ({deal['source']}) — {addr_query[:70]}") photos, meta = fetch_zillow_photos_by_address(addr_query) # Record Firecrawl usage if meta.get("credits_used") and not args.dry_run: try: record_firecrawl_usage( source="zillow_photo_backfill", credits=meta["credits_used"], url=meta.get("url_attempted"), description=f"photos for clerk deal id={deal['id']} ({deal['source']})", ) except Exception as e: print(f" [warn] could not record firecrawl_usage: {e}") if photos: stats["found"] += 1 print(f" OK: {len(photos)} fotos") if not args.dry_run: conn.execute( "UPDATE deals SET photos_urls = ?, last_seen_at = ? WHERE id = ?", (json.dumps(photos), time.strftime("%Y-%m-%dT%H:%M:%S"), deal["id"]), ) else: stats["empty"] += 1 reason = meta.get("error") or "no photos in markdown" print(f" EMPTY: {reason[:100]}") if not args.dry_run: # Mark as attempted by setting empty list (vs NULL) conn.execute( "UPDATE deals SET photos_urls = ? WHERE id = ?", ("[]", deal["id"]), ) elapsed = time.perf_counter() - t0 print() print("=" * 60) print(f"DONE in {elapsed:.0f}s") print(f" Photos found: {stats['found']}/{total}") print(f" Empty result: {stats['empty']}/{total}") print(f" Hit rate: {stats['found']/total*100:.1f}%") b = firecrawl_budget_status() print(f" Firecrawl now: {b['credits_used']}/{b['credits_budget']} ({b['usage_pct']}%)") return 0 if __name__ == "__main__": sys.exit(main())