Files
AR-House/scripts/backfill_zillow_photos.py
T
2026-07-03 12:24:58 -04:00

171 lines
6.2 KiB
Python

"""scripts/backfill_zillow_photos.py — Buscar fotos de Zillow para deals sin foto.
ESTRATEGIA:
1. Query deals WHERE source IN (clerks) AND photos_urls IS NULL/empty
2. Para cada deal: address + county + 'FL' → fetch_zillow_photos_by_address
3. UPDATE photos_urls (JSON list)
4. Si no encuentra: graba '[]' como sentinel (no re-intentar)
RATE LIMIT: 1.2s entre requests (Firecrawl + ser ciudadano de bien con Zillow).
BUDGET: 1 credit por request.
USO:
python scripts/backfill_zillow_photos.py # all clerks pendientes
python scripts/backfill_zillow_photos.py --limit 20 # solo 20 deals
python scripts/backfill_zillow_photos.py --source duval_clerk # un source
"""
from __future__ import annotations
import argparse, io, json, sys, time
from pathlib import Path
from collections import Counter
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from dotenv import load_dotenv
load_dotenv(ROOT / ".env")
from deals_db import init_db, _get_conn, firecrawl_budget_status, record_firecrawl_usage
from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
CLERK_SOURCES = (
"miami_dade_clerk", "duval_clerk", "broward_clerk",
"hillsborough_clerk", "orange_clerk", "indian_river_clerk",
"st_lucie_clerk", "martin_clerk", "lee_clerk", "sarasota_clerk",
"pinellas_clerk", "volusia_clerk", "brevard_clerk", "pasco_clerk",
"polk_clerk", "lake_clerk", "osceola_clerk", "seminole_clerk",
"manatee_clerk", "marion_clerk",
)
RATE_LIMIT_SECONDS = 1.2
def build_zillow_query_address(deal: dict) -> str:
"""Construye el address string para query Zillow.
Format: 'STREET, {COUNTY} COUNTY, {STATE}'
Probado en small sample: county-level query da mejor hit rate
que sin city (Zillow no encuentra) o con city wrong (mistakeo de ciudad).
"""
addr = (deal.get("address") or "").strip()
county = (deal.get("county") or "").strip()
state = (deal.get("state") or "FL").strip()
if county:
# Strip "County" suffix si ya viene en el campo
c = county.replace(" County", "").strip()
return f"{addr}, {c} COUNTY, {state}"
return f"{addr}, {state}"
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--limit", type=int, default=None, help="Cap on deals to process")
ap.add_argument("--source", type=str, default=None, help="Filter to one source")
ap.add_argument("--max-budget-pct", type=int, default=80,
help="Stop si Firecrawl usage hits N%% del budget mensual")
ap.add_argument("--dry-run", action="store_true", help="No DB writes")
args = ap.parse_args()
init_db()
conn = _get_conn()
# Build query: clerks with NO photos
source_filter = (
" AND source = ?" if args.source else
" AND source IN ({})".format(",".join("?" * len(CLERK_SOURCES)))
)
params = [args.source] if args.source else list(CLERK_SOURCES)
query = (
"SELECT id, source, address, city, state, county FROM deals "
"WHERE (photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]')"
+ source_filter
+ " AND address IS NOT NULL AND address != ''"
" ORDER BY id"
)
if args.limit:
query += f" LIMIT {args.limit}"
rows = conn.execute(query, params).fetchall()
total = len(rows)
print(f"Found {total} deals sin foto en clerks")
if not total:
return 0
# Budget check
b = firecrawl_budget_status()
print(f"Firecrawl budget: {b['credits_used']}/{b['credits_budget']} "
f"used ({b['usage_pct']}%) — pause threshold {b['pause_threshold_pct']}%")
if b["usage_pct"] >= args.max_budget_pct:
print(f"STOP: budget usage ({b['usage_pct']}%) >= max-budget-pct ({args.max_budget_pct}%)")
return 1
stats = Counter()
t0 = time.perf_counter()
for i, row in enumerate(rows, 1):
deal = dict(row)
addr_query = build_zillow_query_address(deal)
# Re-check budget mid-loop every 20 deals
if i % 20 == 0:
b = firecrawl_budget_status()
if b["usage_pct"] >= args.max_budget_pct:
print(f"[{i}/{total}] Budget reached {b['usage_pct']}% — stopping")
break
# Throttle
if i > 1:
time.sleep(RATE_LIMIT_SECONDS)
print(f"[{i}/{total}] deal #{deal['id']} ({deal['source']}) — {addr_query[:70]}")
photos, meta = fetch_zillow_photos_by_address(addr_query)
# Record Firecrawl usage
if meta.get("credits_used") and not args.dry_run:
try:
record_firecrawl_usage(
source="zillow_photo_backfill",
credits=meta["credits_used"],
url=meta.get("url_attempted"),
description=f"photos for clerk deal id={deal['id']} ({deal['source']})",
)
except Exception as e:
print(f" [warn] could not record firecrawl_usage: {e}")
if photos:
stats["found"] += 1
print(f" OK: {len(photos)} fotos")
if not args.dry_run:
conn.execute(
"UPDATE deals SET photos_urls = ?, last_seen_at = ? WHERE id = ?",
(json.dumps(photos), time.strftime("%Y-%m-%dT%H:%M:%S"), deal["id"]),
)
else:
stats["empty"] += 1
reason = meta.get("error") or "no photos in markdown"
print(f" EMPTY: {reason[:100]}")
if not args.dry_run:
# Mark as attempted by setting empty list (vs NULL)
conn.execute(
"UPDATE deals SET photos_urls = ? WHERE id = ?",
("[]", deal["id"]),
)
elapsed = time.perf_counter() - t0
print()
print("=" * 60)
print(f"DONE in {elapsed:.0f}s")
print(f" Photos found: {stats['found']}/{total}")
print(f" Empty result: {stats['empty']}/{total}")
print(f" Hit rate: {stats['found']/total*100:.1f}%")
b = firecrawl_budget_status()
print(f" Firecrawl now: {b['credits_used']}/{b['credits_budget']} ({b['usage_pct']}%)")
return 0
if __name__ == "__main__":
sys.exit(main())