171 lines
6.2 KiB
Python
171 lines
6.2 KiB
Python
"""scripts/backfill_zillow_photos.py — Buscar fotos de Zillow para deals sin foto.
|
|
|
|
ESTRATEGIA:
|
|
1. Query deals WHERE source IN (clerks) AND photos_urls IS NULL/empty
|
|
2. Para cada deal: address + county + 'FL' → fetch_zillow_photos_by_address
|
|
3. UPDATE photos_urls (JSON list)
|
|
4. Si no encuentra: graba '[]' como sentinel (no re-intentar)
|
|
|
|
RATE LIMIT: 1.2s entre requests (Firecrawl + ser ciudadano de bien con Zillow).
|
|
BUDGET: 1 credit por request.
|
|
|
|
USO:
|
|
python scripts/backfill_zillow_photos.py # all clerks pendientes
|
|
python scripts/backfill_zillow_photos.py --limit 20 # solo 20 deals
|
|
python scripts/backfill_zillow_photos.py --source duval_clerk # un source
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse, io, json, sys, time
|
|
from pathlib import Path
|
|
from collections import Counter
|
|
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from dotenv import load_dotenv
|
|
load_dotenv(ROOT / ".env")
|
|
|
|
from deals_db import init_db, _get_conn, firecrawl_budget_status, record_firecrawl_usage
|
|
from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
|
|
|
|
|
|
CLERK_SOURCES = (
|
|
"miami_dade_clerk", "duval_clerk", "broward_clerk",
|
|
"hillsborough_clerk", "orange_clerk", "indian_river_clerk",
|
|
"st_lucie_clerk", "martin_clerk", "lee_clerk", "sarasota_clerk",
|
|
"pinellas_clerk", "volusia_clerk", "brevard_clerk", "pasco_clerk",
|
|
"polk_clerk", "lake_clerk", "osceola_clerk", "seminole_clerk",
|
|
"manatee_clerk", "marion_clerk",
|
|
)
|
|
|
|
RATE_LIMIT_SECONDS = 1.2
|
|
|
|
|
|
def build_zillow_query_address(deal: dict) -> str:
|
|
"""Construye el address string para query Zillow.
|
|
|
|
Format: 'STREET, {COUNTY} COUNTY, {STATE}'
|
|
Probado en small sample: county-level query da mejor hit rate
|
|
que sin city (Zillow no encuentra) o con city wrong (mistakeo de ciudad).
|
|
"""
|
|
addr = (deal.get("address") or "").strip()
|
|
county = (deal.get("county") or "").strip()
|
|
state = (deal.get("state") or "FL").strip()
|
|
|
|
if county:
|
|
# Strip "County" suffix si ya viene en el campo
|
|
c = county.replace(" County", "").strip()
|
|
return f"{addr}, {c} COUNTY, {state}"
|
|
return f"{addr}, {state}"
|
|
|
|
|
|
def main():
|
|
ap = argparse.ArgumentParser()
|
|
ap.add_argument("--limit", type=int, default=None, help="Cap on deals to process")
|
|
ap.add_argument("--source", type=str, default=None, help="Filter to one source")
|
|
ap.add_argument("--max-budget-pct", type=int, default=80,
|
|
help="Stop si Firecrawl usage hits N%% del budget mensual")
|
|
ap.add_argument("--dry-run", action="store_true", help="No DB writes")
|
|
args = ap.parse_args()
|
|
|
|
init_db()
|
|
conn = _get_conn()
|
|
|
|
# Build query: clerks with NO photos
|
|
source_filter = (
|
|
" AND source = ?" if args.source else
|
|
" AND source IN ({})".format(",".join("?" * len(CLERK_SOURCES)))
|
|
)
|
|
params = [args.source] if args.source else list(CLERK_SOURCES)
|
|
query = (
|
|
"SELECT id, source, address, city, state, county FROM deals "
|
|
"WHERE (photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]')"
|
|
+ source_filter
|
|
+ " AND address IS NOT NULL AND address != ''"
|
|
" ORDER BY id"
|
|
)
|
|
if args.limit:
|
|
query += f" LIMIT {args.limit}"
|
|
|
|
rows = conn.execute(query, params).fetchall()
|
|
total = len(rows)
|
|
print(f"Found {total} deals sin foto en clerks")
|
|
if not total:
|
|
return 0
|
|
|
|
# Budget check
|
|
b = firecrawl_budget_status()
|
|
print(f"Firecrawl budget: {b['credits_used']}/{b['credits_budget']} "
|
|
f"used ({b['usage_pct']}%) — pause threshold {b['pause_threshold_pct']}%")
|
|
if b["usage_pct"] >= args.max_budget_pct:
|
|
print(f"STOP: budget usage ({b['usage_pct']}%) >= max-budget-pct ({args.max_budget_pct}%)")
|
|
return 1
|
|
|
|
stats = Counter()
|
|
t0 = time.perf_counter()
|
|
|
|
for i, row in enumerate(rows, 1):
|
|
deal = dict(row)
|
|
addr_query = build_zillow_query_address(deal)
|
|
|
|
# Re-check budget mid-loop every 20 deals
|
|
if i % 20 == 0:
|
|
b = firecrawl_budget_status()
|
|
if b["usage_pct"] >= args.max_budget_pct:
|
|
print(f"[{i}/{total}] Budget reached {b['usage_pct']}% — stopping")
|
|
break
|
|
|
|
# Throttle
|
|
if i > 1:
|
|
time.sleep(RATE_LIMIT_SECONDS)
|
|
|
|
print(f"[{i}/{total}] deal #{deal['id']} ({deal['source']}) — {addr_query[:70]}")
|
|
photos, meta = fetch_zillow_photos_by_address(addr_query)
|
|
|
|
# Record Firecrawl usage
|
|
if meta.get("credits_used") and not args.dry_run:
|
|
try:
|
|
record_firecrawl_usage(
|
|
source="zillow_photo_backfill",
|
|
credits=meta["credits_used"],
|
|
url=meta.get("url_attempted"),
|
|
description=f"photos for clerk deal id={deal['id']} ({deal['source']})",
|
|
)
|
|
except Exception as e:
|
|
print(f" [warn] could not record firecrawl_usage: {e}")
|
|
|
|
if photos:
|
|
stats["found"] += 1
|
|
print(f" OK: {len(photos)} fotos")
|
|
if not args.dry_run:
|
|
conn.execute(
|
|
"UPDATE deals SET photos_urls = ?, last_seen_at = ? WHERE id = ?",
|
|
(json.dumps(photos), time.strftime("%Y-%m-%dT%H:%M:%S"), deal["id"]),
|
|
)
|
|
else:
|
|
stats["empty"] += 1
|
|
reason = meta.get("error") or "no photos in markdown"
|
|
print(f" EMPTY: {reason[:100]}")
|
|
if not args.dry_run:
|
|
# Mark as attempted by setting empty list (vs NULL)
|
|
conn.execute(
|
|
"UPDATE deals SET photos_urls = ? WHERE id = ?",
|
|
("[]", deal["id"]),
|
|
)
|
|
|
|
elapsed = time.perf_counter() - t0
|
|
print()
|
|
print("=" * 60)
|
|
print(f"DONE in {elapsed:.0f}s")
|
|
print(f" Photos found: {stats['found']}/{total}")
|
|
print(f" Empty result: {stats['empty']}/{total}")
|
|
print(f" Hit rate: {stats['found']/total*100:.1f}%")
|
|
b = firecrawl_budget_status()
|
|
print(f" Firecrawl now: {b['credits_used']}/{b['credits_budget']} ({b['usage_pct']}%)")
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|