feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,170 @@
|
||||
"""scripts/backfill_zillow_photos.py — Buscar fotos de Zillow para deals sin foto.
|
||||
|
||||
ESTRATEGIA:
|
||||
1. Query deals WHERE source IN (clerks) AND photos_urls IS NULL/empty
|
||||
2. Para cada deal: address + county + 'FL' → fetch_zillow_photos_by_address
|
||||
3. UPDATE photos_urls (JSON list)
|
||||
4. Si no encuentra: graba '[]' como sentinel (no re-intentar)
|
||||
|
||||
RATE LIMIT: 1.2s entre requests (Firecrawl + ser ciudadano de bien con Zillow).
|
||||
BUDGET: 1 credit por request.
|
||||
|
||||
USO:
|
||||
python scripts/backfill_zillow_photos.py # all clerks pendientes
|
||||
python scripts/backfill_zillow_photos.py --limit 20 # solo 20 deals
|
||||
python scripts/backfill_zillow_photos.py --source duval_clerk # un source
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse, io, json, sys, time
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
||||
ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv(ROOT / ".env")
|
||||
|
||||
from deals_db import init_db, _get_conn, firecrawl_budget_status, record_firecrawl_usage
|
||||
from data_fetchers.zillow_photo_lookup import fetch_zillow_photos_by_address
|
||||
|
||||
|
||||
CLERK_SOURCES = (
|
||||
"miami_dade_clerk", "duval_clerk", "broward_clerk",
|
||||
"hillsborough_clerk", "orange_clerk", "indian_river_clerk",
|
||||
"st_lucie_clerk", "martin_clerk", "lee_clerk", "sarasota_clerk",
|
||||
"pinellas_clerk", "volusia_clerk", "brevard_clerk", "pasco_clerk",
|
||||
"polk_clerk", "lake_clerk", "osceola_clerk", "seminole_clerk",
|
||||
"manatee_clerk", "marion_clerk",
|
||||
)
|
||||
|
||||
RATE_LIMIT_SECONDS = 1.2
|
||||
|
||||
|
||||
def build_zillow_query_address(deal: dict) -> str:
|
||||
"""Construye el address string para query Zillow.
|
||||
|
||||
Format: 'STREET, {COUNTY} COUNTY, {STATE}'
|
||||
Probado en small sample: county-level query da mejor hit rate
|
||||
que sin city (Zillow no encuentra) o con city wrong (mistakeo de ciudad).
|
||||
"""
|
||||
addr = (deal.get("address") or "").strip()
|
||||
county = (deal.get("county") or "").strip()
|
||||
state = (deal.get("state") or "FL").strip()
|
||||
|
||||
if county:
|
||||
# Strip "County" suffix si ya viene en el campo
|
||||
c = county.replace(" County", "").strip()
|
||||
return f"{addr}, {c} COUNTY, {state}"
|
||||
return f"{addr}, {state}"
|
||||
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser()
|
||||
ap.add_argument("--limit", type=int, default=None, help="Cap on deals to process")
|
||||
ap.add_argument("--source", type=str, default=None, help="Filter to one source")
|
||||
ap.add_argument("--max-budget-pct", type=int, default=80,
|
||||
help="Stop si Firecrawl usage hits N%% del budget mensual")
|
||||
ap.add_argument("--dry-run", action="store_true", help="No DB writes")
|
||||
args = ap.parse_args()
|
||||
|
||||
init_db()
|
||||
conn = _get_conn()
|
||||
|
||||
# Build query: clerks with NO photos
|
||||
source_filter = (
|
||||
" AND source = ?" if args.source else
|
||||
" AND source IN ({})".format(",".join("?" * len(CLERK_SOURCES)))
|
||||
)
|
||||
params = [args.source] if args.source else list(CLERK_SOURCES)
|
||||
query = (
|
||||
"SELECT id, source, address, city, state, county FROM deals "
|
||||
"WHERE (photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]')"
|
||||
+ source_filter
|
||||
+ " AND address IS NOT NULL AND address != ''"
|
||||
" ORDER BY id"
|
||||
)
|
||||
if args.limit:
|
||||
query += f" LIMIT {args.limit}"
|
||||
|
||||
rows = conn.execute(query, params).fetchall()
|
||||
total = len(rows)
|
||||
print(f"Found {total} deals sin foto en clerks")
|
||||
if not total:
|
||||
return 0
|
||||
|
||||
# Budget check
|
||||
b = firecrawl_budget_status()
|
||||
print(f"Firecrawl budget: {b['credits_used']}/{b['credits_budget']} "
|
||||
f"used ({b['usage_pct']}%) — pause threshold {b['pause_threshold_pct']}%")
|
||||
if b["usage_pct"] >= args.max_budget_pct:
|
||||
print(f"STOP: budget usage ({b['usage_pct']}%) >= max-budget-pct ({args.max_budget_pct}%)")
|
||||
return 1
|
||||
|
||||
stats = Counter()
|
||||
t0 = time.perf_counter()
|
||||
|
||||
for i, row in enumerate(rows, 1):
|
||||
deal = dict(row)
|
||||
addr_query = build_zillow_query_address(deal)
|
||||
|
||||
# Re-check budget mid-loop every 20 deals
|
||||
if i % 20 == 0:
|
||||
b = firecrawl_budget_status()
|
||||
if b["usage_pct"] >= args.max_budget_pct:
|
||||
print(f"[{i}/{total}] Budget reached {b['usage_pct']}% — stopping")
|
||||
break
|
||||
|
||||
# Throttle
|
||||
if i > 1:
|
||||
time.sleep(RATE_LIMIT_SECONDS)
|
||||
|
||||
print(f"[{i}/{total}] deal #{deal['id']} ({deal['source']}) — {addr_query[:70]}")
|
||||
photos, meta = fetch_zillow_photos_by_address(addr_query)
|
||||
|
||||
# Record Firecrawl usage
|
||||
if meta.get("credits_used") and not args.dry_run:
|
||||
try:
|
||||
record_firecrawl_usage(
|
||||
source="zillow_photo_backfill",
|
||||
credits=meta["credits_used"],
|
||||
url=meta.get("url_attempted"),
|
||||
description=f"photos for clerk deal id={deal['id']} ({deal['source']})",
|
||||
)
|
||||
except Exception as e:
|
||||
print(f" [warn] could not record firecrawl_usage: {e}")
|
||||
|
||||
if photos:
|
||||
stats["found"] += 1
|
||||
print(f" OK: {len(photos)} fotos")
|
||||
if not args.dry_run:
|
||||
conn.execute(
|
||||
"UPDATE deals SET photos_urls = ?, last_seen_at = ? WHERE id = ?",
|
||||
(json.dumps(photos), time.strftime("%Y-%m-%dT%H:%M:%S"), deal["id"]),
|
||||
)
|
||||
else:
|
||||
stats["empty"] += 1
|
||||
reason = meta.get("error") or "no photos in markdown"
|
||||
print(f" EMPTY: {reason[:100]}")
|
||||
if not args.dry_run:
|
||||
# Mark as attempted by setting empty list (vs NULL)
|
||||
conn.execute(
|
||||
"UPDATE deals SET photos_urls = ? WHERE id = ?",
|
||||
("[]", deal["id"]),
|
||||
)
|
||||
|
||||
elapsed = time.perf_counter() - t0
|
||||
print()
|
||||
print("=" * 60)
|
||||
print(f"DONE in {elapsed:.0f}s")
|
||||
print(f" Photos found: {stats['found']}/{total}")
|
||||
print(f" Empty result: {stats['empty']}/{total}")
|
||||
print(f" Hit rate: {stats['found']/total*100:.1f}%")
|
||||
b = firecrawl_budget_status()
|
||||
print(f" Firecrawl now: {b['credits_used']}/{b['credits_budget']} ({b['usage_pct']}%)")
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user