102 lines
3.2 KiB
Python
102 lines
3.2 KiB
Python
"""Backfill clerk deal photos via County Property Appraiser sites (GRATIS).

Alternative to backfill_zillow_photos.py: uses Playwright against the PA sites,
so there is zero Firecrawl cost.

Current coverage: Broward only (~70 deals). Phase 3.5.B: add Duval, etc.

Only processes deals that do NOT have a photo yet. Idempotent.
"""
|
|
from __future__ import annotations
|
|
import argparse, io, json, sys, time
|
|
from pathlib import Path
|
|
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from deals_db import init_db, _get_conn
|
|
from data_fetchers.pa_photo_lookup import _fetch_broward_batch
|
|
|
|
|
|
# Maps a county name (the value accepted by --county) to the `deals.source`
# value used to select that county's clerk deals. A county that is not listed
# here is rejected by main() as "not yet supported".
COUNTY_TO_SOURCE: dict[str, str] = {
    "Broward": "broward_clerk",
    # "Duval": "duval_clerk",  # Phase 3.5.B
    # "Hillsborough": "hillsborough_clerk",  # Phase 3.5.B
}
|
|
|
|
|
|
def main():
    """Backfill missing photos for one county's clerk deals.

    Command-line options:
        --county   County to process; must be a key of ``COUNTY_TO_SOURCE``
                   (default ``Broward``).
        --limit    Maximum number of deals to process (omitted or 0: no limit).
        --dry-run  Fetch and report, but do not write anything to the database.

    Selects the deals of the county's clerk source that have a parcel id and
    an empty ``photos_urls`` (NULL, ``''`` or ``'[]'``), fetches one photo URL
    per parcel in a single batch, and stores each hit as a one-element JSON
    list. Misses are stored as ``'[]'``, so they are picked up again by the
    same query on the next run.

    Returns:
        Exit code for ``sys.exit``: 0 on success (including "nothing to do"),
        1 when the requested county is not supported.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--county", default="Broward", help="County to backfill (default Broward)")
    ap.add_argument("--limit", type=int, default=None)
    ap.add_argument("--dry-run", action="store_true")
    args = ap.parse_args()

    init_db()
    conn = _get_conn()

    source = COUNTY_TO_SOURCE.get(args.county)
    if not source:
        print(f"ERROR: county '{args.county}' not yet supported. Available: {list(COUNTY_TO_SOURCE.keys())}")
        return 1

    # Find clerk deals WITHOUT photo
    q = (
        "SELECT id, parcel_id, address FROM deals "
        "WHERE source = ? "
        "AND parcel_id IS NOT NULL AND parcel_id != '' "
        "AND (photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]') "
        "ORDER BY id"
    )
    params = (source,)
    if args.limit:
        # Bind the limit instead of formatting it into the SQL text.
        q += " LIMIT ?"
        params += (args.limit,)
    rows = conn.execute(q, params).fetchall()

    print(f"Found {len(rows)} {args.county} deals sin foto")
    if not rows:
        return 0

    # Several deals can share a parcel; look each parcel up only once
    # (order-preserving de-duplication). Results are keyed by parcel id, so
    # every deal still finds its photo below.
    parcel_ids = list(dict.fromkeys(r["parcel_id"] for r in rows))
    print(f"Starting batch fetch via {args.county} PA (Playwright, gratis)...")
    print(f"Estimated time: ~{len(parcel_ids) * 12}s ({len(parcel_ids) * 12 // 60}m)")
    print()

    t0 = time.perf_counter()
    # NOTE(review): only the Broward fetcher exists today, which matches the
    # single entry in COUNTY_TO_SOURCE; dispatch per county when more are added.
    results = _fetch_broward_batch(parcel_ids, timeout_seconds=20)
    elapsed = time.perf_counter() - t0

    hits = 0
    misses = 0
    for r in rows:
        photo = results.get(r["parcel_id"])
        if photo:
            hits += 1
            stored = json.dumps([photo])
            line = f" ✓ id={r['id']} parcel={r['parcel_id']} → {photo[-60:]}"
        else:
            misses += 1
            stored = "[]"
            line = f" ✗ id={r['id']} parcel={r['parcel_id']} no photo found"
        if not args.dry_run:
            conn.execute(
                "UPDATE deals SET photos_urls = ? WHERE id = ?",
                (stored, r["id"]),
            )
        print(line)

    if not args.dry_run:
        # _get_conn() is defined elsewhere; presumably it returns a sqlite3
        # connection. Unless that connection is in autocommit mode, the
        # UPDATEs above are discarded at exit without an explicit commit.
        conn.commit()

    print()
    print("=" * 50)
    print(f"DONE in {elapsed:.0f}s ({elapsed/60:.1f} min)")
    print(f" Hits: {hits}/{len(rows)}")
    print(f" Misses: {misses}/{len(rows)}")
    # rows is non-empty here (early return above), so no division by zero.
    print(f" Hit rate: {hits*100//len(rows)}%")
    print(" Cost: $0 (Playwright gratis)")
    return 0
|
|
|
|
|
|
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|