Files
AR-House/scripts/rebackfill_broward_pa_photos.py
2026-07-03 12:24:58 -04:00

106 lines
3.8 KiB
Python

"""Re-backfill PA photos para Broward deals affected by Zillow photo misassignment bug.
Logic:
1. Find Broward deals whose current photos_urls SHARES URL with other deals
(= affected by bug)
2. Para esos, re-fetch foto from bcpa.net via pa_photo_lookup.fetch_pa_photo (gratis)
3. Update DB with correct PA photo URL
Cero costos Firecrawl. Usa Playwright local contra bcpa.net SPA.
"""
from __future__ import annotations
import argparse
import json
import sqlite3
import time
import sys
sys.path.insert(0, ".")
def main():
p = argparse.ArgumentParser()
p.add_argument("--dry-run", action="store_true")
p.add_argument("--db", default="data/deals.db")
p.add_argument("--limit", type=int, default=0, help="Cap for testing (0 = no limit)")
args = p.parse_args()
conn = sqlite3.connect(args.db)
cur = conn.cursor()
# Step 1: find Broward deals SIN foto (NULL/empty) OR con duplicate Zillow URL
# parcel_id valid + county Broward + sin photo o photo bugged
cur.execute("""
SELECT id, parcel_id, address, photos_urls
FROM deals
WHERE county = 'Broward'
AND parcel_id IS NOT NULL
AND parcel_id != ''
AND parcel_id NOT LIKE 'Property Appraiser%'
AND (
photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]'
OR photos_urls IN (
SELECT photos_urls FROM deals
WHERE photos_urls IS NOT NULL AND photos_urls != '[]'
GROUP BY photos_urls
HAVING COUNT(DISTINCT id) > 1
)
)
""")
affected = cur.fetchall()
if not affected:
print("No Broward deals affected by bug — DB clean.")
return
print(f"Found {len(affected)} Broward deals affected by photo bug.")
if args.limit:
affected = affected[: args.limit]
print(f"Limiting to first {args.limit} for testing.")
if args.dry_run:
for deal_id, parcel, addr, old_photos in affected[:10]:
print(f" Would re-fetch: id={deal_id} parcel={parcel} addr={(addr or '?')[:50]}")
print(f"\nDRY RUN — no changes. Re-run sin --dry-run to execute.")
return
# Step 2: re-fetch via PA
from data_fetchers.pa_photo_lookup import fetch_pa_photo
updated = 0
failed = 0
t0 = time.time()
for deal_id, parcel, addr, old_photos in affected:
addr_safe = (addr or "?")[:50].encode("ascii", "replace").decode("ascii")
print(f" Fetching id={deal_id} parcel={parcel} addr={addr_safe}...", flush=True)
try:
photo_url, meta = fetch_pa_photo(county="Broward", parcel_id=parcel,
timeout_seconds=25)
if photo_url:
new_photos = json.dumps([photo_url])
cur.execute("UPDATE deals SET photos_urls = ? WHERE id = ?",
(new_photos, deal_id))
conn.commit()
updated += 1
print(f" PA photo: {photo_url[-50:]}")
else:
# Set to NULL so card shows no photo (instead of stale Zillow)
cur.execute("UPDATE deals SET photos_urls = NULL WHERE id = ?", (deal_id,))
conn.commit()
failed += 1
err = (meta.get("error") or "no photo found")[:80]
print(f" NO PA photo found ({err}). photos_urls NULLed.")
except Exception as e:
failed += 1
err = str(e)[:80].encode("ascii", "replace").decode("ascii")
print(f" EXCEPTION: {err}")
time.sleep(2) # don't hammer if errors
elapsed = time.time() - t0
print()
print(f"Done in {elapsed:.0f}s. Updated {updated}, failed/no-photo {failed}.")
conn.close()
if __name__ == "__main__":
main()