106 lines
3.8 KiB
Python
106 lines
3.8 KiB
Python
"""Re-backfill PA photos para Broward deals affected by Zillow photo misassignment bug.
|
|
|
|
Logic:
|
|
1. Find Broward deals whose current photos_urls SHARES URL with other deals
|
|
(= affected by bug)
|
|
2. Para esos, re-fetch foto from bcpa.net via pa_photo_lookup.fetch_pa_photo (gratis)
|
|
3. Update DB with correct PA photo URL
|
|
|
|
Cero costos Firecrawl. Usa Playwright local contra bcpa.net SPA.
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import sqlite3
|
|
import time
|
|
import sys
|
|
sys.path.insert(0, ".")
|
|
|
|
|
|
def main():
|
|
p = argparse.ArgumentParser()
|
|
p.add_argument("--dry-run", action="store_true")
|
|
p.add_argument("--db", default="data/deals.db")
|
|
p.add_argument("--limit", type=int, default=0, help="Cap for testing (0 = no limit)")
|
|
args = p.parse_args()
|
|
|
|
conn = sqlite3.connect(args.db)
|
|
cur = conn.cursor()
|
|
|
|
# Step 1: find Broward deals SIN foto (NULL/empty) OR con duplicate Zillow URL
|
|
# parcel_id valid + county Broward + sin photo o photo bugged
|
|
cur.execute("""
|
|
SELECT id, parcel_id, address, photos_urls
|
|
FROM deals
|
|
WHERE county = 'Broward'
|
|
AND parcel_id IS NOT NULL
|
|
AND parcel_id != ''
|
|
AND parcel_id NOT LIKE 'Property Appraiser%'
|
|
AND (
|
|
photos_urls IS NULL OR photos_urls = '' OR photos_urls = '[]'
|
|
OR photos_urls IN (
|
|
SELECT photos_urls FROM deals
|
|
WHERE photos_urls IS NOT NULL AND photos_urls != '[]'
|
|
GROUP BY photos_urls
|
|
HAVING COUNT(DISTINCT id) > 1
|
|
)
|
|
)
|
|
""")
|
|
affected = cur.fetchall()
|
|
|
|
if not affected:
|
|
print("No Broward deals affected by bug — DB clean.")
|
|
return
|
|
|
|
print(f"Found {len(affected)} Broward deals affected by photo bug.")
|
|
if args.limit:
|
|
affected = affected[: args.limit]
|
|
print(f"Limiting to first {args.limit} for testing.")
|
|
|
|
if args.dry_run:
|
|
for deal_id, parcel, addr, old_photos in affected[:10]:
|
|
print(f" Would re-fetch: id={deal_id} parcel={parcel} addr={(addr or '?')[:50]}")
|
|
print(f"\nDRY RUN — no changes. Re-run sin --dry-run to execute.")
|
|
return
|
|
|
|
# Step 2: re-fetch via PA
|
|
from data_fetchers.pa_photo_lookup import fetch_pa_photo
|
|
|
|
updated = 0
|
|
failed = 0
|
|
t0 = time.time()
|
|
for deal_id, parcel, addr, old_photos in affected:
|
|
addr_safe = (addr or "?")[:50].encode("ascii", "replace").decode("ascii")
|
|
print(f" Fetching id={deal_id} parcel={parcel} addr={addr_safe}...", flush=True)
|
|
try:
|
|
photo_url, meta = fetch_pa_photo(county="Broward", parcel_id=parcel,
|
|
timeout_seconds=25)
|
|
if photo_url:
|
|
new_photos = json.dumps([photo_url])
|
|
cur.execute("UPDATE deals SET photos_urls = ? WHERE id = ?",
|
|
(new_photos, deal_id))
|
|
conn.commit()
|
|
updated += 1
|
|
print(f" PA photo: {photo_url[-50:]}")
|
|
else:
|
|
# Set to NULL so card shows no photo (instead of stale Zillow)
|
|
cur.execute("UPDATE deals SET photos_urls = NULL WHERE id = ?", (deal_id,))
|
|
conn.commit()
|
|
failed += 1
|
|
err = (meta.get("error") or "no photo found")[:80]
|
|
print(f" NO PA photo found ({err}). photos_urls NULLed.")
|
|
except Exception as e:
|
|
failed += 1
|
|
err = str(e)[:80].encode("ascii", "replace").decode("ascii")
|
|
print(f" EXCEPTION: {err}")
|
|
time.sleep(2) # don't hammer if errors
|
|
|
|
elapsed = time.time() - t0
|
|
print()
|
|
print(f"Done in {elapsed:.0f}s. Updated {updated}, failed/no-photo {failed}.")
|
|
conn.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|