"""Cleanup script: NULL out duplicate photo_urls so user can re-scrape. Strategy: any photo_url shared by 2+ deals is suspect. NULL them ALL — let the next Zillow scrape (now with fixed parser) re-populate correctly. Run with --dry-run first to preview impact. """ from __future__ import annotations import argparse import sqlite3 import sys def main(): p = argparse.ArgumentParser() p.add_argument("--dry-run", action="store_true", help="Show impact without writing") p.add_argument("--db", default="data/deals.db") args = p.parse_args() conn = sqlite3.connect(args.db) cur = conn.cursor() # Find duplicate photo_urls cur.execute(""" SELECT photos_urls, COUNT(DISTINCT id) AS deals_sharing, GROUP_CONCAT(id) AS deal_ids, GROUP_CONCAT(DISTINCT substr(address,1,50)) AS addresses FROM deals WHERE photos_urls IS NOT NULL AND photos_urls != '[]' AND photos_urls != '' GROUP BY photos_urls HAVING deals_sharing > 1 """) duplicates = cur.fetchall() if not duplicates: print("No duplicate photos found — DB clean.") return print(f"Found {len(duplicates)} duplicate photo cases affecting " f"{sum(r[1] for r in duplicates)} deals total.") print() affected_deal_ids: list[int] = [] for photos_url, count, deal_ids_csv, addresses in duplicates: deal_ids = [int(x) for x in deal_ids_csv.split(",")] affected_deal_ids.extend(deal_ids) photo_preview = photos_url[:80].encode("ascii", "replace").decode("ascii") print(f" {count} deals share: {photo_preview}") print(f" deal IDs: {deal_ids[:10]}{'...' if len(deal_ids) > 10 else ''}") print(f" addresses: {addresses[:100].encode('ascii','replace').decode('ascii')}") print() print(f"Total affected deals: {len(affected_deal_ids)}") if args.dry_run: print("\nDRY RUN — no changes written. Re-run without --dry-run to clean.") return # NULL out these deals' photos_urls placeholders = ",".join("?" * len(affected_deal_ids)) cur.execute( f"UPDATE deals SET photos_urls = NULL WHERE id IN ({placeholders})", affected_deal_ids, ) conn.commit() print(f"\nDONE. NULL'd photos_urls for {cur.rowcount} deals.") print("Next Zillow scrape (with fixed parser) will re-populate correctly.") conn.close() if __name__ == "__main__": main()