Files
AR-House/scripts/cleanup_duplicate_photos.py
2026-07-03 12:24:58 -04:00

74 lines
2.4 KiB
Python

"""Cleanup script: NULL out duplicate photo_urls so user can re-scrape.
Strategy: any photo_url shared by 2+ deals is suspect. NULL them ALL — let
the next Zillow scrape (now with fixed parser) re-populate correctly.
Run with --dry-run first to preview impact.
"""
from __future__ import annotations
import argparse
import sqlite3
import sys
def main():
p = argparse.ArgumentParser()
p.add_argument("--dry-run", action="store_true", help="Show impact without writing")
p.add_argument("--db", default="data/deals.db")
args = p.parse_args()
conn = sqlite3.connect(args.db)
cur = conn.cursor()
# Find duplicate photo_urls
cur.execute("""
SELECT photos_urls, COUNT(DISTINCT id) AS deals_sharing,
GROUP_CONCAT(id) AS deal_ids,
GROUP_CONCAT(DISTINCT substr(address,1,50)) AS addresses
FROM deals
WHERE photos_urls IS NOT NULL AND photos_urls != '[]' AND photos_urls != ''
GROUP BY photos_urls
HAVING deals_sharing > 1
""")
duplicates = cur.fetchall()
if not duplicates:
print("No duplicate photos found — DB clean.")
return
print(f"Found {len(duplicates)} duplicate photo cases affecting "
f"{sum(r[1] for r in duplicates)} deals total.")
print()
affected_deal_ids: list[int] = []
for photos_url, count, deal_ids_csv, addresses in duplicates:
deal_ids = [int(x) for x in deal_ids_csv.split(",")]
affected_deal_ids.extend(deal_ids)
photo_preview = photos_url[:80].encode("ascii", "replace").decode("ascii")
print(f" {count} deals share: {photo_preview}")
print(f" deal IDs: {deal_ids[:10]}{'...' if len(deal_ids) > 10 else ''}")
print(f" addresses: {addresses[:100].encode('ascii','replace').decode('ascii')}")
print()
print(f"Total affected deals: {len(affected_deal_ids)}")
if args.dry_run:
print("\nDRY RUN — no changes written. Re-run without --dry-run to clean.")
return
# NULL out these deals' photos_urls
placeholders = ",".join("?" * len(affected_deal_ids))
cur.execute(
f"UPDATE deals SET photos_urls = NULL WHERE id IN ({placeholders})",
affected_deal_ids,
)
conn.commit()
print(f"\nDONE. NULL'd photos_urls for {cur.rowcount} deals.")
print("Next Zillow scrape (with fixed parser) will re-populate correctly.")
conn.close()
if __name__ == "__main__":
main()