74 lines
2.4 KiB
Python
74 lines
2.4 KiB
Python
"""Cleanup script: NULL out duplicate photo_urls so user can re-scrape.
|
|
|
|
Strategy: any photo_url shared by 2+ deals is suspect. NULL them ALL — let
|
|
the next Zillow scrape (now with fixed parser) re-populate correctly.
|
|
|
|
Run with --dry-run first to preview impact.
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import sqlite3
|
|
import sys
|
|
|
|
|
|
def main():
|
|
p = argparse.ArgumentParser()
|
|
p.add_argument("--dry-run", action="store_true", help="Show impact without writing")
|
|
p.add_argument("--db", default="data/deals.db")
|
|
args = p.parse_args()
|
|
|
|
conn = sqlite3.connect(args.db)
|
|
cur = conn.cursor()
|
|
|
|
# Find duplicate photo_urls
|
|
cur.execute("""
|
|
SELECT photos_urls, COUNT(DISTINCT id) AS deals_sharing,
|
|
GROUP_CONCAT(id) AS deal_ids,
|
|
GROUP_CONCAT(DISTINCT substr(address,1,50)) AS addresses
|
|
FROM deals
|
|
WHERE photos_urls IS NOT NULL AND photos_urls != '[]' AND photos_urls != ''
|
|
GROUP BY photos_urls
|
|
HAVING deals_sharing > 1
|
|
""")
|
|
duplicates = cur.fetchall()
|
|
|
|
if not duplicates:
|
|
print("No duplicate photos found — DB clean.")
|
|
return
|
|
|
|
print(f"Found {len(duplicates)} duplicate photo cases affecting "
|
|
f"{sum(r[1] for r in duplicates)} deals total.")
|
|
print()
|
|
|
|
affected_deal_ids: list[int] = []
|
|
for photos_url, count, deal_ids_csv, addresses in duplicates:
|
|
deal_ids = [int(x) for x in deal_ids_csv.split(",")]
|
|
affected_deal_ids.extend(deal_ids)
|
|
photo_preview = photos_url[:80].encode("ascii", "replace").decode("ascii")
|
|
print(f" {count} deals share: {photo_preview}")
|
|
print(f" deal IDs: {deal_ids[:10]}{'...' if len(deal_ids) > 10 else ''}")
|
|
print(f" addresses: {addresses[:100].encode('ascii','replace').decode('ascii')}")
|
|
|
|
print()
|
|
print(f"Total affected deals: {len(affected_deal_ids)}")
|
|
|
|
if args.dry_run:
|
|
print("\nDRY RUN — no changes written. Re-run without --dry-run to clean.")
|
|
return
|
|
|
|
# NULL out these deals' photos_urls
|
|
placeholders = ",".join("?" * len(affected_deal_ids))
|
|
cur.execute(
|
|
f"UPDATE deals SET photos_urls = NULL WHERE id IN ({placeholders})",
|
|
affected_deal_ids,
|
|
)
|
|
conn.commit()
|
|
print(f"\nDONE. NULL'd photos_urls for {cur.rowcount} deals.")
|
|
print("Next Zillow scrape (with fixed parser) will re-populate correctly.")
|
|
|
|
conn.close()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|