feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,73 @@
|
||||
"""Cleanup script: NULL out duplicate photos_urls values so the user can re-scrape.
|
||||
|
||||
Strategy: any photo_url shared by 2+ deals is suspect. NULL them ALL — let
|
||||
the next Zillow scrape (now with fixed parser) re-populate correctly.
|
||||
|
||||
Run with --dry-run first to preview impact.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
|
||||
# Maximum number of ids bound per UPDATE statement. SQLite caps the number of
# "?" host parameters per statement (999 on older builds), so large clean-ups
# are issued in batches instead of one giant IN (...) list.
_UPDATE_BATCH_SIZE = 500


def _ascii_preview(text, limit):
    """Return the first *limit* characters of *text* as printable ASCII.

    Non-ASCII characters are replaced with ``?``. ``None`` (a SQL NULL, e.g.
    GROUP_CONCAT over rows whose address is NULL) is treated as an empty string.
    """
    return (text or "")[:limit].encode("ascii", "replace").decode("ascii")


def _find_duplicates(conn):
    """Return one row per photos_urls value that is shared by 2+ deals.

    Each row is ``(photos_urls, deals_sharing, deal_ids_csv, addresses)``.
    NULL, empty-string and ``'[]'`` photos_urls values are ignored.
    """
    cur = conn.execute("""
        SELECT photos_urls, COUNT(DISTINCT id) AS deals_sharing,
               GROUP_CONCAT(id) AS deal_ids,
               GROUP_CONCAT(DISTINCT substr(address,1,50)) AS addresses
        FROM deals
        WHERE photos_urls IS NOT NULL AND photos_urls != '[]' AND photos_urls != ''
        GROUP BY photos_urls
        HAVING deals_sharing > 1
    """)
    return cur.fetchall()


def _null_photos_urls(conn, deal_ids):
    """Set photos_urls to NULL for *deal_ids*; return the number of rows updated.

    The updates are issued in batches of ``_UPDATE_BATCH_SIZE`` ids inside a
    single transaction: everything is committed on success and rolled back if
    any batch fails.
    """
    updated = 0
    with conn:  # commit on success, roll back on exception
        for start in range(0, len(deal_ids), _UPDATE_BATCH_SIZE):
            batch = deal_ids[start:start + _UPDATE_BATCH_SIZE]
            # Only "?" markers are interpolated; the ids themselves are bound.
            placeholders = ",".join("?" * len(batch))
            cur = conn.execute(
                f"UPDATE deals SET photos_urls = NULL WHERE id IN ({placeholders})",
                batch,
            )
            updated += cur.rowcount
    return updated


def main(argv: list[str] | None = None) -> None:
    """Report deals that share a photos_urls value and NULL those values out.

    Args:
        argv: Command-line arguments (without the program name). Defaults to
            ``sys.argv[1:]`` when ``None``, so existing ``main()`` callers are
            unaffected.

    Options:
        --dry-run  Print the affected deals without writing anything.
        --db PATH  SQLite database file (default ``data/deals.db``).

    Exits via ``argparse`` (status 2) when the database file does not exist.
    """
    import os  # local import: only needed for the existence check below

    p = argparse.ArgumentParser()
    p.add_argument("--dry-run", action="store_true", help="Show impact without writing")
    p.add_argument("--db", default="data/deals.db")
    args = p.parse_args(argv)

    # sqlite3.connect() would silently create an empty database for a
    # mistyped path; refuse to run instead.
    if not os.path.isfile(args.db):
        p.error(f"database file not found: {args.db}")

    conn = sqlite3.connect(args.db)
    try:
        duplicates = _find_duplicates(conn)

        if not duplicates:
            print("No duplicate photos found — DB clean.")
            return

        print(f"Found {len(duplicates)} duplicate photo cases affecting "
              f"{sum(r[1] for r in duplicates)} deals total.")
        print()

        affected_deal_ids: list[int] = []
        for photos_url, count, deal_ids_csv, addresses in duplicates:
            deal_ids = [int(x) for x in deal_ids_csv.split(",")]
            affected_deal_ids.extend(deal_ids)
            print(f"  {count} deals share: {_ascii_preview(photos_url, 80)}")
            print(f"    deal IDs: {deal_ids[:10]}{'...' if len(deal_ids) > 10 else ''}")
            print(f"    addresses: {_ascii_preview(addresses, 100)}")

        print()
        print(f"Total affected deals: {len(affected_deal_ids)}")

        if args.dry_run:
            print("\nDRY RUN — no changes written. Re-run without --dry-run to clean.")
            return

        updated = _null_photos_urls(conn, affected_deal_ids)
        print(f"\nDONE. NULL'd photos_urls for {updated} deals.")
        print("Next Zillow scrape (with fixed parser) will re-populate correctly.")
    finally:
        # Close on every path, including the early returns above.
        conn.close()
|
||||
|
||||
|
||||
# Script entry point: run the cleanup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user