"""Migration: separar Zillow zpids del campo case_number. BUG ORIGINAL: El Zillow scraper estaba guardando el zpid en la columna case_number, que es INCORRECTO porque case_number es solo para court cases reales (foreclosure judicial, tax_deed). Esto contaminaba el dataset con falsos positivos de "deals judiciales". FIX: 1. Aplica ALTER TABLE para agregar external_id (idempotent via init_db) 2. Para cada deal de source='zillow': mover case_number → external_id 3. Tambien aplica a hud_homestore (su HUD case# es trackeo, no court case judicial) Idempotent: si ya se corrio, los nuevos rows seran 0. """ from __future__ import annotations import io, sys from pathlib import Path sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace") ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) from deals_db import init_db, _get_conn def main() -> int: init_db() # ensures external_id column exists conn = _get_conn() cols = {r["name"] for r in conn.execute("PRAGMA table_info(deals)").fetchall()} if "external_id" not in cols: print("ERROR: external_id column missing after init_db()") return 1 # Step 1: Zillow zpids — move case_number → external_id rows_z = conn.execute( "SELECT COUNT(*) FROM deals WHERE source = 'zillow' " "AND case_number IS NOT NULL AND case_number != '' " "AND (external_id IS NULL OR external_id = '')" ).fetchone() print(f"Zillow deals needing migration: {rows_z[0]}") if rows_z[0] > 0: conn.execute( "UPDATE deals SET external_id = case_number, case_number = NULL " "WHERE source = 'zillow' " "AND case_number IS NOT NULL AND case_number != '' " "AND (external_id IS NULL OR external_id = '')" ) print(f" Migrated {rows_z[0]} Zillow zpid values") # Step 2: HUD Homestore — HUD case# is a tracking number, NOT a court case. # We still move it to external_id; case_number stays NULL for HUD (since # HUD listings are REO, not judicial proceedings). rows_h = conn.execute( "SELECT COUNT(*) FROM deals WHERE source = 'hud_homestore' " "AND case_number IS NOT NULL AND case_number != '' " "AND (external_id IS NULL OR external_id = '')" ).fetchone() print(f"HUD deals needing migration: {rows_h[0]}") if rows_h[0] > 0: conn.execute( "UPDATE deals SET external_id = case_number, case_number = NULL " "WHERE source = 'hud_homestore' " "AND case_number IS NOT NULL AND case_number != '' " "AND (external_id IS NULL OR external_id = '')" ) print(f" Migrated {rows_h[0]} HUD case# values") # Step 3: Verify clerks NOT affected (their case_number IS a real court case) clerk_with_case = conn.execute( "SELECT COUNT(*) FROM deals WHERE source LIKE '%_clerk' " "AND case_number IS NOT NULL AND case_number != ''" ).fetchone()[0] print(f"\nClerk deals with case_number (court cases, unchanged): {clerk_with_case}") # Final state print() print("=== Final state ===") rows = conn.execute(""" SELECT source, SUM(CASE WHEN case_number IS NOT NULL AND case_number != '' THEN 1 ELSE 0 END) AS with_case, SUM(CASE WHEN external_id IS NOT NULL AND external_id != '' THEN 1 ELSE 0 END) AS with_ext FROM deals GROUP BY source ORDER BY source """).fetchall() print(f"{'source':<22} {'case_number':>12} {'external_id':>12}") for r in rows: print(f" {r['source']:<22} {r['with_case']:>12} {r['with_ext']:>12}") return 0 if __name__ == "__main__": sys.exit(main())