Files
AR-House/scripts/migrate_zillow_zpid_to_external_id.py
2026-07-03 12:24:58 -04:00

95 lines
3.6 KiB
Python

"""Migration: separar Zillow zpids del campo case_number.
BUG ORIGINAL:
El Zillow scraper estaba guardando el zpid en la columna case_number, que es
INCORRECTO porque case_number es solo para court cases reales (foreclosure
judicial, tax_deed). Esto contaminaba el dataset con falsos positivos de
"deals judiciales".
FIX:
1. Aplica ALTER TABLE para agregar external_id (idempotent via init_db)
2. Para cada deal de source='zillow': mover case_number → external_id
3. Tambien aplica a hud_homestore (su HUD case# es trackeo, no court case judicial)
Idempotent: si ya se corrio, los nuevos rows seran 0.
"""
from __future__ import annotations
import io, sys
from pathlib import Path
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))
from deals_db import init_db, _get_conn
def main() -> int:
init_db() # ensures external_id column exists
conn = _get_conn()
cols = {r["name"] for r in conn.execute("PRAGMA table_info(deals)").fetchall()}
if "external_id" not in cols:
print("ERROR: external_id column missing after init_db()")
return 1
# Step 1: Zillow zpids — move case_number → external_id
rows_z = conn.execute(
"SELECT COUNT(*) FROM deals WHERE source = 'zillow' "
"AND case_number IS NOT NULL AND case_number != '' "
"AND (external_id IS NULL OR external_id = '')"
).fetchone()
print(f"Zillow deals needing migration: {rows_z[0]}")
if rows_z[0] > 0:
conn.execute(
"UPDATE deals SET external_id = case_number, case_number = NULL "
"WHERE source = 'zillow' "
"AND case_number IS NOT NULL AND case_number != '' "
"AND (external_id IS NULL OR external_id = '')"
)
print(f" Migrated {rows_z[0]} Zillow zpid values")
# Step 2: HUD Homestore — HUD case# is a tracking number, NOT a court case.
# We still move it to external_id; case_number stays NULL for HUD (since
# HUD listings are REO, not judicial proceedings).
rows_h = conn.execute(
"SELECT COUNT(*) FROM deals WHERE source = 'hud_homestore' "
"AND case_number IS NOT NULL AND case_number != '' "
"AND (external_id IS NULL OR external_id = '')"
).fetchone()
print(f"HUD deals needing migration: {rows_h[0]}")
if rows_h[0] > 0:
conn.execute(
"UPDATE deals SET external_id = case_number, case_number = NULL "
"WHERE source = 'hud_homestore' "
"AND case_number IS NOT NULL AND case_number != '' "
"AND (external_id IS NULL OR external_id = '')"
)
print(f" Migrated {rows_h[0]} HUD case# values")
# Step 3: Verify clerks NOT affected (their case_number IS a real court case)
clerk_with_case = conn.execute(
"SELECT COUNT(*) FROM deals WHERE source LIKE '%_clerk' "
"AND case_number IS NOT NULL AND case_number != ''"
).fetchone()[0]
print(f"\nClerk deals with case_number (court cases, unchanged): {clerk_with_case}")
# Final state
print()
print("=== Final state ===")
rows = conn.execute("""
SELECT source,
SUM(CASE WHEN case_number IS NOT NULL AND case_number != '' THEN 1 ELSE 0 END) AS with_case,
SUM(CASE WHEN external_id IS NOT NULL AND external_id != '' THEN 1 ELSE 0 END) AS with_ext
FROM deals GROUP BY source ORDER BY source
""").fetchall()
print(f"{'source':<22} {'case_number':>12} {'external_id':>12}")
for r in rows:
print(f" {r['source']:<22} {r['with_case']:>12} {r['with_ext']:>12}")
return 0
if __name__ == "__main__":
sys.exit(main())