95 lines
3.6 KiB
Python
95 lines
3.6 KiB
Python
"""Migration: separar Zillow zpids del campo case_number.
|
|
|
|
BUG ORIGINAL:
|
|
El Zillow scraper estaba guardando el zpid en la columna case_number, que es
|
|
INCORRECTO porque case_number es solo para court cases reales (foreclosure
|
|
judicial, tax_deed). Esto contaminaba el dataset con falsos positivos de
|
|
"deals judiciales".
|
|
|
|
FIX:
|
|
1. Aplica ALTER TABLE para agregar external_id (idempotent via init_db)
|
|
2. Para cada deal de source='zillow': mover case_number → external_id
|
|
3. Tambien aplica a hud_homestore (su HUD case# es trackeo, no court case judicial)
|
|
|
|
Idempotent: si ya se corrio, los nuevos rows seran 0.
|
|
"""
|
|
from __future__ import annotations
|
|
import io, sys
|
|
from pathlib import Path
|
|
|
|
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
|
ROOT = Path(__file__).resolve().parent.parent
|
|
sys.path.insert(0, str(ROOT))
|
|
|
|
from deals_db import init_db, _get_conn
|
|
|
|
|
|
def main() -> int:
|
|
init_db() # ensures external_id column exists
|
|
conn = _get_conn()
|
|
|
|
cols = {r["name"] for r in conn.execute("PRAGMA table_info(deals)").fetchall()}
|
|
if "external_id" not in cols:
|
|
print("ERROR: external_id column missing after init_db()")
|
|
return 1
|
|
|
|
# Step 1: Zillow zpids — move case_number → external_id
|
|
rows_z = conn.execute(
|
|
"SELECT COUNT(*) FROM deals WHERE source = 'zillow' "
|
|
"AND case_number IS NOT NULL AND case_number != '' "
|
|
"AND (external_id IS NULL OR external_id = '')"
|
|
).fetchone()
|
|
print(f"Zillow deals needing migration: {rows_z[0]}")
|
|
if rows_z[0] > 0:
|
|
conn.execute(
|
|
"UPDATE deals SET external_id = case_number, case_number = NULL "
|
|
"WHERE source = 'zillow' "
|
|
"AND case_number IS NOT NULL AND case_number != '' "
|
|
"AND (external_id IS NULL OR external_id = '')"
|
|
)
|
|
print(f" Migrated {rows_z[0]} Zillow zpid values")
|
|
|
|
# Step 2: HUD Homestore — HUD case# is a tracking number, NOT a court case.
|
|
# We still move it to external_id; case_number stays NULL for HUD (since
|
|
# HUD listings are REO, not judicial proceedings).
|
|
rows_h = conn.execute(
|
|
"SELECT COUNT(*) FROM deals WHERE source = 'hud_homestore' "
|
|
"AND case_number IS NOT NULL AND case_number != '' "
|
|
"AND (external_id IS NULL OR external_id = '')"
|
|
).fetchone()
|
|
print(f"HUD deals needing migration: {rows_h[0]}")
|
|
if rows_h[0] > 0:
|
|
conn.execute(
|
|
"UPDATE deals SET external_id = case_number, case_number = NULL "
|
|
"WHERE source = 'hud_homestore' "
|
|
"AND case_number IS NOT NULL AND case_number != '' "
|
|
"AND (external_id IS NULL OR external_id = '')"
|
|
)
|
|
print(f" Migrated {rows_h[0]} HUD case# values")
|
|
|
|
# Step 3: Verify clerks NOT affected (their case_number IS a real court case)
|
|
clerk_with_case = conn.execute(
|
|
"SELECT COUNT(*) FROM deals WHERE source LIKE '%_clerk' "
|
|
"AND case_number IS NOT NULL AND case_number != ''"
|
|
).fetchone()[0]
|
|
print(f"\nClerk deals with case_number (court cases, unchanged): {clerk_with_case}")
|
|
|
|
# Final state
|
|
print()
|
|
print("=== Final state ===")
|
|
rows = conn.execute("""
|
|
SELECT source,
|
|
SUM(CASE WHEN case_number IS NOT NULL AND case_number != '' THEN 1 ELSE 0 END) AS with_case,
|
|
SUM(CASE WHEN external_id IS NOT NULL AND external_id != '' THEN 1 ELSE 0 END) AS with_ext
|
|
FROM deals GROUP BY source ORDER BY source
|
|
""").fetchall()
|
|
print(f"{'source':<22} {'case_number':>12} {'external_id':>12}")
|
|
for r in rows:
|
|
print(f" {r['source']:<22} {r['with_case']:>12} {r['with_ext']:>12}")
|
|
|
|
return 0
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|