821 lines
38 KiB
Python
821 lines
38 KiB
Python
"""data_fetchers/pa_duval.py — Full Duval County Property Appraiser extractor.
|
|
|
|
Sitio: https://paopropertysearch.coj.net (ASP.NET WebForms)
|
|
Flow: Search.aspx → Results.aspx → Detail.aspx?ParcelNumber=XXX
|
|
|
|
Extrae todo lo publico del Duval PA para construir un Property Snapshot Report:
|
|
- Owner name(s)
|
|
- Property address + subdivision + legal description
|
|
- Building: type, year_built, sqft heated/total, bedrooms, bathrooms,
|
|
exterior wall, roof type, interior flooring
|
|
- Values: just/market, assessed, exemptions (3-year history)
|
|
- Tax breakdown por taxing district
|
|
- Sales history completa (book/page, date, price, deed type, qualified status)
|
|
- Homestead exemption (key signal: owner-occupant vs investor)
|
|
- Land details (zoning, lot size, use code)
|
|
- Extra features (fireplace, pool, etc.)
|
|
|
|
USAGE:
|
|
from data_fetchers.pa_duval import fetch_duval_pa_record
|
|
rec = fetch_duval_pa_record(address="2352 SCENIC VIEW CT", zip_code="32218")
|
|
# rec["year_built"], rec["sales_history"], rec["homestead_active"]...
|
|
|
|
TECHNICAL:
|
|
- ASP.NET WebForms con WebForm_DoPostBackWithOptions (compat IE8)
|
|
- Element IDs ESTABLES (no autogenerados)
|
|
- Per-search latency: ~10-15s (entry → search → results → detail)
|
|
- Free (Playwright local, no API cost)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
|
|
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"
|
|
|
|
_BASE_URL = "https://paopropertysearch.coj.net"
|
|
_SEARCH_URL = f"{_BASE_URL}/Basic/Search.aspx"
|
|
|
|
# Address parser para Duval ASP.NET form fields.
|
|
# Acepta long form (COURT, STREET) y abbreviation (CT, ST). Strip city/state/zip
|
|
# antes de parsear (split por primera coma).
|
|
_ADDRESS_RE = re.compile(
|
|
r"^\s*(?P<num>\d+)\s+"
|
|
r"(?:(?P<prefix>N|S|E|W|NE|NW|SE|SW)\s+)?"
|
|
r"(?P<name>[A-Z][A-Z\s\d\-']*?)"
|
|
r"\s+(?P<suffix>"
|
|
r"ST|STREET|AVE|AV|AVENUE|RD|ROAD|DR|DRIVE|CT|COURT|CIR|CIRCLE|"
|
|
r"LN|LANE|BLVD|BOULEVARD|HWY|HIGHWAY|WAY|PL|PLACE|PKY|PKWY|PARKWAY|"
|
|
r"TRL|TRAIL|TER|TERRACE|LOOP|RUN|ALY|ALLEY|XING|CROSSING"
|
|
r")\b",
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
# Map long form → ASP.NET ddStreetSuffix value
|
|
_SUFFIX_NORMALIZE = {
|
|
"STREET": "ST", "AVENUE": "AVE", "AV": "AVE", "ROAD": "RD",
|
|
"DRIVE": "DR", "COURT": "CT", "CIRCLE": "CIR", "LANE": "LN",
|
|
"BOULEVARD": "BLVD", "HIGHWAY": "HWY", "PLACE": "PL",
|
|
"PARKWAY": "PKWY", "PKY": "PKWY", "TRAIL": "TRL", "TERRACE": "TER",
|
|
"ALLEY": "ALY", "CROSSING": "XING",
|
|
}
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Field ID mapping (confirmed via probe on 2352 SCENIC VIEW CT)
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
_SCALAR_IDS = {
|
|
"owner_name": "ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
|
|
"site_address_line1": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine1",
|
|
"site_address_line2": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine2",
|
|
"building_type": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingType",
|
|
"year_built": "ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
|
|
"building_value": "ctl00_cphBody_repeaterBuilding_ctl00_lblBldgValue",
|
|
# Tax values current + last year
|
|
"tax_last_year_just": "ctl00_cphBody_lblTaxLastYearJustValue",
|
|
"tax_last_year_assessed": "ctl00_cphBody_lblTaxLastYearAssessedValue",
|
|
"tax_last_year_exemptions": "ctl00_cphBody_lblTaxLastYearExemptions",
|
|
"tax_last_year_taxable": "ctl00_cphBody_lblTaxLastYearTaxableValue",
|
|
"tax_current_year_just": "ctl00_cphBody_lblTaxCurrentYearJustValue",
|
|
"tax_current_year_assessed": "ctl00_cphBody_lblTaxCurrentYearAssessedValue",
|
|
"tax_current_year_exemptions": "ctl00_cphBody_lblTaxCurrentYearExemptions",
|
|
"tax_current_year_taxable": "ctl00_cphBody_lblTaxCurrentYearTaxableValue",
|
|
# Values from main values table (no current "in progress" year)
|
|
"assessed_value_3": "ctl00_cphBody_lblAssessedValue3",
|
|
"taxable_value_school": "ctl00_cphBody_lblTaxableValueSchool",
|
|
}
|
|
|
|
|
|
def _parse_address(address: str) -> Optional[dict]:
|
|
"""Parse address into Duval form fields.
|
|
|
|
Acepta:
|
|
'2352 SCENIC VIEW CT' → simple
|
|
'2352 SCENIC VIEW Court, Jacksonville, FL 32218' → con city/state/zip
|
|
'123 N MAIN ST, Anytown, FL' → con prefix
|
|
"""
|
|
if not address:
|
|
return None
|
|
# Strip city/state/zip — toma solo lo antes del primer comma
|
|
street_only = address.split(",")[0].strip().upper()
|
|
m = _ADDRESS_RE.search(street_only)
|
|
if not m:
|
|
return None
|
|
suffix_raw = (m.group("suffix") or "").strip().upper()
|
|
suffix_normalized = _SUFFIX_NORMALIZE.get(suffix_raw, suffix_raw) if suffix_raw else None
|
|
return {
|
|
"street_num": m.group("num"),
|
|
"prefix": (m.group("prefix") or "").strip().upper() or None,
|
|
"name": m.group("name").strip(),
|
|
"suffix": suffix_normalized,
|
|
}
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Public API
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def fetch_duval_pa_record(
|
|
address: Optional[str] = None,
|
|
parcel_id: Optional[str] = None,
|
|
zip_code: Optional[str] = None,
|
|
timeout_seconds: int = 30,
|
|
listing_price: Optional[float] = None,
|
|
) -> dict:
|
|
"""Fetch full Duval PA record by address OR parcel_id (RE#).
|
|
|
|
Args:
|
|
address: street address (e.g. "2352 SCENIC VIEW CT")
|
|
parcel_id: Duval RE# (e.g. "044273-0370") — preferred si lo tenes
|
|
zip_code: optional zip filter
|
|
timeout_seconds: max wait per Playwright op
|
|
|
|
Returns:
|
|
Dict con TODOS los campos publicos. Si fallo, dict tiene 'errors'.
|
|
Key fields:
|
|
- owner_name, owner_full_address (mailing)
|
|
- site_address, parcel_id (RE#), subdivision
|
|
- year_built (ENTERO), building_type, sqft_heated, sqft_gross,
|
|
sqft_garage, bedrooms, baths, stories
|
|
- exterior_wall, roof_struct, roofing_cover, interior_wall, int_flooring
|
|
- just_value_current, assessed_value_current, taxable_current,
|
|
exemption_current
|
|
- just_value_last, assessed_value_last, taxes_billed_last
|
|
- homestead_active (bool — exemptions >= $25,000 = homestead)
|
|
- sales_history: [{date, price, deed_type, qualified, book_page,
|
|
vacant_improved}, ...]
|
|
- extra_features: [{code, description, value}, ...]
|
|
- land: {zoning, use_code, lot_acres, lot_total_sqft}
|
|
- legal_description: str
|
|
- tax_breakdown: [{district, assessed, exempt, taxable, tax_amt}, ...]
|
|
- source_url: str (detail page URL)
|
|
- fetched_at: ISO timestamp
|
|
"""
|
|
fetched_at = datetime.now(timezone.utc).isoformat()
|
|
result = {
|
|
"county": "Duval",
|
|
"source": "Duval Property Appraiser (paopropertysearch.coj.net)",
|
|
"fetched_at": fetched_at,
|
|
"errors": [],
|
|
}
|
|
|
|
if not address and not parcel_id:
|
|
result["errors"].append("no address or parcel_id provided")
|
|
return result
|
|
|
|
try:
|
|
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
|
|
except ImportError:
|
|
result["errors"].append("playwright not installed")
|
|
return result
|
|
|
|
parsed_addr = _parse_address(address) if address else None
|
|
if address and not parsed_addr:
|
|
result["errors"].append(f"could not parse address '{address}' (need format: 'NUM [PREFIX] NAME SUFFIX')")
|
|
return result
|
|
|
|
try:
|
|
with sync_playwright() as p:
|
|
browser = p.chromium.launch(headless=True)
|
|
ctx = browser.new_context(user_agent=USER_AGENT)
|
|
page = ctx.new_page()
|
|
page.set_default_timeout(timeout_seconds * 1000)
|
|
|
|
page.goto(_SEARCH_URL, wait_until="load", timeout=timeout_seconds * 1000)
|
|
time.sleep(2)
|
|
|
|
if parcel_id:
|
|
pid_clean = parcel_id.replace("-", "").strip()
|
|
detail_url = f"{_BASE_URL}/Basic/Detail.aspx?RE={pid_clean}"
|
|
try:
|
|
page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
|
|
except Exception:
|
|
# If 'load' times out, fall back to 'commit' (page has navigated)
|
|
page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
|
|
time.sleep(5)
|
|
else:
|
|
# Search by address fields
|
|
page.locator("#ctl00_cphBody_tbStreetNumber").fill(parsed_addr["street_num"])
|
|
if parsed_addr["prefix"]:
|
|
try:
|
|
page.locator("#ctl00_cphBody_ddStreetPrefix").select_option(value=parsed_addr["prefix"])
|
|
except Exception:
|
|
pass
|
|
page.locator("#ctl00_cphBody_tbStreetName").fill(parsed_addr["name"])
|
|
if parsed_addr["suffix"]:
|
|
try:
|
|
page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(value=parsed_addr["suffix"])
|
|
except Exception:
|
|
pass
|
|
if zip_code:
|
|
try:
|
|
page.locator("#ctl00_cphBody_tbZipCode").fill(zip_code)
|
|
except Exception:
|
|
pass
|
|
|
|
page.locator("#ctl00_cphBody_bSearch").click()
|
|
page.wait_for_timeout(4000)
|
|
|
|
# If results table → extract href from first row link and
|
|
# navigate directly (Playwright click + navigation hangs on ASP.NET)
|
|
rs_url = page.url
|
|
if "Results.aspx" in rs_url:
|
|
detail_href = _extract_detail_href_with_retry(page, max_retries=2)
|
|
if not detail_href:
|
|
body_preview = page.inner_text("body")[:300].replace("\n", " ")
|
|
result["errors"].append(
|
|
f"results page returned but no Detail.aspx link found "
|
|
f"(url={page.url}, body_preview={body_preview!r})"
|
|
)
|
|
browser.close()
|
|
return result
|
|
# Build absolute URL and navigate directly (no click)
|
|
if detail_href.startswith("/"):
|
|
detail_url = f"{_BASE_URL}{detail_href}"
|
|
elif detail_href.startswith("http"):
|
|
detail_url = detail_href
|
|
else:
|
|
detail_url = f"{_BASE_URL}/Basic/{detail_href}"
|
|
try:
|
|
page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
|
|
except Exception:
|
|
page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
|
|
time.sleep(5)
|
|
elif "Detail.aspx" not in page.url:
|
|
result["errors"].append(f"unexpected URL after search: {page.url}")
|
|
browser.close()
|
|
return result
|
|
|
|
# We should now be on Detail.aspx
|
|
if "Detail.aspx" not in page.url:
|
|
result["errors"].append(f"failed to reach Detail page, URL: {page.url}")
|
|
browser.close()
|
|
return result
|
|
|
|
result["source_url"] = page.url
|
|
|
|
# Wait for KEY element to confirm full render before extracting.
|
|
# Retry once on server error (Duval intermittent 500s).
|
|
render_ok = False
|
|
for attempt in range(3):
|
|
try:
|
|
page.wait_for_selector(
|
|
"#ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
|
|
state="attached",
|
|
timeout=20000,
|
|
)
|
|
render_ok = True
|
|
break
|
|
except Exception:
|
|
# Try owner sentinel
|
|
try:
|
|
page.wait_for_selector(
|
|
"#ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
|
|
state="attached",
|
|
timeout=10000,
|
|
)
|
|
render_ok = True
|
|
break
|
|
except Exception:
|
|
# Check if server error → retry
|
|
body = page.inner_text("body")[:500]
|
|
if "Server Error" in body or "Runtime Error" in body:
|
|
if attempt < 2:
|
|
time.sleep(8 * (attempt + 1))
|
|
try:
|
|
page.reload(wait_until="load", timeout=30000)
|
|
except Exception:
|
|
pass
|
|
continue
|
|
break
|
|
if not render_ok:
|
|
result["errors"].append("detail page didn't render expected elements (server slow or error)")
|
|
|
|
# Extract all scalar fields in one JS call
|
|
scalars = page.evaluate(
|
|
"""(ids) => {
|
|
const out = {};
|
|
for (const [k, id] of Object.entries(ids)) {
|
|
const el = document.getElementById(id);
|
|
out[k] = el ? (el.textContent || '').trim() : '';
|
|
}
|
|
return out;
|
|
}""",
|
|
_SCALAR_IDS,
|
|
)
|
|
|
|
# Extract sales history
|
|
sales_history = page.evaluate("""
|
|
() => {
|
|
const out = [];
|
|
document.querySelectorAll('table').forEach((tbl) => {
|
|
const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
|
|
if (!hdrCells || hdrCells.length < 4) return;
|
|
const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
|
|
const isSalesHdr = hdrText.some(h => h.includes('sale date') || h.includes('book/page') || h.includes('deed instrument'));
|
|
if (!isSalesHdr) return;
|
|
const rows = tbl.querySelectorAll('tr');
|
|
for (let i = 1; i < rows.length; i++) {
|
|
const cells = rows[i].querySelectorAll('td');
|
|
if (cells.length < 4) continue;
|
|
const r = {};
|
|
cells.forEach((c, idx) => {
|
|
const h = hdrText[idx] || `col${idx}`;
|
|
r[h] = (c.textContent || '').trim();
|
|
});
|
|
if (Object.values(r).some(v => v && v.length > 0)) out.push(r);
|
|
}
|
|
});
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
# Extract building area
|
|
building_area = page.evaluate("""
|
|
() => {
|
|
const out = {};
|
|
const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingArea');
|
|
if (!grid) return out;
|
|
const rows = grid.querySelectorAll('tr');
|
|
for (let i = 1; i < rows.length; i++) {
|
|
const cells = rows[i].querySelectorAll('td');
|
|
if (cells.length < 4) continue;
|
|
const type = (cells[0].textContent || '').trim();
|
|
const gross = (cells[1].textContent || '').trim();
|
|
const heated = (cells[2].textContent || '').trim();
|
|
const effective = (cells[3].textContent || '').trim();
|
|
if (type) out[type] = { gross, heated, effective };
|
|
}
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
# Extract building attributes (beds, baths, stories)
|
|
attrs = page.evaluate("""
|
|
() => {
|
|
const out = {};
|
|
const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingAttributes');
|
|
if (!grid) return out;
|
|
const rows = grid.querySelectorAll('tr');
|
|
for (let i = 1; i < rows.length; i++) {
|
|
const cells = rows[i].querySelectorAll('td');
|
|
if (cells.length < 2) continue;
|
|
const element = (cells[0].textContent || '').trim();
|
|
const code = (cells[1].textContent || '').trim();
|
|
if (element) out[element] = code;
|
|
}
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
# Extract building structural elements (roof, walls, flooring)
|
|
# NOTE: these come from the same building section, different grid
|
|
structural = page.evaluate("""
|
|
() => {
|
|
const out = {};
|
|
// Find any grid in building section with Element/Code/Detail headers
|
|
document.querySelectorAll('table').forEach((tbl) => {
|
|
const hdrs = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
|
|
if (!hdrs) return;
|
|
const ht = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
|
|
if (!(ht.includes('element') && ht.includes('code') && ht.includes('detail'))) return;
|
|
// Skip the simpler attributes table (only 3 fields)
|
|
const rows = tbl.querySelectorAll('tr');
|
|
if (rows.length < 4) return;
|
|
for (let i = 1; i < rows.length; i++) {
|
|
const cells = rows[i].querySelectorAll('td');
|
|
if (cells.length < 3) continue;
|
|
const element = (cells[0].textContent || '').trim();
|
|
const detail = (cells[2].textContent || '').trim();
|
|
if (element && detail) {
|
|
if (out[element]) {
|
|
out[element] += '; ' + detail;
|
|
} else {
|
|
out[element] = detail;
|
|
}
|
|
}
|
|
}
|
|
});
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
# Extract main property identity (RE#, subdivision, etc.) from top table
|
|
top_props = page.evaluate("""
|
|
() => {
|
|
const out = {};
|
|
document.querySelectorAll('table').forEach((tbl) => {
|
|
const rows = tbl.querySelectorAll('tr');
|
|
if (rows.length < 3) return;
|
|
// Top table has key:value rows (2 cells per row)
|
|
// Heuristic: first cell ends with ':' or matches known labels
|
|
const knownLabels = ['re #','re#','tax district','property use',
|
|
'# of buildings','legal desc','subdivision','total area'];
|
|
let matchCount = 0;
|
|
const candidate = {};
|
|
for (const tr of rows) {
|
|
const cells = tr.querySelectorAll('td, th');
|
|
if (cells.length !== 2) continue;
|
|
const k = (cells[0].textContent || '').trim().toLowerCase().replace(/:$/, '');
|
|
const v = (cells[1].textContent || '').trim();
|
|
if (k && v && knownLabels.some(kw => k.startsWith(kw))) {
|
|
matchCount++;
|
|
candidate[k] = v;
|
|
}
|
|
}
|
|
if (matchCount >= 3) {
|
|
Object.assign(out, candidate);
|
|
}
|
|
});
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
# Land details
|
|
land = page.evaluate("""
|
|
() => {
|
|
const out = {};
|
|
const grid = document.getElementById('ctl00_cphBody_gridLand');
|
|
if (!grid) return out;
|
|
const rows = grid.querySelectorAll('tr');
|
|
if (rows.length < 2) return out;
|
|
const hdrs = rows[0].querySelectorAll('th, td');
|
|
const hdrText = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
|
|
const dataRow = rows[1].querySelectorAll('td');
|
|
hdrText.forEach((h, i) => {
|
|
if (dataRow[i]) out[h] = (dataRow[i].textContent || '').trim();
|
|
});
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
# Extra features (fireplace, pool, deck, etc.)
|
|
features = page.evaluate("""
|
|
() => {
|
|
const out = [];
|
|
const grid = document.getElementById('ctl00_cphBody_gridExtraFeatures');
|
|
if (!grid) return out;
|
|
const rows = grid.querySelectorAll('tr');
|
|
for (let i = 1; i < rows.length; i++) {
|
|
const cells = rows[i].querySelectorAll('td');
|
|
if (cells.length < 5) continue;
|
|
out.push({
|
|
code: (cells[1]?.textContent || '').trim(),
|
|
description: (cells[2]?.textContent || '').trim(),
|
|
units: (cells[6]?.textContent || '').trim(),
|
|
value: (cells[7]?.textContent || '').trim(),
|
|
});
|
|
}
|
|
return out;
|
|
}
|
|
""")
|
|
|
|
browser.close()
|
|
|
|
# ─── Post-process ─────────────────────────────────────────────
|
|
result.update({k: _clean(v) for k, v in scalars.items()})
|
|
|
|
# Numeric conversions
|
|
for k in ("year_built",):
|
|
v = result.get(k, "")
|
|
if v:
|
|
result[k] = _to_int(v)
|
|
for k in ("building_value", "tax_last_year_just", "tax_last_year_assessed",
|
|
"tax_last_year_exemptions", "tax_last_year_taxable",
|
|
"tax_current_year_just", "tax_current_year_assessed",
|
|
"tax_current_year_exemptions", "tax_current_year_taxable",
|
|
"assessed_value_3", "taxable_value_school"):
|
|
v = result.get(k, "")
|
|
if v:
|
|
result[k] = _money_to_int(v)
|
|
|
|
# Parcel id / subdivision / etc from top props
|
|
result["parcel_id"] = top_props.get("re #", "") or top_props.get("re#", "")
|
|
result["tax_district"] = top_props.get("tax district", "")
|
|
result["property_use"] = top_props.get("property use", "")
|
|
result["num_buildings"] = top_props.get("# of buildings", "")
|
|
result["subdivision"] = top_props.get("subdivision", "")
|
|
result["lot_total_sqft"] = _to_int(top_props.get("total area", "") or "0")
|
|
|
|
# Building area summary
|
|
result["building_area_grid"] = building_area
|
|
result["sqft_heated"] = _to_int(
|
|
(building_area.get("Base Area") or {}).get("heated", "0") or
|
|
(building_area.get("Total") or {}).get("heated", "0") or "0"
|
|
)
|
|
result["sqft_gross"] = _to_int(
|
|
(building_area.get("Total") or {}).get("gross", "0") or "0"
|
|
)
|
|
result["sqft_garage"] = _to_int(
|
|
(building_area.get("Finished Garage") or {}).get("gross", "0") or "0"
|
|
)
|
|
|
|
# Attributes: beds/baths/stories
|
|
def _attr_to_num(s):
|
|
if not s:
|
|
return None
|
|
try:
|
|
return float(s.split(".")[0]) if "." in s else float(s)
|
|
except (ValueError, TypeError):
|
|
return None
|
|
result["bedrooms"] = _attr_to_num(attrs.get("Bedrooms", ""))
|
|
result["baths"] = _attr_to_num(attrs.get("Baths", ""))
|
|
result["stories"] = _attr_to_num(attrs.get("Stories", ""))
|
|
result["units"] = _attr_to_num(attrs.get("Rooms / Units", ""))
|
|
|
|
# Structural elements
|
|
result["exterior_wall"] = structural.get("Exterior Wall", "")
|
|
result["roof_struct"] = structural.get("Roof Struct", "")
|
|
result["roofing_cover"] = structural.get("Roofing Cover", "")
|
|
result["interior_wall"] = structural.get("Interior Wall", "")
|
|
result["int_flooring"] = structural.get("Int Flooring", "")
|
|
|
|
# Sales history normalized
|
|
result["sales_history"] = []
|
|
for s in sales_history:
|
|
record = {
|
|
"book_page": _clean(s.get("book/page", "")),
|
|
"date": _clean(s.get("sale date", "")),
|
|
"price": _money_to_int(s.get("sale price", "") or "0"),
|
|
"deed_type": _clean(s.get("deed instrument type code", "") or s.get("deed type", "")),
|
|
"qualified": _clean(s.get("qualified/unqualified", "") or s.get("qualified", "")),
|
|
"vacant_improved": _clean(s.get("vacant/improved", "")),
|
|
}
|
|
if any(record.values()):
|
|
result["sales_history"].append(record)
|
|
|
|
# Land details
|
|
result["land"] = {
|
|
"use_code": land.get("code", ""),
|
|
"use_description": land.get("use description", ""),
|
|
"zoning": land.get("zoning assessment", ""),
|
|
"front": land.get("front", ""),
|
|
"depth": land.get("depth", ""),
|
|
"land_units": land.get("land units", ""),
|
|
"land_type": land.get("land type", ""),
|
|
"land_value": _money_to_int(land.get("land value", "") or "0"),
|
|
}
|
|
|
|
# Extra features (fireplace, pool, etc.)
|
|
result["extra_features"] = features
|
|
|
|
# Homestead detection: exemptions >= $25K = primary residence with HX
|
|
ex_last = result.get("tax_last_year_exemptions") or 0
|
|
ex_curr = result.get("tax_current_year_exemptions") or 0
|
|
result["homestead_active"] = (ex_last >= 25000) or (ex_curr >= 25000)
|
|
result["homestead_amount_current"] = ex_curr
|
|
result["homestead_amount_last"] = ex_last
|
|
|
|
# Convenience: most recent qualified sale price
|
|
qualified_sales = [s for s in result["sales_history"]
|
|
if s.get("qualified", "").lower().startswith("qualified")
|
|
and s.get("price", 0) and s["price"] >= 1000]
|
|
result["most_recent_qualified_sale"] = qualified_sales[0] if qualified_sales else None
|
|
|
|
# Effective renovation signal:
|
|
# If most recent qualified sale price >> previous qualified sale price by
|
|
# >30% within 24 months → likely renovated/flipped.
|
|
renov_signal = _detect_renovation_pattern(
|
|
result["sales_history"], listing_price=listing_price,
|
|
)
|
|
result["renovation_signal"] = renov_signal
|
|
|
|
except PWTimeout as e:
|
|
result["errors"].append(f"timeout: {e}")
|
|
except Exception as e:
|
|
import traceback
|
|
result["errors"].append(f"{type(e).__name__}: {e}")
|
|
result["_trace"] = traceback.format_exc()[:600]
|
|
|
|
return result
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Helpers — server retry / detail link extraction
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def _extract_detail_href_with_retry(page, max_retries: int = 2) -> Optional[str]:
|
|
"""Wait for Detail.aspx link on Results page, retry on server errors.
|
|
|
|
Duval PA returns intermittent 500 errors ("wait operation timed out")
|
|
when rate-limited. Retry with backoff handles that.
|
|
"""
|
|
for attempt in range(max_retries + 1):
|
|
# Wait for results to render
|
|
time.sleep(3)
|
|
try:
|
|
page.wait_for_selector("a[href*='Detail.aspx']", state="attached", timeout=15000)
|
|
except Exception:
|
|
pass
|
|
|
|
href = page.evaluate("""
|
|
() => {
|
|
const links = document.querySelectorAll("a[href*='Detail.aspx']");
|
|
return links.length > 0 ? links[0].getAttribute('href') : null;
|
|
}
|
|
""")
|
|
if href:
|
|
return href
|
|
|
|
# Check if this is a server error page
|
|
body = page.inner_text("body")[:500]
|
|
is_server_error = (
|
|
"Server Error" in body or
|
|
"wait operation timed out" in body or
|
|
"Runtime Error" in body
|
|
)
|
|
if is_server_error and attempt < max_retries:
|
|
# Backoff and retry — reload the search
|
|
backoff = 5 * (attempt + 1)
|
|
time.sleep(backoff)
|
|
try:
|
|
page.reload(wait_until="load", timeout=30000)
|
|
except Exception:
|
|
pass
|
|
continue
|
|
|
|
# If not server error, the link just isn't there — return None
|
|
return None
|
|
|
|
return None
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# Helpers
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
def _clean(s) -> str:
|
|
if not isinstance(s, str):
|
|
return s
|
|
return re.sub(r"\s+", " ", s).strip()
|
|
|
|
|
|
def _to_int(s) -> Optional[int]:
|
|
if not s:
|
|
return None
|
|
cleaned = re.sub(r"[^\d-]", "", str(s))
|
|
try:
|
|
return int(cleaned) if cleaned else None
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _money_to_int(s) -> Optional[int]:
|
|
if not s:
|
|
return None
|
|
cleaned = re.sub(r"[^\d.-]", "", str(s))
|
|
if not cleaned or cleaned == "-":
|
|
return None
|
|
try:
|
|
return int(float(cleaned))
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def _detect_renovation_pattern(sales: list[dict], listing_price: Optional[float] = None) -> dict:
|
|
"""Heuristic: detect flip / renovation / flip-in-progress patterns.
|
|
|
|
Args:
|
|
sales: sales_history (recent first)
|
|
listing_price: optional current listing price — habilita flip-in-progress detection
|
|
|
|
Returns:
|
|
{
|
|
"is_flip_pattern": bool, # qualified sales historical flip detected
|
|
"is_flip_in_progress": bool, # NEW: recent qualified << current listing
|
|
"evidence": str,
|
|
"most_recent_qualified": dict | None,
|
|
"prior_qualified": dict | None,
|
|
"value_increase_pct": float | None,
|
|
"months_between": int | None,
|
|
"listing_premium_pct": float | None, # NEW: (listing - recent_qualified) / recent_qualified * 100
|
|
"months_since_recent_sale": int | None,
|
|
"interpretation_es": str | None,
|
|
}
|
|
"""
|
|
out = {
|
|
"is_flip_pattern": False,
|
|
"is_flip_in_progress": False,
|
|
"evidence": "",
|
|
"most_recent_qualified": None,
|
|
"prior_qualified": None,
|
|
"value_increase_pct": None,
|
|
"months_between": None,
|
|
"listing_premium_pct": None,
|
|
"months_since_recent_sale": None,
|
|
"interpretation_es": None,
|
|
}
|
|
qualified = [s for s in sales
|
|
if s.get("qualified", "").lower().startswith("qualified")
|
|
and s.get("price", 0) and s["price"] >= 1000]
|
|
if not qualified:
|
|
return out
|
|
|
|
recent = qualified[0]
|
|
out["most_recent_qualified"] = recent
|
|
|
|
# ─── Pattern A: historical flip (prior qualified → recent qualified) ─────
|
|
if len(qualified) >= 2:
|
|
prior = qualified[1]
|
|
out["prior_qualified"] = prior
|
|
try:
|
|
increase = (recent["price"] - prior["price"]) / prior["price"] * 100
|
|
out["value_increase_pct"] = round(increase, 1)
|
|
except (TypeError, ZeroDivisionError):
|
|
pass
|
|
try:
|
|
d1 = datetime.strptime(recent["date"], "%m/%d/%Y")
|
|
d2 = datetime.strptime(prior["date"], "%m/%d/%Y")
|
|
months = abs((d1 - d2).days) // 30
|
|
out["months_between"] = months
|
|
except (ValueError, TypeError, KeyError):
|
|
pass
|
|
|
|
if out["value_increase_pct"] and out["months_between"]:
|
|
if out["value_increase_pct"] >= 25 and out["months_between"] <= 30:
|
|
out["is_flip_pattern"] = True
|
|
out["evidence"] = (
|
|
f"+{out['value_increase_pct']}% in {out['months_between']} months "
|
|
f"({prior['date']} ${prior['price']:,} -> {recent['date']} ${recent['price']:,})"
|
|
)
|
|
|
|
# ─── Pattern B: FLIP-IN-PROGRESS (recent qualified << current listing) ──
|
|
if listing_price and listing_price > 0 and recent.get("price", 0) > 0:
|
|
try:
|
|
premium = (listing_price - recent["price"]) / recent["price"] * 100
|
|
out["listing_premium_pct"] = round(premium, 1)
|
|
except (TypeError, ZeroDivisionError):
|
|
pass
|
|
try:
|
|
d_recent = datetime.strptime(recent["date"], "%m/%d/%Y")
|
|
today = datetime.now()
|
|
months_since = abs((today - d_recent).days) // 30
|
|
out["months_since_recent_sale"] = months_since
|
|
except (ValueError, TypeError, KeyError):
|
|
pass
|
|
|
|
# Flip-in-progress: recent qualified sale is 15%+ below listing AND
|
|
# the sale was within last 18 months (typical flip turnaround)
|
|
if (out["listing_premium_pct"] and out["listing_premium_pct"] >= 15
|
|
and out["months_since_recent_sale"] is not None
|
|
and out["months_since_recent_sale"] <= 18):
|
|
out["is_flip_in_progress"] = True
|
|
if out["evidence"]:
|
|
out["evidence"] += " | "
|
|
out["evidence"] += (
|
|
f"FLIP-IN-PROGRESS: owner bought ${recent['price']:,} on {recent['date']} "
|
|
f"({out['months_since_recent_sale']}mo ago), listing ${listing_price:,.0f} "
|
|
f"(+{out['listing_premium_pct']}%)"
|
|
)
|
|
|
|
# ─── Spanish interpretation ─────────────────────────────────────────────
|
|
if out["is_flip_in_progress"] and out["is_flip_pattern"]:
|
|
out["interpretation_es"] = (
|
|
"PATRON DE FLIP REPETIDO: la propiedad ya fue flipped una vez en el "
|
|
"historial. El owner actual la compro reciente y la lista mucho mas "
|
|
"alto. Probable renovacion reciente -> precio refleja inversion. "
|
|
"Si comprador final, esperate negociacion dura del owner (necesita "
|
|
"recuperar costos de rehab + margen)."
|
|
)
|
|
elif out["is_flip_in_progress"]:
|
|
out["interpretation_es"] = (
|
|
f"FLIP-IN-PROGRESS: el owner compro hace {out['months_since_recent_sale']}mo "
|
|
f"a ${recent['price']:,} y lista a ${listing_price:,.0f} (+{out['listing_premium_pct']:.0f}%). "
|
|
"Probable renovacion en el medio. Precio incluye trabajo. Negociar dificil — "
|
|
"owner tiene 'sunk cost' del rehab. Validar condicion real con inspeccion."
|
|
)
|
|
elif out["is_flip_pattern"]:
|
|
out["interpretation_es"] = (
|
|
f"HISTORIAL DE FLIP: la propiedad subio +{out['value_increase_pct']}% en "
|
|
f"{out['months_between']}mo (sale prior). Indica renovacion previa."
|
|
)
|
|
|
|
return out
|
|
|
|
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
# CLI
|
|
# ════════════════════════════════════════════════════════════════════════════
|
|
|
|
if __name__ == "__main__":
|
|
import argparse
|
|
import json
|
|
|
|
parser = argparse.ArgumentParser(description="Duval PA full record fetcher")
|
|
parser.add_argument("--address", help="Street address (e.g. '2352 SCENIC VIEW CT')")
|
|
parser.add_argument("--parcel", help="RE# (e.g. '044273-0370')")
|
|
parser.add_argument("--zip", help="Optional ZIP filter")
|
|
args = parser.parse_args()
|
|
|
|
if not args.address and not args.parcel:
|
|
parser.error("--address or --parcel required")
|
|
|
|
rec = fetch_duval_pa_record(
|
|
address=args.address, parcel_id=args.parcel, zip_code=args.zip,
|
|
)
|
|
print(json.dumps(rec, indent=2, default=str))
|