feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,820 @@
|
||||
"""data_fetchers/pa_duval.py — Full Duval County Property Appraiser extractor.
|
||||
|
||||
Site: https://paopropertysearch.coj.net (ASP.NET WebForms)
|
||||
Flow: Search.aspx → Results.aspx → Detail.aspx?ParcelNumber=XXX
|
||||
|
||||
Extracts everything public from the Duval PA to build a Property Snapshot Report:
|
||||
- Owner name(s)
|
||||
- Property address + subdivision + legal description
|
||||
- Building: type, year_built, sqft heated/total, bedrooms, bathrooms,
|
||||
exterior wall, roof type, interior flooring
|
||||
- Values: just/market, assessed, exemptions (3-year history)
|
||||
- Tax breakdown per taxing district
|
||||
- Complete sales history (book/page, date, price, deed type, qualified status)
|
||||
- Homestead exemption (key signal: owner-occupant vs investor)
|
||||
- Land details (zoning, lot size, use code)
|
||||
- Extra features (fireplace, pool, etc.)
|
||||
|
||||
USAGE:
|
||||
from data_fetchers.pa_duval import fetch_duval_pa_record
|
||||
rec = fetch_duval_pa_record(address="2352 SCENIC VIEW CT", zip_code="32218")
|
||||
# rec["year_built"], rec["sales_history"], rec["homestead_active"]...
|
||||
|
||||
TECHNICAL:
|
||||
- ASP.NET WebForms with WebForm_DoPostBackWithOptions (IE8 compatibility)
|
||||
- Element IDs are STABLE (not auto-generated)
|
||||
- Per-search latency: ~10-15s (entry → search → results → detail)
|
||||
- Free (Playwright local, no API cost)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
|
||||
# User-agent string passed to the Playwright browser context.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/131"

# Root of the Duval County Property Appraiser search site.
_BASE_URL = "https://paopropertysearch.coj.net"
# Basic search form; every fetch starts by loading this page.
_SEARCH_URL = f"{_BASE_URL}/Basic/Search.aspx"
|
||||
|
||||
# Address parser for the Duval ASP.NET search-form fields.
# Accepts long-form (COURT, STREET) and abbreviated (CT, ST) suffixes. The
# caller strips city/state/zip first (everything after the first comma).
#
# The street name may start with a digit so that numbered streets such as
# "123 8TH ST" or "123 E 103RD ST" parse (previously the name had to start
# with a letter, so those addresses were rejected or had the directional
# prefix folded into the name).
_ADDRESS_RE = re.compile(
    r"^\s*(?P<num>\d+)\s+"
    r"(?:(?P<prefix>N|S|E|W|NE|NW|SE|SW)\s+)?"
    r"(?P<name>[A-Z\d][A-Z\s\d\-']*?)"
    r"\s+(?P<suffix>"
    r"ST|STREET|AVE|AV|AVENUE|RD|ROAD|DR|DRIVE|CT|COURT|CIR|CIRCLE|"
    r"LN|LANE|BLVD|BOULEVARD|HWY|HIGHWAY|WAY|PL|PLACE|PKY|PKWY|PARKWAY|"
    r"TRL|TRAIL|TER|TERRACE|LOOP|RUN|ALY|ALLEY|XING|CROSSING"
    r")\b",
    re.IGNORECASE,
)

# Map long-form / variant suffixes to the value used for the ASP.NET
# ddStreetSuffix dropdown. Suffixes not listed here are passed through as-is.
_SUFFIX_NORMALIZE = {
    "STREET": "ST", "AVENUE": "AVE", "AV": "AVE", "ROAD": "RD",
    "DRIVE": "DR", "COURT": "CT", "CIRCLE": "CIR", "LANE": "LN",
    "BOULEVARD": "BLVD", "HIGHWAY": "HWY", "PLACE": "PL",
    "PARKWAY": "PKWY", "PKY": "PKWY", "TRAIL": "TRL", "TERRACE": "TER",
    "ALLEY": "ALY", "CROSSING": "XING",
}
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Field ID mapping (confirmed via probe on 2352 SCENIC VIEW CT)
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Result key -> DOM element id on the detail page. Each element's text content
# is read in a single page.evaluate() call and stored under the result key.
_SCALAR_IDS = {
    "owner_name": "ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
    "site_address_line1": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine1",
    "site_address_line2": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingSiteAddressLine2",
    "building_type": "ctl00_cphBody_repeaterBuilding_ctl00_lblBuildingType",
    "year_built": "ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
    "building_value": "ctl00_cphBody_repeaterBuilding_ctl00_lblBldgValue",
    # Tax values current + last year
    "tax_last_year_just": "ctl00_cphBody_lblTaxLastYearJustValue",
    "tax_last_year_assessed": "ctl00_cphBody_lblTaxLastYearAssessedValue",
    "tax_last_year_exemptions": "ctl00_cphBody_lblTaxLastYearExemptions",
    "tax_last_year_taxable": "ctl00_cphBody_lblTaxLastYearTaxableValue",
    "tax_current_year_just": "ctl00_cphBody_lblTaxCurrentYearJustValue",
    "tax_current_year_assessed": "ctl00_cphBody_lblTaxCurrentYearAssessedValue",
    "tax_current_year_exemptions": "ctl00_cphBody_lblTaxCurrentYearExemptions",
    "tax_current_year_taxable": "ctl00_cphBody_lblTaxCurrentYearTaxableValue",
    # Values from main values table (no current "in progress" year)
    "assessed_value_3": "ctl00_cphBody_lblAssessedValue3",
    "taxable_value_school": "ctl00_cphBody_lblTaxableValueSchool",
}
|
||||
|
||||
|
||||
def _parse_address(address: str) -> Optional[dict]:
    """Split a street address into the pieces the Duval search form expects.

    Examples of accepted input:
        '2352 SCENIC VIEW CT'                              (plain)
        '2352 SCENIC VIEW Court, Jacksonville, FL 32218'   (with city/state/zip)
        '123 N MAIN ST, Anytown, FL'                       (with prefix)

    Returns:
        A dict with keys ``street_num``, ``prefix`` (or None), ``name`` and
        ``suffix`` (normalized through ``_SUFFIX_NORMALIZE``), or None when
        the input is empty or does not match ``_ADDRESS_RE``.
    """
    if not address:
        return None

    # Only the text before the first comma is the street line.
    head, _, _ = address.partition(",")
    street_line = head.strip().upper()

    match = _ADDRESS_RE.search(street_line)
    if match is None:
        return None

    raw_suffix = (match.group("suffix") or "").strip().upper()
    if raw_suffix:
        suffix = _SUFFIX_NORMALIZE.get(raw_suffix, raw_suffix)
    else:
        suffix = None

    raw_prefix = (match.group("prefix") or "").strip().upper()

    return {
        "street_num": match.group("num"),
        "prefix": raw_prefix or None,
        "name": match.group("name").strip(),
        "suffix": suffix,
    }
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Public API
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def fetch_duval_pa_record(
    address: Optional[str] = None,
    parcel_id: Optional[str] = None,
    zip_code: Optional[str] = None,
    timeout_seconds: int = 30,
    listing_price: Optional[float] = None,
) -> dict:
    """Fetch the full Duval PA record by address OR parcel_id (RE#).

    Drives a headless Chromium (Playwright) through the search form, or goes
    straight to the detail page when ``parcel_id`` is given, then scrapes the
    detail page into a flat dict. This function does not raise: failures are
    reported in ``result["errors"]``.

    Args:
        address: street address (e.g. "2352 SCENIC VIEW CT"). Ignored when
            ``parcel_id`` is provided.
        parcel_id: Duval RE# (e.g. "044273-0370") — preferred when available.
        zip_code: optional ZIP filter for the address search.
        timeout_seconds: timeout per navigation / default Playwright op.
        listing_price: optional current listing price, forwarded to
            ``_detect_renovation_pattern`` for flip-in-progress detection.

    Returns:
        Dict that always contains ``county``, ``source``, ``fetched_at`` (ISO
        UTC timestamp) and ``errors`` (list of str, empty on success). On a
        successful scrape it also contains:
          - source_url: detail page URL
          - every key of ``_SCALAR_IDS`` (owner_name, site_address_line1/2,
            building_type, year_built as int, building_value and the
            tax_last_year_* / tax_current_year_* / assessed_value_3 /
            taxable_value_school amounts as int)
          - parcel_id, tax_district, property_use, num_buildings, subdivision,
            lot_total_sqft
          - building_area_grid, sqft_heated, sqft_gross, sqft_garage
          - bedrooms, baths, stories, units (float or None)
          - exterior_wall, roof_struct, roofing_cover, interior_wall,
            int_flooring
          - sales_history: [{book_page, date, price, deed_type, qualified,
            vacant_improved}, ...]
          - land: {use_code, use_description, zoning, front, depth,
            land_units, land_type, land_value}
          - extra_features: [{code, description, units, value}, ...]
          - homestead_active (bool: exemptions >= $25,000),
            homestead_amount_current, homestead_amount_last
          - most_recent_qualified_sale, renovation_signal
        On an unexpected exception a truncated traceback is stored under
        ``_trace``.
    """
    fetched_at = datetime.now(timezone.utc).isoformat()
    result = {
        "county": "Duval",
        "source": "Duval Property Appraiser (paopropertysearch.coj.net)",
        "fetched_at": fetched_at,
        "errors": [],
    }

    if not address and not parcel_id:
        result["errors"].append("no address or parcel_id provided")
        return result

    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        result["errors"].append("playwright not installed")
        return result

    parsed_addr = _parse_address(address) if address else None
    # An unparseable address is only fatal when there is no parcel_id to fall
    # back on; the parcel path never uses the parsed address.
    if address and not parsed_addr and not parcel_id:
        result["errors"].append(f"could not parse address '{address}' (need format: 'NUM [PREFIX] NAME SUFFIX')")
        return result

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            # try/finally guarantees the browser is closed on every exit path,
            # including early returns and exceptions raised while scraping.
            try:
                ctx = browser.new_context(user_agent=USER_AGENT)
                page = ctx.new_page()
                page.set_default_timeout(timeout_seconds * 1000)

                page.goto(_SEARCH_URL, wait_until="load", timeout=timeout_seconds * 1000)
                time.sleep(2)

                if parcel_id:
                    pid_clean = parcel_id.replace("-", "").strip()
                    detail_url = f"{_BASE_URL}/Basic/Detail.aspx?RE={pid_clean}"
                    try:
                        page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
                    except Exception:
                        # If 'load' times out, fall back to 'commit' (page has navigated)
                        page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
                        time.sleep(5)
                else:
                    # Search by address fields. Dropdown / ZIP failures are
                    # deliberately ignored: the search still works without them.
                    page.locator("#ctl00_cphBody_tbStreetNumber").fill(parsed_addr["street_num"])
                    if parsed_addr["prefix"]:
                        try:
                            page.locator("#ctl00_cphBody_ddStreetPrefix").select_option(value=parsed_addr["prefix"])
                        except Exception:
                            pass
                    page.locator("#ctl00_cphBody_tbStreetName").fill(parsed_addr["name"])
                    if parsed_addr["suffix"]:
                        try:
                            page.locator("#ctl00_cphBody_ddStreetSuffix").select_option(value=parsed_addr["suffix"])
                        except Exception:
                            pass
                    if zip_code:
                        try:
                            page.locator("#ctl00_cphBody_tbZipCode").fill(zip_code)
                        except Exception:
                            pass

                    page.locator("#ctl00_cphBody_bSearch").click()
                    page.wait_for_timeout(4000)

                    # If results table → extract href from first row link and
                    # navigate directly (Playwright click + navigation hangs on ASP.NET)
                    rs_url = page.url
                    if "Results.aspx" in rs_url:
                        detail_href = _extract_detail_href_with_retry(page, max_retries=2)
                        if not detail_href:
                            body_preview = page.inner_text("body")[:300].replace("\n", " ")
                            result["errors"].append(
                                f"results page returned but no Detail.aspx link found "
                                f"(url={page.url}, body_preview={body_preview!r})"
                            )
                            return result
                        # Build absolute URL and navigate directly (no click)
                        if detail_href.startswith("/"):
                            detail_url = f"{_BASE_URL}{detail_href}"
                        elif detail_href.startswith("http"):
                            detail_url = detail_href
                        else:
                            detail_url = f"{_BASE_URL}/Basic/{detail_href}"
                        try:
                            page.goto(detail_url, wait_until="load", timeout=timeout_seconds * 1000)
                        except Exception:
                            page.goto(detail_url, wait_until="commit", timeout=timeout_seconds * 1000)
                            time.sleep(5)
                    elif "Detail.aspx" not in page.url:
                        result["errors"].append(f"unexpected URL after search: {page.url}")
                        return result

                # We should now be on Detail.aspx
                if "Detail.aspx" not in page.url:
                    result["errors"].append(f"failed to reach Detail page, URL: {page.url}")
                    return result

                result["source_url"] = page.url

                # Wait for a KEY element to confirm full render before extracting.
                # Up to 3 attempts: on a server error page (Duval intermittent
                # 500s) back off, reload and try again.
                render_ok = False
                for attempt in range(3):
                    try:
                        page.wait_for_selector(
                            "#ctl00_cphBody_repeaterBuilding_ctl00_lblYearBuilt",
                            state="attached",
                            timeout=20000,
                        )
                        render_ok = True
                        break
                    except Exception:
                        # Try owner sentinel
                        try:
                            page.wait_for_selector(
                                "#ctl00_cphBody_repeaterOwnerInformation_ctl00_lblOwnerName",
                                state="attached",
                                timeout=10000,
                            )
                            render_ok = True
                            break
                        except Exception:
                            # Check if server error → retry
                            body = page.inner_text("body")[:500]
                            if "Server Error" in body or "Runtime Error" in body:
                                if attempt < 2:
                                    time.sleep(8 * (attempt + 1))
                                    try:
                                        page.reload(wait_until="load", timeout=30000)
                                    except Exception:
                                        pass
                                    continue
                            break
                if not render_ok:
                    # Best effort: record the problem but still extract whatever rendered.
                    result["errors"].append("detail page didn't render expected elements (server slow or error)")

                # Extract all scalar fields in one JS call
                scalars = page.evaluate(
                    """(ids) => {
                        const out = {};
                        for (const [k, id] of Object.entries(ids)) {
                            const el = document.getElementById(id);
                            out[k] = el ? (el.textContent || '').trim() : '';
                        }
                        return out;
                    }""",
                    _SCALAR_IDS,
                )

                # Extract sales history
                sales_history = page.evaluate("""
                    () => {
                        const out = [];
                        document.querySelectorAll('table').forEach((tbl) => {
                            const hdrCells = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
                            if (!hdrCells || hdrCells.length < 4) return;
                            const hdrText = Array.from(hdrCells).map(c => (c.textContent||'').trim().toLowerCase());
                            const isSalesHdr = hdrText.some(h => h.includes('sale date') || h.includes('book/page') || h.includes('deed instrument'));
                            if (!isSalesHdr) return;
                            const rows = tbl.querySelectorAll('tr');
                            for (let i = 1; i < rows.length; i++) {
                                const cells = rows[i].querySelectorAll('td');
                                if (cells.length < 4) continue;
                                const r = {};
                                cells.forEach((c, idx) => {
                                    const h = hdrText[idx] || `col${idx}`;
                                    r[h] = (c.textContent || '').trim();
                                });
                                if (Object.values(r).some(v => v && v.length > 0)) out.push(r);
                            }
                        });
                        return out;
                    }
                """)

                # Extract building area
                building_area = page.evaluate("""
                    () => {
                        const out = {};
                        const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingArea');
                        if (!grid) return out;
                        const rows = grid.querySelectorAll('tr');
                        for (let i = 1; i < rows.length; i++) {
                            const cells = rows[i].querySelectorAll('td');
                            if (cells.length < 4) continue;
                            const type = (cells[0].textContent || '').trim();
                            const gross = (cells[1].textContent || '').trim();
                            const heated = (cells[2].textContent || '').trim();
                            const effective = (cells[3].textContent || '').trim();
                            if (type) out[type] = { gross, heated, effective };
                        }
                        return out;
                    }
                """)

                # Extract building attributes (beds, baths, stories)
                attrs = page.evaluate("""
                    () => {
                        const out = {};
                        const grid = document.getElementById('ctl00_cphBody_repeaterBuilding_ctl00_gridBuildingAttributes');
                        if (!grid) return out;
                        const rows = grid.querySelectorAll('tr');
                        for (let i = 1; i < rows.length; i++) {
                            const cells = rows[i].querySelectorAll('td');
                            if (cells.length < 2) continue;
                            const element = (cells[0].textContent || '').trim();
                            const code = (cells[1].textContent || '').trim();
                            if (element) out[element] = code;
                        }
                        return out;
                    }
                """)

                # Extract building structural elements (roof, walls, flooring)
                # NOTE: these come from the same building section, different grid
                structural = page.evaluate("""
                    () => {
                        const out = {};
                        // Find any grid in building section with Element/Code/Detail headers
                        document.querySelectorAll('table').forEach((tbl) => {
                            const hdrs = tbl.querySelectorAll('tr')[0]?.querySelectorAll('th, td');
                            if (!hdrs) return;
                            const ht = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
                            if (!(ht.includes('element') && ht.includes('code') && ht.includes('detail'))) return;
                            // Skip the simpler attributes table (only 3 fields)
                            const rows = tbl.querySelectorAll('tr');
                            if (rows.length < 4) return;
                            for (let i = 1; i < rows.length; i++) {
                                const cells = rows[i].querySelectorAll('td');
                                if (cells.length < 3) continue;
                                const element = (cells[0].textContent || '').trim();
                                const detail = (cells[2].textContent || '').trim();
                                if (element && detail) {
                                    if (out[element]) {
                                        out[element] += '; ' + detail;
                                    } else {
                                        out[element] = detail;
                                    }
                                }
                            }
                        });
                        return out;
                    }
                """)

                # Extract main property identity (RE#, subdivision, etc.) from top table
                top_props = page.evaluate("""
                    () => {
                        const out = {};
                        document.querySelectorAll('table').forEach((tbl) => {
                            const rows = tbl.querySelectorAll('tr');
                            if (rows.length < 3) return;
                            // Top table has key:value rows (2 cells per row)
                            // Heuristic: first cell ends with ':' or matches known labels
                            const knownLabels = ['re #','re#','tax district','property use',
                                                 '# of buildings','legal desc','subdivision','total area'];
                            let matchCount = 0;
                            const candidate = {};
                            for (const tr of rows) {
                                const cells = tr.querySelectorAll('td, th');
                                if (cells.length !== 2) continue;
                                const k = (cells[0].textContent || '').trim().toLowerCase().replace(/:$/, '');
                                const v = (cells[1].textContent || '').trim();
                                if (k && v && knownLabels.some(kw => k.startsWith(kw))) {
                                    matchCount++;
                                    candidate[k] = v;
                                }
                            }
                            if (matchCount >= 3) {
                                Object.assign(out, candidate);
                            }
                        });
                        return out;
                    }
                """)

                # Land details
                land = page.evaluate("""
                    () => {
                        const out = {};
                        const grid = document.getElementById('ctl00_cphBody_gridLand');
                        if (!grid) return out;
                        const rows = grid.querySelectorAll('tr');
                        if (rows.length < 2) return out;
                        const hdrs = rows[0].querySelectorAll('th, td');
                        const hdrText = Array.from(hdrs).map(c => (c.textContent||'').trim().toLowerCase());
                        const dataRow = rows[1].querySelectorAll('td');
                        hdrText.forEach((h, i) => {
                            if (dataRow[i]) out[h] = (dataRow[i].textContent || '').trim();
                        });
                        return out;
                    }
                """)

                # Extra features (fireplace, pool, deck, etc.)
                features = page.evaluate("""
                    () => {
                        const out = [];
                        const grid = document.getElementById('ctl00_cphBody_gridExtraFeatures');
                        if (!grid) return out;
                        const rows = grid.querySelectorAll('tr');
                        for (let i = 1; i < rows.length; i++) {
                            const cells = rows[i].querySelectorAll('td');
                            if (cells.length < 5) continue;
                            out.push({
                                code: (cells[1]?.textContent || '').trim(),
                                description: (cells[2]?.textContent || '').trim(),
                                units: (cells[6]?.textContent || '').trim(),
                                value: (cells[7]?.textContent || '').trim(),
                            });
                        }
                        return out;
                    }
                """)
            finally:
                browser.close()

            # ─── Post-process ─────────────────────────────────────────────
            result.update({k: _clean(v) for k, v in scalars.items()})

            # Numeric conversions (empty strings are left untouched)
            for k in ("year_built",):
                v = result.get(k, "")
                if v:
                    result[k] = _to_int(v)
            for k in ("building_value", "tax_last_year_just", "tax_last_year_assessed",
                      "tax_last_year_exemptions", "tax_last_year_taxable",
                      "tax_current_year_just", "tax_current_year_assessed",
                      "tax_current_year_exemptions", "tax_current_year_taxable",
                      "assessed_value_3", "taxable_value_school"):
                v = result.get(k, "")
                if v:
                    result[k] = _money_to_int(v)

            # Parcel id / subdivision / etc from top props
            result["parcel_id"] = top_props.get("re #", "") or top_props.get("re#", "")
            result["tax_district"] = top_props.get("tax district", "")
            result["property_use"] = top_props.get("property use", "")
            result["num_buildings"] = top_props.get("# of buildings", "")
            result["subdivision"] = top_props.get("subdivision", "")
            result["lot_total_sqft"] = _to_int(top_props.get("total area", "") or "0")

            # Building area summary
            result["building_area_grid"] = building_area
            result["sqft_heated"] = _to_int(
                (building_area.get("Base Area") or {}).get("heated", "0") or
                (building_area.get("Total") or {}).get("heated", "0") or "0"
            )
            result["sqft_gross"] = _to_int(
                (building_area.get("Total") or {}).get("gross", "0") or "0"
            )
            result["sqft_garage"] = _to_int(
                (building_area.get("Finished Garage") or {}).get("gross", "0") or "0"
            )

            # Attributes: beds/baths/stories
            def _attr_to_num(s):
                """Parse an attribute cell to float, keeping fractions (e.g. 2.5 baths)."""
                if not s:
                    return None
                try:
                    return float(s)
                except (ValueError, TypeError):
                    pass
                # Fallback for values with trailing text after the decimal
                # point: use the part before the first '.'.
                try:
                    return float(s.split(".")[0])
                except (ValueError, TypeError, AttributeError):
                    return None

            result["bedrooms"] = _attr_to_num(attrs.get("Bedrooms", ""))
            result["baths"] = _attr_to_num(attrs.get("Baths", ""))
            result["stories"] = _attr_to_num(attrs.get("Stories", ""))
            result["units"] = _attr_to_num(attrs.get("Rooms / Units", ""))

            # Structural elements
            result["exterior_wall"] = structural.get("Exterior Wall", "")
            result["roof_struct"] = structural.get("Roof Struct", "")
            result["roofing_cover"] = structural.get("Roofing Cover", "")
            result["interior_wall"] = structural.get("Interior Wall", "")
            result["int_flooring"] = structural.get("Int Flooring", "")

            # Sales history normalized
            result["sales_history"] = []
            for s in sales_history:
                record = {
                    "book_page": _clean(s.get("book/page", "")),
                    "date": _clean(s.get("sale date", "")),
                    "price": _money_to_int(s.get("sale price", "") or "0"),
                    "deed_type": _clean(s.get("deed instrument type code", "") or s.get("deed type", "")),
                    "qualified": _clean(s.get("qualified/unqualified", "") or s.get("qualified", "")),
                    "vacant_improved": _clean(s.get("vacant/improved", "")),
                }
                if any(record.values()):
                    result["sales_history"].append(record)

            # Land details
            result["land"] = {
                "use_code": land.get("code", ""),
                "use_description": land.get("use description", ""),
                "zoning": land.get("zoning assessment", ""),
                "front": land.get("front", ""),
                "depth": land.get("depth", ""),
                "land_units": land.get("land units", ""),
                "land_type": land.get("land type", ""),
                "land_value": _money_to_int(land.get("land value", "") or "0"),
            }

            # Extra features (fireplace, pool, etc.)
            result["extra_features"] = features

            # Homestead detection: exemptions >= $25K = primary residence with HX
            ex_last = result.get("tax_last_year_exemptions") or 0
            ex_curr = result.get("tax_current_year_exemptions") or 0
            result["homestead_active"] = (ex_last >= 25000) or (ex_curr >= 25000)
            result["homestead_amount_current"] = ex_curr
            result["homestead_amount_last"] = ex_last

            # Convenience: most recent qualified sale price
            # (takes the first qualifying row, i.e. assumes the page lists
            # sales most-recent first)
            qualified_sales = [s for s in result["sales_history"]
                               if s.get("qualified", "").lower().startswith("qualified")
                               and s.get("price", 0) and s["price"] >= 1000]
            result["most_recent_qualified_sale"] = qualified_sales[0] if qualified_sales else None

            # Renovation / flip signal: thresholds live in
            # _detect_renovation_pattern (>=25% increase between the two most
            # recent qualified sales within 30 months, plus an optional
            # listing-price comparison).
            renov_signal = _detect_renovation_pattern(
                result["sales_history"], listing_price=listing_price,
            )
            result["renovation_signal"] = renov_signal

    except PWTimeout as e:
        result["errors"].append(f"timeout: {e}")
    except Exception as e:
        import traceback
        result["errors"].append(f"{type(e).__name__}: {e}")
        result["_trace"] = traceback.format_exc()[:600]

    return result
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers — server retry / detail link extraction
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _extract_detail_href_with_retry(page, max_retries: int = 2) -> Optional[str]:
    """Return the href of the first Detail.aspx link on the results page.

    The Duval PA site intermittently answers with a 500 page ("wait operation
    timed out") when rate-limited. When the link is missing and the body looks
    like such an error page, the page is reloaded after a growing backoff, up
    to ``max_retries`` times.

    Args:
        page: Playwright page currently showing the results page.
        max_retries: number of reload attempts after the first look.

    Returns:
        The raw ``href`` attribute of the first matching link, or None when
        no link is found (either the page is not an error page, or the
        retries are exhausted).
    """
    error_markers = ("Server Error", "wait operation timed out", "Runtime Error")

    attempt = 0
    while attempt <= max_retries:
        # Give the results page time to render, then wait for the link itself.
        time.sleep(3)
        try:
            page.wait_for_selector("a[href*='Detail.aspx']", state="attached", timeout=15000)
        except Exception:
            # Fall through: the evaluate() below decides whether a link exists.
            pass

        link_href = page.evaluate("""
            () => {
                const links = document.querySelectorAll("a[href*='Detail.aspx']");
                return links.length > 0 ? links[0].getAttribute('href') : null;
            }
        """)
        if link_href:
            return link_href

        # No link: only retry when the page looks like a server error page.
        page_text = page.inner_text("body")[:500]
        looks_like_server_error = any(marker in page_text for marker in error_markers)
        if not looks_like_server_error or attempt >= max_retries:
            return None

        # Linear backoff, then reload the search results.
        time.sleep(5 * (attempt + 1))
        try:
            page.reload(wait_until="load", timeout=30000)
        except Exception:
            pass
        attempt += 1

    return None
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# Helpers
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
def _clean(s) -> str:
    """Collapse whitespace runs to single spaces and trim the ends.

    Non-string values are returned unchanged.
    """
    if isinstance(s, str):
        collapsed = re.sub(r"\s+", " ", s)
        return collapsed.strip()
    return s
|
||||
|
||||
|
||||
def _to_int(s) -> Optional[int]:
    """Parse an integer out of a loosely formatted value.

    Thousands separators and other non-digit characters are stripped. A
    fractional part ("1,234.00", or a float such as 1234.0) is dropped instead
    of being merged into the number — previously the decimal point was removed
    along with the other punctuation, turning "1,234.00" into 123400.

    Returns:
        The parsed int, or None for falsy input or when no integer can be
        formed from the remaining characters.
    """
    if not s:
        return None
    # Drop a fractional part only when the '.' directly follows a digit, so
    # unrelated dots (e.g. "No. 12") keep their old behaviour.
    without_fraction = re.sub(r"(?<=\d)\.\d*", "", str(s))
    cleaned = re.sub(r"[^\d-]", "", without_fraction)
    try:
        return int(cleaned) if cleaned else None
    except ValueError:
        return None
|
||||
|
||||
|
||||
def _money_to_int(s) -> Optional[int]:
    """Convert a money-formatted value (e.g. "$1,234.56") to whole dollars.

    Everything except digits, '.' and '-' is discarded; the remainder is
    parsed as a float and truncated to int. Returns None for falsy input or
    when nothing numeric remains.
    """
    numeric_text = re.sub(r"[^\d.-]", "", str(s)) if s else ""
    if numeric_text in ("", "-"):
        return None
    try:
        amount = float(numeric_text)
    except ValueError:
        return None
    return int(amount)
|
||||
|
||||
|
||||
def _detect_renovation_pattern(sales: list[dict], listing_price: Optional[float] = None) -> dict:
    """Heuristic: detect flip / renovation / flip-in-progress patterns.

    Only "qualified" sales with a price of at least $1,000 are considered.

    Pattern A (historical flip): the most recent qualified sale is >= 25%
    above the previous qualified sale and the two are at most 30 months apart
    (months are counted as whole 30-day periods, so a resale within 30 days
    counts as 0 months and is still flagged).

    Pattern B (flip in progress): ``listing_price`` is >= 15% above the most
    recent qualified sale and that sale happened at most 18 months ago.

    Args:
        sales: sales_history (recent first); each item needs "qualified",
            "price" and "date" (MM/DD/YYYY).
        listing_price: optional current listing price — enables
            flip-in-progress detection.

    Returns:
        {
            "is_flip_pattern": bool,           # historical flip between qualified sales
            "is_flip_in_progress": bool,       # recent qualified << current listing
            "evidence": str,
            "most_recent_qualified": dict | None,
            "prior_qualified": dict | None,
            "value_increase_pct": float | None,
            "months_between": int | None,
            "listing_premium_pct": float | None,  # (listing - recent_qualified) / recent_qualified * 100
            "months_since_recent_sale": int | None,
            "interpretation_es": str | None,   # Spanish-language summary
        }
    """
    out = {
        "is_flip_pattern": False,
        "is_flip_in_progress": False,
        "evidence": "",
        "most_recent_qualified": None,
        "prior_qualified": None,
        "value_increase_pct": None,
        "months_between": None,
        "listing_premium_pct": None,
        "months_since_recent_sale": None,
        "interpretation_es": None,
    }
    qualified = [s for s in sales
                 if s.get("qualified", "").lower().startswith("qualified")
                 and s.get("price", 0) and s["price"] >= 1000]
    if not qualified:
        return out

    recent = qualified[0]
    out["most_recent_qualified"] = recent

    # ─── Pattern A: historical flip (prior qualified → recent qualified) ─────
    if len(qualified) >= 2:
        prior = qualified[1]
        out["prior_qualified"] = prior
        try:
            increase = (recent["price"] - prior["price"]) / prior["price"] * 100
            out["value_increase_pct"] = round(increase, 1)
        except (TypeError, ZeroDivisionError):
            pass
        try:
            d1 = datetime.strptime(recent["date"], "%m/%d/%Y")
            d2 = datetime.strptime(prior["date"], "%m/%d/%Y")
            months = abs((d1 - d2).days) // 30
            out["months_between"] = months
        except (ValueError, TypeError, KeyError):
            pass

        # Compare against None explicitly: months_between == 0 (resale within
        # 30 days) is a valid value and must not be treated as "unknown".
        if out["value_increase_pct"] is not None and out["months_between"] is not None:
            if out["value_increase_pct"] >= 25 and out["months_between"] <= 30:
                out["is_flip_pattern"] = True
                out["evidence"] = (
                    f"+{out['value_increase_pct']}% in {out['months_between']} months "
                    f"({prior['date']} ${prior['price']:,} -> {recent['date']} ${recent['price']:,})"
                )

    # ─── Pattern B: FLIP-IN-PROGRESS (recent qualified << current listing) ──
    if listing_price and listing_price > 0 and recent.get("price", 0) > 0:
        try:
            premium = (listing_price - recent["price"]) / recent["price"] * 100
            out["listing_premium_pct"] = round(premium, 1)
        except (TypeError, ZeroDivisionError):
            pass
        try:
            d_recent = datetime.strptime(recent["date"], "%m/%d/%Y")
            today = datetime.now()
            months_since = abs((today - d_recent).days) // 30
            out["months_since_recent_sale"] = months_since
        except (ValueError, TypeError, KeyError):
            pass

        # Flip-in-progress: recent qualified sale is 15%+ below listing AND
        # the sale was within last 18 months (typical flip turnaround)
        if (out["listing_premium_pct"] is not None and out["listing_premium_pct"] >= 15
                and out["months_since_recent_sale"] is not None
                and out["months_since_recent_sale"] <= 18):
            out["is_flip_in_progress"] = True
            if out["evidence"]:
                out["evidence"] += " | "
            out["evidence"] += (
                f"FLIP-IN-PROGRESS: owner bought ${recent['price']:,} on {recent['date']} "
                f"({out['months_since_recent_sale']}mo ago), listing ${listing_price:,.0f} "
                f"(+{out['listing_premium_pct']}%)"
            )

    # ─── Spanish interpretation ─────────────────────────────────────────────
    if out["is_flip_in_progress"] and out["is_flip_pattern"]:
        out["interpretation_es"] = (
            "PATRON DE FLIP REPETIDO: la propiedad ya fue flipped una vez en el "
            "historial. El owner actual la compro reciente y la lista mucho mas "
            "alto. Probable renovacion reciente -> precio refleja inversion. "
            "Si comprador final, esperate negociacion dura del owner (necesita "
            "recuperar costos de rehab + margen)."
        )
    elif out["is_flip_in_progress"]:
        out["interpretation_es"] = (
            f"FLIP-IN-PROGRESS: el owner compro hace {out['months_since_recent_sale']}mo "
            f"a ${recent['price']:,} y lista a ${listing_price:,.0f} (+{out['listing_premium_pct']:.0f}%). "
            "Probable renovacion en el medio. Precio incluye trabajo. Negociar dificil — "
            "owner tiene 'sunk cost' del rehab. Validar condicion real con inspeccion."
        )
    elif out["is_flip_pattern"]:
        out["interpretation_es"] = (
            f"HISTORIAL DE FLIP: la propiedad subio +{out['value_increase_pct']}% en "
            f"{out['months_between']}mo (sale prior). Indica renovacion previa."
        )

    return out
|
||||
|
||||
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
# CLI
|
||||
# ════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Command-line entry point: fetch one record and print it as indented JSON.
if __name__ == "__main__":
    import argparse
    import json

    cli = argparse.ArgumentParser(description="Duval PA full record fetcher")
    cli.add_argument("--address", help="Street address (e.g. '2352 SCENIC VIEW CT')")
    cli.add_argument("--parcel", help="RE# (e.g. '044273-0370')")
    cli.add_argument("--zip", help="Optional ZIP filter")
    opts = cli.parse_args()

    # At least one lookup key is required; argparse prints usage and exits.
    if not (opts.address or opts.parcel):
        cli.error("--address or --parcel required")

    record = fetch_duval_pa_record(
        address=opts.address,
        parcel_id=opts.parcel,
        zip_code=opts.zip,
    )
    # default=str stringifies any value json cannot serialize natively.
    print(json.dumps(record, indent=2, default=str))
|
||||
Reference in New Issue
Block a user