feat: AR-House initial commit

2026-07-03 12:24:58 -04:00
commit 047c05287a
216 changed files with 127552 additions and 0 deletions
@@ -0,0 +1,145 @@
+"""scrapers/_cache.py — File-based HTTP response cache for scrapers.
+
+Each fetched URL is stored with a configurable TTL. This avoids re-fetching
+content that does not change within the TTL window (default 24h for auction
+calendars).
+
+Used by: the Miami-Dade Clerk scraper, plus future scrapers for sources that
+update daily.
+
+KEY: first 24 hex characters of the SHA-256 of the URL.
+VALUE: response body (text/html).
+PATH: .cache/scrapers/{namespace}/{hash}.html + {hash}.meta.json
+
+Meta schema:
+    {
+      "url": str,
+      "fetched_at": iso8601,
+      "ttl_seconds": int,
+      "content_length": int,
+      "status_code": int
+    }
+"""
+from __future__ import annotations
+
+import hashlib
+import json
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Optional
+
+# Cache root: an absolute path anchored two directories above this file.
+_PROJECT_ROOT = Path(__file__).resolve().parent.parent
+DEFAULT_CACHE_DIR = _PROJECT_ROOT.joinpath(".cache", "scrapers")
+
+# Default TTLs, expressed in seconds.
+DEFAULT_TTL_SECONDS_HOURLY = 60 * 60                          # 1h — live listings
+DEFAULT_TTL_SECONDS_DAILY = 24 * DEFAULT_TTL_SECONDS_HOURLY   # 24h — auction calendars update daily
+DEFAULT_TTL_SECONDS_WEEKLY = 7 * DEFAULT_TTL_SECONDS_DAILY    # 7d — slow-moving data (court records, official records)
+
+
+def _url_hash(url: str) -> str:
+    """Return the cache key for ``url``.
+
+    The key is the first 24 hexadecimal characters of the SHA-256 digest of
+    the UTF-8 encoded URL.
+    """
+    hasher = hashlib.sha256()
+    hasher.update(url.encode("utf-8"))
+    full_hex = hasher.hexdigest()
+    return full_hex[:24]
+
+
+def _paths(namespace: str, url: str) -> tuple[Path, Path]:
+    """Return ``(html_path, meta_path)`` for ``url`` inside ``namespace``.
+
+    Both files live in ``DEFAULT_CACHE_DIR / namespace`` and share the URL
+    hash as the leading part of their file name.
+    """
+    stem = _url_hash(url)
+    directory = DEFAULT_CACHE_DIR / namespace
+    body_file = directory / (stem + ".html")
+    sidecar_file = directory / (stem + ".meta.json")
+    return body_file, sidecar_file
+
+
+def get_cached(
+    namespace: str,
+    url: str,
+    *,
+    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
+) -> Optional[str]:
+    """Return the cached HTML for ``url`` if present and not expired.
+
+    Args:
+        namespace: Cache sub-directory the entry was stored under.
+        url: URL whose response body is wanted.
+        ttl_seconds: Maximum accepted age of the entry, in seconds. The TTL
+            passed here is the one applied; the ``ttl_seconds`` value stored
+            in the metadata file is not consulted.
+
+    Returns:
+        The cached body as text, or ``None`` on a miss. A miss is any of:
+        missing/unreadable files, metadata that is not a JSON object, a
+        missing or unparseable ``fetched_at``, a ``fetched_at`` without
+        timezone information, or an entry older than ``ttl_seconds``.
+    """
+    html_path, meta_path = _paths(namespace, url)
+    try:
+        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
+        meta = json.loads(meta_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        return None
+    # Valid JSON that is not an object (e.g. a list) has no ``.get``.
+    if not isinstance(meta, dict):
+        return None
+    fetched_at_str = meta.get("fetched_at")
+    if not fetched_at_str or not isinstance(fetched_at_str, str):
+        return None
+    try:
+        fetched_at = datetime.fromisoformat(fetched_at_str)
+        # Subtracting a naive datetime from an aware one raises TypeError;
+        # such an entry has an unknown offset, so it is treated as a miss
+        # instead of crashing the caller.
+        age = (datetime.now(timezone.utc) - fetched_at).total_seconds()
+    except (TypeError, ValueError):
+        return None
+    if age > ttl_seconds:
+        return None
+    try:
+        return html_path.read_text(encoding="utf-8")
+    except (OSError, ValueError):
+        # Body missing, unreadable, or not valid UTF-8: fall back to a miss.
+        return None
+
+
+def save_cache(
+    namespace: str,
+    url: str,
+    html: str,
+    *,
+    status_code: int = 200,
+    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
+) -> None:
+    """Persist ``html`` and a JSON metadata sidecar for ``url``.
+
+    Args:
+        namespace: Cache sub-directory to store the entry under (created if
+            it does not exist).
+        url: URL the body was fetched from; determines the file names.
+        html: Response body, written as UTF-8 text.
+        status_code: Status code recorded in the metadata.
+        ttl_seconds: TTL recorded in the metadata.
+    """
+    body_file, sidecar_file = _paths(namespace, url)
+    body_file.parent.mkdir(parents=True, exist_ok=True)
+    body_file.write_text(html, encoding="utf-8")
+
+    # Timestamp is timezone-aware UTC, taken after the body has been written.
+    stamped_at = datetime.now(timezone.utc)
+    record = {
+        "url": url,
+        "fetched_at": stamped_at.isoformat(),
+        "ttl_seconds": ttl_seconds,
+        "content_length": len(html),  # length of the text in characters
+        "status_code": status_code,
+    }
+    serialized = json.dumps(record, indent=2)
+    sidecar_file.write_text(serialized, encoding="utf-8")
+
+
+def cache_stats(namespace: Optional[str] = None) -> dict:
+    """Return disk-usage statistics for the cache, for debugging.
+
+    Args:
+        namespace: When given, only that sub-directory of the cache is
+            inspected; otherwise the whole cache directory is.
+
+    Returns:
+        A dict with the same keys whether or not the cache directory exists:
+        ``entries`` (number of ``*.meta.json`` files found), ``total_bytes``
+        (size of those files plus their ``.html`` siblings), ``total_mb``
+        (``total_bytes`` in MiB, rounded to 2 decimals) and ``namespaces``
+        (sorted names of the directories that contain entries).
+    """
+    root = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR
+    meta_suffix = ".meta.json"
+    total_bytes = 0
+    entries = 0
+    namespaces = set()
+    if root.exists():
+        for meta_file in root.rglob("*" + meta_suffix):
+            entries += 1
+            try:
+                total_bytes += meta_file.stat().st_size
+                # Swap the trailing ".meta.json" for ".html" to find the body.
+                stem = meta_file.name[: -len(meta_suffix)]
+                html_sibling = meta_file.with_name(stem + ".html")
+                if html_sibling.exists():
+                    total_bytes += html_sibling.stat().st_size
+                namespaces.add(meta_file.parent.name)
+            except OSError:
+                # Best-effort stats: a file that vanished or cannot be
+                # stat'ed still counts as an entry but adds no further size.
+                continue
+    return {
+        "entries": entries,
+        "total_bytes": total_bytes,
+        "total_mb": round(total_bytes / 1024 / 1024, 2),
+        "namespaces": sorted(namespaces),
+    }
+
+
+def clear_cache(namespace: Optional[str] = None) -> int:
+    """Delete cached files and return how many were removed.
+
+    Args:
+        namespace: When given, only that sub-directory of the cache is
+            cleared; otherwise every file under the cache directory is.
+
+    Returns:
+        The number of files successfully deleted. Directories are left in
+        place, and files that fail to delete are skipped.
+    """
+    target = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR
+    removed = 0
+    if target.exists():
+        for candidate in target.rglob("*"):
+            if not candidate.is_file():
+                continue
+            try:
+                candidate.unlink()
+            except Exception:
+                # Best-effort cleanup: skip anything that cannot be removed.
+                continue
+            removed += 1
+    return removed