AR-House/scrapers/_cache.py

"""scrapers/_cache.py — File-based HTTP response cache para scrapers.

Each fetched URL is stored with a configurable TTL. This avoids re-fetching
content that does not change within the TTL window (default 24h, for auction
calendars).

Used by: the Miami-Dade Clerk scraper, plus future scrapers for sources that
update daily.

KEY: first 24 hex characters of the SHA-256 of the URL. VALUE: response body (text/html).
PATH: .cache/scrapers/{namespace}/{hash}.html + {hash}.meta.json

Schema meta:
    {
      "url": str,
      "fetched_at": iso8601,
      "ttl_seconds": int,
      "content_length": int,
      "status_code": int
    }
"""
from __future__ import annotations

import hashlib
import json
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional

# Absolute cache root: "<project root>/.cache/scrapers", where the project root
# is the parent of the directory that contains this file.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT / ".cache" / "scrapers"

# Default TTLs, in seconds
DEFAULT_TTL_SECONDS_DAILY = 86_400        # 24h — auction calendars update daily
DEFAULT_TTL_SECONDS_HOURLY = 3600         # 1h — live listings
DEFAULT_TTL_SECONDS_WEEKLY = 604_800      # 7d — slow-moving data (court records, official records)


def _url_hash(url: str) -> str:
    """Return the cache key for *url*.

    The key is the first 24 hexadecimal characters of the SHA-256 digest of
    the UTF-8 encoded URL.
    """
    hasher = hashlib.sha256()
    hasher.update(url.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:24]


def _paths(namespace: str, url: str) -> tuple[Path, Path]:
    """Locate the two cache files that belong to *url* inside *namespace*.

    Both files live in ``DEFAULT_CACHE_DIR / namespace`` and share the URL
    hash as their stem.

    Returns:
        A ``(html_path, meta_path)`` pair: the ``.html`` body file and its
        ``.meta.json`` sidecar. Nothing is created on disk.
    """
    stem = _url_hash(url)
    directory = DEFAULT_CACHE_DIR / namespace
    html_file = directory / (stem + ".html")
    meta_file = directory / (stem + ".meta.json")
    return html_file, meta_file


def get_cached(
    namespace: str,
    url: str,
    *,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> Optional[str]:
    """Return the cached body for *url*, or ``None`` on a miss.

    Freshness is decided by the caller's ``ttl_seconds`` compared against the
    ``fetched_at`` timestamp in the metadata sidecar; the ``ttl_seconds``
    value recorded in the sidecar is not consulted.

    Any unusable entry is reported as a miss rather than raising:
    the metadata or body file is missing or unreadable, the metadata is not a
    JSON object, ``fetched_at`` is absent, not an ISO-8601 string, or has no
    UTC offset, or the entry is older than ``ttl_seconds``.

    Args:
        namespace: Cache sub-directory (one per scraper/source).
        url: URL whose cached response is wanted.
        ttl_seconds: Maximum accepted age of the entry, in seconds.

    Returns:
        The cached text, or ``None`` when there is no usable, fresh entry.
    """
    html_path, meta_path = _paths(namespace, url)

    # Read directly instead of checking exists() first: a file that vanishes
    # between the check and the read must still count as a miss.
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError: invalid JSON or UTF-8.
        return None
    if not isinstance(meta, dict):
        # Valid JSON but not an object (e.g. a list): no fields to read.
        return None

    fetched_at_raw = meta.get("fetched_at")
    if not fetched_at_raw or not isinstance(fetched_at_raw, str):
        return None
    try:
        fetched_at = datetime.fromisoformat(fetched_at_raw)
    except ValueError:
        return None
    if fetched_at.utcoffset() is None:
        # Subtracting a naive timestamp from an aware "now" raises TypeError;
        # its real offset is unknown, so treat the entry as a miss.
        return None

    age_seconds = (datetime.now(timezone.utc) - fetched_at).total_seconds()
    if age_seconds > ttl_seconds:
        return None

    try:
        return html_path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        return None


def _write_text_atomic(path: Path, text: str) -> None:
    """Write *text* (UTF-8) to *path* via a sibling temp file and a rename."""
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        # Path.replace overwrites the destination in a single rename step, so
        # readers see either the old file or the complete new one.
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp name is gone; after a failure
        # this removes the partial file. Never mask the original error.
        try:
            tmp_path.unlink()
        except OSError:
            pass


def save_cache(
    namespace: str,
    url: str,
    html: str,
    *,
    status_code: int = 200,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> None:
    """Persist a response body and its metadata sidecar for *url*.

    The body is written first and the metadata last. Each file is replaced
    through a temporary sibling so that a reader never observes a partially
    written file.

    Args:
        namespace: Cache sub-directory (one per scraper/source); created if
            it does not exist.
        url: URL the response was fetched from.
        html: Response body to store.
        status_code: HTTP status recorded in the metadata.
        ttl_seconds: TTL recorded in the metadata.

    Raises:
        OSError: If the directory or either file cannot be written.
    """
    html_path, meta_path = _paths(namespace, url)
    html_path.parent.mkdir(parents=True, exist_ok=True)
    _write_text_atomic(html_path, html)
    meta = {
        "url": url,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "ttl_seconds": ttl_seconds,
        # Size of the body in UTF-8 bytes; len(html) would count characters
        # and under-report any non-ASCII content.
        "content_length": len(html.encode("utf-8")),
        "status_code": status_code,
    }
    _write_text_atomic(meta_path, json.dumps(meta, indent=2))


def cache_stats(namespace: Optional[str] = None) -> dict:
    """Summarize cache usage on disk, for debugging.

    Every ``*.meta.json`` file found under the cache root (or under a single
    namespace directory) counts as one entry. Sizes are the on-disk sizes of
    the metadata file plus its ``.html`` sibling when that exists.

    Args:
        namespace: Restrict the scan to this namespace; ``None`` or an empty
            string scans the whole cache.

    Returns:
        A dict with the same four keys whether or not the directory exists:
        ``entries`` (int), ``total_bytes`` (int), ``total_mb`` (float, two
        decimals) and ``namespaces`` (sorted list of the parent directory
        names of the entries that could be measured).
    """
    root = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR
    meta_suffix = ".meta.json"
    entries = 0
    total_bytes = 0
    namespaces = set()

    if root.exists():
        for meta_file in root.rglob("*" + meta_suffix):
            entries += 1
            try:
                total_bytes += meta_file.stat().st_size
                # Strip only the trailing suffix to find the body file.
                stem = meta_file.name[: -len(meta_suffix)]
                html_sibling = meta_file.with_name(stem + ".html")
                if html_sibling.exists():
                    total_bytes += html_sibling.stat().st_size
                namespaces.add(meta_file.parent.name)
            except OSError:
                # Best effort: a file removed or unreadable mid-scan still
                # counts as an entry but contributes no further size.
                continue

    return {
        "entries": entries,
        "total_bytes": total_bytes,
        "total_mb": round(total_bytes / 1024 / 1024, 2),
        "namespaces": sorted(namespaces),
    }


def clear_cache(namespace: Optional[str] = None) -> int:
    """Delete cached files and report how many were removed.

    Walks the cache root (or a single namespace directory) recursively and
    unlinks every regular file found. Directories are left in place, and a
    file that cannot be removed is skipped without raising.

    Args:
        namespace: Restrict deletion to this namespace; ``None`` or an empty
            string clears the whole cache.

    Returns:
        Number of files actually deleted (0 if the directory does not exist).
    """
    target = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR
    if not target.exists():
        return 0

    removed = 0
    for entry in target.rglob("*"):
        if not entry.is_file():
            continue
        try:
            entry.unlink()
        except Exception:
            # Best effort: leave files that cannot be removed.
            continue
        removed += 1
    return removed