"""scrapers/_cache.py — File-based HTTP response cache for scrapers.

Every fetched URL is stored on disk with a configurable TTL, which avoids
re-fetching content that does not change during the TTL window (default 24h
for auction calendars).

Application: the Miami-Dade Clerk scraper, plus future scrapers for sources
that update daily.

KEY:   SHA256 of the URL (first 24 hex characters).
VALUE: response body (text/html).
PATH:  .cache/scrapers/{namespace}/{hash}.html + .meta.json

Meta schema:
    {
      "url": str,
      "fetched_at": iso8601,
      "ttl_seconds": int,
      "content_length": int,
      "status_code": int
    }
"""
from __future__ import annotations

import hashlib
import json
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional

# Cache root, absolute, anchored at the project directory.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT / ".cache" / "scrapers"

# TTL defaults
DEFAULT_TTL_SECONDS_DAILY = 86_400  # 24h — auction calendars update daily
DEFAULT_TTL_SECONDS_HOURLY = 3600  # 1h — live listings
DEFAULT_TTL_SECONDS_WEEKLY = 604_800  # 7d — slow data (court records, official records)

_META_SUFFIX = ".meta.json"
_HTML_SUFFIX = ".html"


def _url_hash(url: str) -> str:
    """Return the first 24 hex characters of the SHA256 digest of *url*."""
    return hashlib.sha256(url.encode("utf-8")).hexdigest()[:24]


def _paths(namespace: str, url: str) -> tuple[Path, Path]:
    """Return ``(html_path, meta_path)`` for *url* inside *namespace*.

    NOTE(review): *namespace* is joined onto the cache root as-is, so callers
    are expected to pass a plain directory name (no separators or ``..``).
    """
    digest = _url_hash(url)
    ns_dir = DEFAULT_CACHE_DIR / namespace
    return ns_dir / f"{digest}{_HTML_SUFFIX}", ns_dir / f"{digest}{_META_SUFFIX}"


def _atomic_write_text(path: Path, text: str) -> None:
    """Write *text* to *path* via a sibling temp file and an atomic rename.

    A crash in the middle of the write then never leaves a truncated file
    under the final name.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(path)
    except Exception:
        # Best-effort cleanup of the partial temp file; re-raise the real error.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise


def get_cached(
    namespace: str,
    url: str,
    *,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> Optional[str]:
    """Return the cached HTML for *url* if present and not expired.

    Args:
        namespace: Sub-directory of the cache root the entry lives in.
        url: URL whose response body is looked up.
        ttl_seconds: Maximum accepted age of the entry, in seconds. The
            caller's value is used; the ``ttl_seconds`` stored in the meta
            file is not consulted.

    Returns:
        The cached body, or ``None`` on a miss, an expired entry, or an
        unreadable/corrupt entry.
    """
    html_path, meta_path = _paths(namespace, url)

    # EAFP: a missing or unreadable file surfaces as OSError; invalid JSON and
    # invalid UTF-8 both surface as ValueError subclasses.
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    if not isinstance(meta, dict):
        # Valid JSON but not the expected object (e.g. a list): treat as a miss.
        return None

    fetched_at_str = meta.get("fetched_at")
    if not fetched_at_str or not isinstance(fetched_at_str, str):
        return None
    try:
        fetched_at = datetime.fromisoformat(fetched_at_str)
    except ValueError:
        return None
    if fetched_at.tzinfo is None:
        # save_cache always writes an aware UTC timestamp; a naive one (e.g. a
        # hand-edited meta file) is read as UTC so the subtraction below
        # cannot raise TypeError.
        fetched_at = fetched_at.replace(tzinfo=timezone.utc)

    age = (datetime.now(timezone.utc) - fetched_at).total_seconds()
    if age > ttl_seconds:
        return None

    try:
        return html_path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        return None


def save_cache(
    namespace: str,
    url: str,
    html: str,
    *,
    status_code: int = 200,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> None:
    """Persist *html* and its meta file for *url*.

    Args:
        namespace: Sub-directory of the cache root to store the entry in.
        url: URL the body was fetched from.
        html: Response body to store.
        status_code: HTTP status recorded in the meta file.
        ttl_seconds: TTL recorded in the meta file.

    Raises:
        OSError: If the directory or either file cannot be written.
    """
    html_path, meta_path = _paths(namespace, url)
    html_path.parent.mkdir(parents=True, exist_ok=True)

    meta = {
        "url": url,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "ttl_seconds": ttl_seconds,
        "content_length": len(html),  # length in characters, not bytes
        "status_code": status_code,
    }

    # Body first, meta second: if the process dies in between, the old meta
    # (older timestamp) stays in place, so the entry can only expire sooner.
    _atomic_write_text(html_path, html)
    _atomic_write_text(meta_path, json.dumps(meta, indent=2))


def cache_stats(namespace: Optional[str] = None) -> dict:
    """Return cache usage statistics for debugging.

    Args:
        namespace: Restrict the stats to this namespace; ``None`` or an empty
            string means the whole cache.

    Returns:
        A dict with keys ``entries`` (number of meta files), ``total_bytes``
        (size of meta files plus their HTML siblings), ``total_mb`` and
        ``namespaces`` (sorted parent directory names). The same keys are
        returned whether or not the cache directory exists.
    """
    root = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR

    total_bytes = 0
    entries = 0
    namespaces: set[str] = set()

    if root.exists():
        for meta_file in root.rglob(f"*{_META_SUFFIX}"):
            entries += 1
            try:
                total_bytes += meta_file.stat().st_size
                stem = meta_file.name[: -len(_META_SUFFIX)]
                html_sibling = meta_file.with_name(stem + _HTML_SUFFIX)
                if html_sibling.exists():
                    total_bytes += html_sibling.stat().st_size
                namespaces.add(meta_file.parent.name)
            except OSError:
                # The file vanished or is unreadable mid-scan; stats are
                # best-effort, so the entry is counted but not sized.
                continue

    return {
        "entries": entries,
        "total_bytes": total_bytes,
        "total_mb": round(total_bytes / 1024 / 1024, 2),
        "namespaces": sorted(namespaces),
    }


def clear_cache(namespace: Optional[str] = None) -> int:
    """Delete cache entries and return the number of files removed.

    Args:
        namespace: Restrict the deletion to this namespace; ``None`` or an
            empty string means the whole cache.

    Returns:
        Count of files deleted. Files that cannot be removed are skipped, and
        directories are left in place.
    """
    root = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR
    if not root.exists():
        return 0

    deleted = 0
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        try:
            entry.unlink()
        except OSError:
            # Best-effort: skip files that are locked or already gone.
            continue
        deleted += 1
    return deleted