Files
AR-House/scrapers/_cache.py
T
2026-07-03 12:24:58 -04:00

146 lines
4.3 KiB
Python

"""scrapers/_cache.py — File-based HTTP response cache para scrapers.

Each fetched URL is stored with a configurable TTL. This avoids re-fetching
content that does not change within the TTL window (default 24h for auction
calendars).

Used by the Miami-Dade Clerk scraper and by future scrapers whose sources
update daily.

KEY: first 24 hex characters of the SHA-256 of the URL.
VALUE: response body (text/html).
PATH: .cache/scrapers/{namespace}/{hash}.html + {hash}.meta.json

Meta schema:
{
"url": str,
"fetched_at": iso8601,
"ttl_seconds": int,
"content_length": int,
"status_code": int
}
"""
from __future__ import annotations
import hashlib
import json
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional
# Absolute cache root: <two directories above this file>/.cache/scrapers
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT.joinpath(".cache", "scrapers")

# TTL presets, expressed in seconds
DEFAULT_TTL_SECONDS_DAILY = 24 * 60 * 60  # 24h — auction calendars update daily
DEFAULT_TTL_SECONDS_HOURLY = 60 * 60  # 1h — live listings
DEFAULT_TTL_SECONDS_WEEKLY = 7 * 24 * 60 * 60  # 7d — slow-moving data (court records, official records)
def _url_hash(url: str) -> str:
return hashlib.sha256(url.encode("utf-8")).hexdigest()[:24]
def _paths(namespace: str, url: str) -> tuple[Path, Path]:
    """Build the on-disk locations of a cache entry.

    Both files live in ``DEFAULT_CACHE_DIR / namespace`` and share the URL
    hash as their stem.

    Returns:
        ``(html_path, meta_path)``: the ``.html`` body file and its
        ``.meta.json`` companion.
    """
    stem = _url_hash(url)
    entry_dir = DEFAULT_CACHE_DIR.joinpath(namespace)
    body_file = entry_dir.joinpath(stem + ".html")
    meta_file = entry_dir.joinpath(stem + ".meta.json")
    return body_file, meta_file
def get_cached(
    namespace: str,
    url: str,
    *,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> Optional[str]:
    """Return the cached body for ``url`` if present and not expired.

    Args:
        namespace: Sub-directory of the cache root that holds the entry.
        url: URL whose cached response is looked up.
        ttl_seconds: Maximum accepted age of the entry, in seconds. Only this
            argument decides freshness; the ``ttl_seconds`` value stored in
            the meta file is not consulted.

    Returns:
        The cached text, or ``None`` on a miss, an expired entry, or an entry
        whose meta/body file is missing, unreadable or malformed.
    """
    html_path, meta_path = _paths(namespace, url)

    # EAFP: a missing meta file surfaces as OSError; invalid JSON and bad
    # UTF-8 both surface as ValueError subclasses.
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    # Valid JSON that is not an object (e.g. a list) has no .get(); treat it
    # as a corrupt entry instead of raising AttributeError.
    if not isinstance(meta, dict):
        return None

    fetched_at_str = meta.get("fetched_at")
    if not fetched_at_str or not isinstance(fetched_at_str, str):
        return None
    try:
        fetched_at = datetime.fromisoformat(fetched_at_str)
    except ValueError:
        return None
    # Subtracting a naive datetime from an aware one raises TypeError. A
    # timestamp without an offset is treated like any other malformed meta.
    if fetched_at.tzinfo is None:
        return None

    age = (datetime.now(timezone.utc) - fetched_at).total_seconds()
    if age > ttl_seconds:
        return None

    # The body file may be missing or unreadable even though the meta is fine.
    try:
        return html_path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        return None
def _atomic_write_text(path: Path, text: str) -> None:
    """Write ``text`` (UTF-8) to ``path`` via a sibling temp file and a rename."""
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        # Path.replace overwrites the destination in a single rename, so the
        # destination never holds a half-written file.
        tmp_path.replace(path)
    except BaseException:
        # Best-effort removal of the partial temp file; the original error
        # is what the caller needs to see.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise


def save_cache(
    namespace: str,
    url: str,
    html: str,
    *,
    status_code: int = 200,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> None:
    """Persist a response body and its metadata file.

    Each file is written to a temporary sibling and then renamed into place,
    so an interrupted write does not leave a truncated body or meta file
    behind under the final name.

    Args:
        namespace: Sub-directory of the cache root to store the entry in;
            created if it does not exist.
        url: URL the body was fetched from; determines the file names.
        html: Response body to store, written as UTF-8.
        status_code: HTTP status code recorded in the meta file.
        ttl_seconds: TTL recorded in the meta file.

    Raises:
        OSError: If the directory or either file cannot be written.
    """
    html_path, meta_path = _paths(namespace, url)
    html_path.parent.mkdir(parents=True, exist_ok=True)

    meta = {
        "url": url,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "ttl_seconds": ttl_seconds,
        "content_length": len(html),  # length in characters, not encoded bytes
        "status_code": status_code,
    }

    # Body first, meta second: if the process stops in between, the new body
    # is paired with the older timestamp and simply expires earlier.
    _atomic_write_text(html_path, html)
    _atomic_write_text(meta_path, json.dumps(meta, indent=2))
def cache_stats(namespace: Optional[str] = None) -> dict:
    """Return usage statistics for the cache, for debugging.

    Args:
        namespace: When given, only that sub-directory of the cache root is
            scanned; otherwise the whole cache root is scanned recursively.

    Returns:
        A dict with the same keys whether or not the cache exists:
        ``entries`` (number of ``*.meta.json`` files found), ``total_bytes``
        (size of those files plus their ``.html`` siblings), ``total_mb``
        (``total_bytes`` in MiB, rounded to 2 decimals) and ``namespaces``
        (sorted names of the directories that contain entries).
    """
    root = DEFAULT_CACHE_DIR
    if namespace:
        root = root / namespace
    if not root.exists():
        # Same shape as the normal result so callers can read "total_mb"
        # without special-casing an empty cache.
        return {"entries": 0, "total_bytes": 0, "total_mb": 0.0, "namespaces": []}

    meta_suffix = ".meta.json"
    total_bytes = 0
    entries = 0
    namespaces = set()
    for meta_file in root.rglob("*" + meta_suffix):
        entries += 1
        # Recorded before any stat() call so a counted entry always has its
        # namespace listed.
        namespaces.add(meta_file.parent.name)
        try:
            total_bytes += meta_file.stat().st_size
            stem = meta_file.name[: -len(meta_suffix)]
            html_sibling = meta_file.with_name(stem + ".html")
            if html_sibling.exists():
                total_bytes += html_sibling.stat().st_size
        except OSError:
            # A file vanished or is unreadable: keep the entry in the count
            # but skip its size.
            continue

    return {
        "entries": entries,
        "total_bytes": total_bytes,
        "total_mb": round(total_bytes / 1024 / 1024, 2),
        "namespaces": sorted(namespaces),
    }
def clear_cache(namespace: Optional[str] = None) -> int:
    """Delete cached files and report how many were removed.

    Args:
        namespace: When given, only files under that sub-directory of the
            cache root are removed; otherwise everything under the cache root.

    Returns:
        The number of files deleted. Files that could not be removed are
        skipped and not counted. Directories are left in place.
    """
    target = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR
    if not target.exists():
        return 0

    removed = 0
    for entry in target.rglob("*"):
        if not entry.is_file():
            continue
        try:
            entry.unlink()
        except Exception:
            # Best effort: a file that cannot be removed is skipped.
            continue
        removed += 1
    return removed