146 lines
4.3 KiB
Python
146 lines
4.3 KiB
Python
"""scrapers/_cache.py — File-based HTTP response cache para scrapers.
|
|
|
|
Each fetched URL is stored with a configurable TTL. This avoids re-fetching
content that does not change during the TTL window (default 24h for auction
calendars).

Used by: the Miami-Dade Clerk scraper, plus future scrapers for sources that
update daily.

KEY: SHA-256 of the URL (truncated to the first 24 hex characters).
VALUE: response body (text/html).
PATH: .cache/scrapers/{namespace}/{hash}.html + {hash}.meta.json

Meta schema:
    {
        "url": str,
        "fetched_at": iso8601,
        "ttl_seconds": int,
        "content_length": int,
        "status_code": int
    }
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import json
|
|
from datetime import datetime, timedelta, timezone
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Cache root, resolved to an absolute path two directory levels above this file.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT.joinpath(".cache", "scrapers")

# Default TTLs, in seconds.
DEFAULT_TTL_SECONDS_DAILY = 24 * 60 * 60  # 24h — auction calendars update daily
DEFAULT_TTL_SECONDS_HOURLY = 60 * 60  # 1h — live listings
DEFAULT_TTL_SECONDS_WEEKLY = 7 * 24 * 60 * 60  # 7d — slow-moving data (court records, official records)
|
|
|
|
|
|
def _url_hash(url: str) -> str:
    """Return the first 24 hex characters of the SHA-256 digest of ``url``.

    The URL is encoded as UTF-8 before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(url.encode("utf-8"))
    full_digest = hasher.hexdigest()
    return full_digest[:24]
|
|
|
|
|
|
def _paths(namespace: str, url: str) -> tuple[Path, Path]:
    """Return ``(html_path, meta_path)`` for ``url`` inside ``namespace``.

    Both files live in ``DEFAULT_CACHE_DIR / namespace`` and share the URL
    hash as the leading part of their file name.
    """
    stem = _url_hash(url)
    base_dir = DEFAULT_CACHE_DIR / namespace
    html_path = base_dir / (stem + ".html")
    meta_path = base_dir / (stem + ".meta.json")
    return html_path, meta_path
|
|
|
|
|
|
def get_cached(
    namespace: str,
    url: str,
    *,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> Optional[str]:
    """Return the cached body for ``url``, or ``None`` on a miss.

    A miss is any of: the body or metadata file is absent or unreadable, the
    metadata is not a JSON object with a parseable ``fetched_at`` timestamp,
    or the entry is older than ``ttl_seconds``.

    Args:
        namespace: Cache sub-directory the entry was stored under.
        url: URL whose cached response is requested.
        ttl_seconds: Maximum accepted age of the entry, in seconds. The TTL
            recorded in the metadata file is not consulted.

    Returns:
        The cached text, or ``None`` when there is no usable entry.
    """
    html_path, meta_path = _paths(namespace, url)

    # EAFP: a missing or unreadable file surfaces as OSError; malformed JSON
    # and undecodable bytes are both ValueError subclasses.
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return None
    # Valid JSON that is not an object (e.g. a list) has no ``.get``.
    if not isinstance(meta, dict):
        return None

    fetched_at_raw = meta.get("fetched_at")
    if not fetched_at_raw or not isinstance(fetched_at_raw, str):
        return None
    try:
        fetched_at = datetime.fromisoformat(fetched_at_raw)
    except ValueError:
        return None
    # Subtracting a naive datetime from an aware one raises TypeError, so a
    # timestamp written without an offset is interpreted as UTC.
    if fetched_at.tzinfo is None:
        fetched_at = fetched_at.replace(tzinfo=timezone.utc)

    age_seconds = (datetime.now(timezone.utc) - fetched_at).total_seconds()
    if age_seconds > ttl_seconds:
        return None

    try:
        return html_path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        return None
|
|
|
|
|
|
def save_cache(
    namespace: str,
    url: str,
    html: str,
    *,
    status_code: int = 200,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> None:
    """Persist a response body and its metadata file.

    Args:
        namespace: Cache sub-directory to store the entry under (created if
            it does not exist).
        url: URL the response was fetched from; determines the file names.
        html: Response body to store, written as UTF-8.
        status_code: HTTP status code recorded in the metadata.
        ttl_seconds: TTL recorded in the metadata.

    Raises:
        OSError: If the directory or files cannot be written.
    """
    html_path, meta_path = _paths(namespace, url)
    html_path.parent.mkdir(parents=True, exist_ok=True)

    # Encode once so ``content_length`` is the stored byte size rather than a
    # character count, and so no platform newline translation happens on write.
    body = html.encode("utf-8")
    meta = {
        "url": url,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "ttl_seconds": ttl_seconds,
        "content_length": len(body),
        "status_code": status_code,
    }
    meta_bytes = json.dumps(meta, indent=2).encode("utf-8")

    # Body first, metadata second (same order as before). Each file is written
    # to a temporary sibling and then moved over the target, so an interrupted
    # write cannot leave a truncated file under the final name.
    for target, payload in ((html_path, body), (meta_path, meta_bytes)):
        tmp_path = target.with_name(target.name + ".tmp")
        tmp_path.write_bytes(payload)
        tmp_path.replace(target)
|
|
|
|
|
|
def cache_stats(namespace: Optional[str] = None) -> dict:
    """Return usage statistics for the cache, intended for debugging.

    Args:
        namespace: When given, only that sub-directory is inspected;
            otherwise the whole cache root is scanned recursively.

    Returns:
        A dict with ``entries`` (number of ``*.meta.json`` files found),
        ``total_bytes`` (size of metadata files plus their ``.html``
        siblings), ``total_mb`` (``total_bytes`` in MiB, rounded to two
        decimals) and ``namespaces`` (sorted names of the directories that
        contain entries). The same keys are returned when the cache
        directory does not exist.
    """
    root = (DEFAULT_CACHE_DIR / namespace) if namespace else DEFAULT_CACHE_DIR
    if not root.exists():
        # Same shape as the populated result so callers can read any key.
        return {"entries": 0, "total_bytes": 0, "total_mb": 0.0, "namespaces": []}

    meta_suffix = ".meta.json"
    entries = 0
    total_bytes = 0
    namespaces = set()
    for meta_file in root.rglob("*" + meta_suffix):
        entries += 1
        try:
            total_bytes += meta_file.stat().st_size
        except OSError:
            # Entry vanished or is unreadable; it is counted but not sized.
            continue
        namespaces.add(meta_file.parent.name)

        # Strip only the trailing suffix to find the sibling body file.
        html_name = meta_file.name[: -len(meta_suffix)] + ".html"
        try:
            total_bytes += meta_file.with_name(html_name).stat().st_size
        except OSError:
            # A metadata file without a readable body contributes only its
            # own size.
            pass

    return {
        "entries": entries,
        "total_bytes": total_bytes,
        "total_mb": round(total_bytes / 1024 / 1024, 2),
        "namespaces": sorted(namespaces),
    }
|
|
|
|
|
|
def clear_cache(namespace: Optional[str] = None) -> int:
    """Delete cached files and return how many were removed.

    Args:
        namespace: When given, only that sub-directory is purged; otherwise
            every file under the cache root is deleted.

    Returns:
        The number of files successfully deleted. Directories are left in
        place, and files that cannot be removed are skipped.
    """
    root = (DEFAULT_CACHE_DIR / namespace) if namespace else DEFAULT_CACHE_DIR
    if not root.exists():
        return 0

    deleted = 0
    for entry in root.rglob("*"):
        try:
            if not entry.is_file():
                continue
            entry.unlink()
        except OSError:
            # Best effort: a file that disappeared or cannot be removed is
            # skipped. Only filesystem errors are tolerated, so other
            # exceptions still surface.
            continue
        deleted += 1
    return deleted
|