feat: AR-House initial commit
This commit is contained in:
@@ -0,0 +1,145 @@
|
||||
"""scrapers/_cache.py — File-based HTTP response cache for scrapers.

Each fetched URL is stored with a configurable TTL, which avoids re-fetching
content that does not change within the TTL window (default: 24 hours, suited
to auction calendars).

Intended use: the Miami-Dade Clerk scraper, plus future scrapers for sources
that update daily.

KEY:   first 24 hex characters of the SHA-256 of the URL.
VALUE: response body (text/html).
PATH:  .cache/scrapers/{namespace}/{hash}.html + {hash}.meta.json

Meta schema:
    {
        "url": str,
        "fetched_at": iso8601,
        "ttl_seconds": int,
        "content_length": int,
        "status_code": int
    }
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Absolute cache root: two directory levels above this file, then
# ``.cache/scrapers``.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
DEFAULT_CACHE_DIR = _PROJECT_ROOT.joinpath(".cache", "scrapers")

# TTL defaults, in seconds.
# 24 hours: auction calendars update daily.
DEFAULT_TTL_SECONDS_DAILY = 24 * 60 * 60
# 1 hour: live listings.
DEFAULT_TTL_SECONDS_HOURLY = 60 * 60
# 7 days: slow-moving data (court records, official records).
DEFAULT_TTL_SECONDS_WEEKLY = 7 * 24 * 60 * 60
|
||||
|
||||
|
||||
def _url_hash(url: str) -> str:
    """Return the first 24 hex characters of the SHA-256 digest of *url*.

    The URL is encoded as UTF-8 before hashing.
    """
    hasher = hashlib.sha256()
    hasher.update(url.encode("utf-8"))
    full_hex = hasher.hexdigest()
    return full_hex[:24]
|
||||
|
||||
|
||||
def _paths(namespace: str, url: str) -> tuple[Path, Path]:
    """Return ``(html_path, meta_path)`` for *url* inside *namespace*.

    Both files live in ``DEFAULT_CACHE_DIR / namespace`` and share the
    URL-hash stem; only the extension differs (``.html`` vs ``.meta.json``).
    """
    stem = _url_hash(url)
    directory = DEFAULT_CACHE_DIR.joinpath(namespace)
    html_file = directory / (stem + ".html")
    meta_file = directory / (stem + ".meta.json")
    return html_file, meta_file
|
||||
|
||||
|
||||
def get_cached(
    namespace: str,
    url: str,
    *,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> Optional[str]:
    """Return the cached body for *url*, or ``None`` on a miss or expiry.

    An entry is a hit only when its meta file is readable JSON containing a
    parseable ``fetched_at`` timestamp no older than *ttl_seconds*, and the
    HTML file can be read as UTF-8. Every other condition is reported as a
    miss; this function does not raise for missing or corrupt cache files.

    Args:
        namespace: Cache sub-directory the entry belongs to.
        url: URL whose cached response is requested.
        ttl_seconds: Maximum accepted age of the entry, in seconds. The
            ``ttl_seconds`` value stored in the meta file is not consulted.

    Returns:
        The cached text, or ``None`` if there is no usable, fresh entry.
    """
    html_path, meta_path = _paths(namespace, url)

    # Read directly instead of checking exists() first: a missing file
    # surfaces as OSError, and there is no check-then-read race.
    try:
        meta = json.loads(meta_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both invalid JSON and invalid UTF-8.
        return None

    # Valid JSON is not necessarily an object (e.g. a list or a string).
    if not isinstance(meta, dict):
        return None

    fetched_at_raw = meta.get("fetched_at")
    if not fetched_at_raw or not isinstance(fetched_at_raw, str):
        return None
    try:
        fetched_at = datetime.fromisoformat(fetched_at_raw)
    except ValueError:
        return None

    # Subtracting a naive datetime from an aware one raises TypeError.
    # Timestamps without an offset are interpreted as UTC, which is what
    # the writer in this module uses.
    if fetched_at.tzinfo is None:
        fetched_at = fetched_at.replace(tzinfo=timezone.utc)

    age = (datetime.now(timezone.utc) - fetched_at).total_seconds()
    if age > ttl_seconds:
        return None

    try:
        return html_path.read_text(encoding="utf-8")
    except (OSError, ValueError):
        return None
|
||||
|
||||
|
||||
def _atomic_write_text(path: Path, text: str) -> None:
    """Write *text* (UTF-8) to *path* via a sibling temp file and a rename.

    The destination is only replaced once the full content has been written,
    so an interrupted write never leaves a truncated file at *path*.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(text, encoding="utf-8")
        tmp_path.replace(path)
    except OSError:
        # Best-effort cleanup of the partial temp file; the original error
        # is what the caller needs to see.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise


def save_cache(
    namespace: str,
    url: str,
    html: str,
    *,
    status_code: int = 200,
    ttl_seconds: int = DEFAULT_TTL_SECONDS_DAILY,
) -> None:
    """Persist *html* and its metadata for *url* under *namespace*.

    The namespace directory is created if needed. The HTML body is written
    first and the meta file second; each file is replaced atomically so a
    crash mid-write cannot leave a truncated body next to a still-valid meta
    file.

    Args:
        namespace: Cache sub-directory to store the entry in.
        url: URL the response was fetched from.
        html: Response body to store.
        status_code: HTTP status code recorded in the meta file.
        ttl_seconds: TTL recorded in the meta file.

    Raises:
        OSError: If the directory or either file cannot be written.
    """
    html_path, meta_path = _paths(namespace, url)
    html_path.parent.mkdir(parents=True, exist_ok=True)

    meta = {
        "url": url,
        "fetched_at": datetime.now(timezone.utc).isoformat(),
        "ttl_seconds": ttl_seconds,
        # Length in characters of the text, as in the original schema.
        "content_length": len(html),
        "status_code": status_code,
    }

    _atomic_write_text(html_path, html)
    _atomic_write_text(meta_path, json.dumps(meta, indent=2))
|
||||
|
||||
|
||||
def cache_stats(namespace: Optional[str] = None) -> dict:
    """Return usage statistics for the cache, intended for debugging.

    Args:
        namespace: If given, only that namespace directory is inspected;
            otherwise the whole cache root is scanned recursively.

    Returns:
        A dict with the same keys in every case:
        ``entries`` (number of ``*.meta.json`` files found),
        ``total_bytes`` (size of meta files plus their ``.html`` siblings),
        ``total_mb`` (``total_bytes`` in MiB, rounded to 2 decimals) and
        ``namespaces`` (sorted names of the directories holding entries).
        A missing cache directory yields zeros and an empty list.
    """
    root = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR

    meta_suffix = ".meta.json"
    total_bytes = 0
    entries = 0
    namespaces = set()

    if root.exists():
        for meta_file in root.rglob("*" + meta_suffix):
            entries += 1
            # Swap only the trailing suffix; str.replace would also rewrite
            # an earlier occurrence of ".meta.json" inside the name.
            html_sibling = meta_file.with_name(
                meta_file.name[: -len(meta_suffix)] + ".html"
            )
            try:
                total_bytes += meta_file.stat().st_size
                if html_sibling.exists():
                    total_bytes += html_sibling.stat().st_size
            except OSError:
                # File vanished or is unreadable: it still counts as an
                # entry, but contributes no further size or namespace.
                continue
            namespaces.add(meta_file.parent.name)

    return {
        "entries": entries,
        "total_bytes": total_bytes,
        "total_mb": round(total_bytes / 1024 / 1024, 2),
        "namespaces": sorted(namespaces),
    }
|
||||
|
||||
|
||||
def clear_cache(namespace: Optional[str] = None) -> int:
    """Delete cached files and return how many were removed.

    With *namespace* set, only that namespace directory is cleared; otherwise
    every file under the cache root is removed. Directories are left in
    place, and files that fail to delete are skipped without raising.
    """
    target = DEFAULT_CACHE_DIR / namespace if namespace else DEFAULT_CACHE_DIR
    if not target.exists():
        return 0

    removed = 0
    for entry in target.rglob("*"):
        if not entry.is_file():
            continue
        try:
            entry.unlink()
        except Exception:
            # Best-effort: a file that cannot be removed is not counted.
            continue
        removed += 1
    return removed
|
||||
Reference in New Issue
Block a user