feat: Agente-Marketing initial commit
This commit is contained in:
@@ -0,0 +1,461 @@
|
||||
import re, json, time, random, shutil, os
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from datetime import datetime
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
# Chrome executable handed to Playwright as `executable_path`.
# NOTE(review): Windows-only, machine-specific location — confirm before running elsewhere.
CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
# Real Chrome "User Data" directory that ensure_chrome_profile() copies files from.
# NOTE(review): hard-coded to a single Windows user account.
CHROME_PROFILE = r"C:\Users\aerom\AppData\Local\Google\Chrome\User Data"
# Scratch profile directory handed to Playwright as `user_data_dir`.
TEMP_PROFILE = r"C:\Temp\chrome_casa_hunter"
|
||||
|
||||
# Browser-like HTTP headers sent with the plain `requests.get` calls
# (scrape_hud and scrape_homepath); the Playwright scrapers do not use them.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}
|
||||
|
||||
# Supported Florida cities grouped under their county. City names are stored
# lower-case because get_county_for_city() lower-cases its argument before
# looking it up.
_CITIES_BY_COUNTY = (
    ("Indian River", ("vero beach", "sebastian", "fellsmere", "indian river shores")),
    ("Martin", ("stuart", "jensen beach", "hobe sound", "palm city", "port salerno")),
    ("St. Lucie", ("fort pierce", "port st. lucie")),
    ("Brevard", (
        "melbourne", "palm bay", "titusville", "cocoa", "cocoa beach",
        "rockledge", "merritt island", "cape canaveral", "satellite beach",
        "west melbourne",
    )),
    ("Volusia", (
        "daytona beach", "ormond beach", "new smyrna beach", "edgewater",
        "port orange", "deltona",
    )),
    ("Flagler", ("palm coast", "flagler beach", "bunnell")),
    ("St. Johns", ("st. augustine", "ponte vedra beach", "nocatee", "st. augustine beach")),
    ("Duval", ("jacksonville", "jacksonville beach", "atlantic beach", "neptune beach")),
    ("Nassau", ("fernandina beach", "yulee")),
)

# Flat lookup table: lower-case city name -> county name.
CITY_COUNTY_MAP = {
    city_name: county_name
    for county_name, city_names in _CITIES_BY_COUNTY
    for city_name in city_names
}
|
||||
|
||||
# County name -> numeric code (kept as a string) that scrape_hud() substitutes
# into the `sCounty` query parameter of the HUD Home Store URL.
# NOTE(review): the values look like Florida county FIPS suffixes; if that is
# what the site expects, Martin would be "85" ("86" is Miami-Dade) — confirm
# against the HUD Home Store before relying on the Martin entry.
COUNTY_CODES = {
    "Brevard": "9", "Duval": "31", "Flagler": "35", "Indian River": "61",
    "Martin": "86", "Nassau": "89", "St. Johns": "109", "St. Lucie": "111",
    "Volusia": "127",
}
|
||||
|
||||
|
||||
def get_county_for_city(city: str) -> str:
    """Return the county that *city* belongs to, or "" when it is unknown.

    The lookup is case-insensitive and ignores surrounding whitespace.
    Missing or non-string values (for example a JSON ``null`` city taken from
    a scraped listing) yield "" instead of raising ``AttributeError``.
    """
    if not isinstance(city, str):
        return ""
    normalized = city.lower().strip()
    if not normalized:
        return ""
    return CITY_COUNTY_MAP.get(normalized, "")
|
||||
|
||||
|
||||
def score_property(prop: dict, search_cities: list, max_price: int) -> int:
    """Score a listing from 0 to 100; higher means a better match.

    Scoring, starting from a base of 40:
      * price relative to ``max_price``: +35 (<= 60%), +25 (<= 75%),
        +15 (<= 90%), +8 (<= 100%), nothing when over budget;
      * +12 when the listing's city or county matches one of
        ``search_cities`` (case-insensitive substring match either way);
      * +8 for three or more bedrooms, +4 for two;
      * +10 for distressed/reduced statuses, otherwise +5 for new builds.

    Args:
        prop: Listing dict; reads "price", "city", "county", "beds", "status".
        search_cities: Place names the user searched for.
        max_price: Budget ceiling used for the price ratio.

    Returns:
        0 when the listing has no usable positive price, otherwise the score
        capped at 100.
    """
    price = prop.get("price", 0)
    if not isinstance(price, (int, float)) or price <= 0:
        return 0

    score = 40

    # A non-positive budget cannot produce a meaningful ratio; skip the price
    # bonus instead of dividing by zero.
    if isinstance(max_price, (int, float)) and max_price > 0:
        ratio = price / max_price
        for limit, bonus in ((0.60, 35), (0.75, 25), (0.90, 15), (1.0, 8)):
            if ratio <= limit:
                score += bonus
                break

    # Only non-empty names take part in the location match: an empty string is
    # a substring of everything, which used to hand the bonus to every listing
    # whose city or county was unknown.
    city = str(prop.get("city") or "").strip().lower()
    county = str(prop.get("county") or "").strip().lower()
    places = [place for place in (city, county) if place]
    for raw_term in search_cities or ():
        term = str(raw_term or "").strip().lower()
        if not term:
            continue
        if any(term in place or place in term for place in places):
            score += 12
            break

    beds = prop.get("beds") or 0
    if isinstance(beds, (int, float)):
        if beds >= 3:
            score += 8
        elif beds >= 2:
            score += 4

    status = str(prop.get("status") or "").lower()
    if any(w in status for w in ("foreclosure", "reo", "bank owned", "hud", "price reduced")):
        score += 10
    elif any(w in status for w in ("new construction", "newly built")):
        score += 5

    return min(score, 100)
|
||||
|
||||
|
||||
# ── Chrome profile setup ───────────────────────────────────────────────────────
|
||||
def ensure_chrome_profile():
    """Seed the temporary Chrome profile from the real one if it is missing.

    Copies a handful of files from ``CHROME_PROFILE/Default`` into
    ``TEMP_PROFILE/Default``. Each file is copied independently so that one
    failing copy does not prevent the remaining files from being copied.

    Returns:
        True when the temporary ``Default`` directory already exists or every
        file that was present was copied; False when the source profile is
        missing, the directory could not be created, or any copy failed.
    """
    dst = os.path.join(TEMP_PROFILE, "Default")
    if os.path.exists(dst):
        return True
    if not os.path.exists(CHROME_PROFILE):
        return False

    try:
        os.makedirs(dst, exist_ok=True)
    except OSError as e:
        print(f" Profile copy warning: {e}")
        return False

    src = os.path.join(CHROME_PROFILE, "Default")
    all_copied = True
    for item in ("Cookies", "Login Data", "Web Data", "Preferences"):
        s = os.path.join(src, item)
        if not os.path.exists(s):
            continue
        try:
            shutil.copy2(s, dst)
        except OSError as e:
            # Keep going: the other files are still worth having.
            print(f" Profile copy warning: {e}")
            all_copied = False
    return all_copied
|
||||
|
||||
|
||||
# ── Playwright helpers ─────────────────────────────────────────────────────────
|
||||
def _hd(a=1.2, b=3.0):
    """Sleep for a random duration drawn uniformly between *a* and *b* seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)
|
||||
|
||||
def _scroll(page, steps=4):
    """Scroll *page* down with *steps* mouse-wheel moves of random size.

    Each move scrolls 250-600 units vertically and is followed by a random
    pause of 0.4-0.9 seconds.
    """
    for _step in range(steps):
        delta_y = random.randint(250, 600)
        page.mouse.wheel(0, delta_y)
        pause = random.uniform(0.4, 0.9)
        time.sleep(pause)
|
||||
|
||||
def _type_human(page, text):
    """Type *text* through the page keyboard one character at a time.

    A random pause of 0.07-0.16 seconds follows every character.
    """
    for char in text:
        page.keyboard.type(char)
        gap = random.uniform(0.07, 0.16)
        time.sleep(gap)
|
||||
|
||||
def _parse_zillow_html(html, min_p=40000, max_p=230000):
    """Extract listings from the ``__NEXT_DATA__`` JSON embedded in a Zillow page.

    Args:
        html: Page source as a string.
        min_p: Lowest price (inclusive) to keep.
        max_p: Highest price (inclusive) to keep.

    Returns:
        A list of listing dicts (source, address, price, beds, baths, sqft,
        city, state, county, zipcode, status, url, image_url, property_type).
        Empty when the script tag is missing, the JSON cannot be decoded, or
        no listing falls inside the price range.
    """
    from urllib.parse import urljoin

    results = []
    if not html:
        return results

    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not m:
        return results

    try:
        data = json.loads(m.group(1))
    except ValueError as e:
        print(f" JSON parse error: {e}")
        return results

    # Walk the nested payload defensively: any level may be absent or null.
    node = data
    for key in ("props", "pageProps", "searchPageState", "cat1",
                "searchResults", "listResults"):
        node = node.get(key) if isinstance(node, dict) else None
    if not isinstance(node, list):
        return results

    for p in node:
        # Validate each listing on its own so one malformed entry does not
        # discard the listings that follow it.
        if not isinstance(p, dict):
            continue
        price = p.get("unformattedPrice", 0)
        if not isinstance(price, (int, float)):
            continue
        if not (min_p <= price <= max_p):
            continue

        city_val = p.get("addressCity", "")
        county = get_county_for_city(city_val) if isinstance(city_val, str) else ""

        hdp = p.get("hdpData")
        home_info = hdp.get("homeInfo") if isinstance(hdp, dict) else None
        home_type = home_info.get("homeType", "") if isinstance(home_info, dict) else ""

        # urljoin leaves an already-absolute detailUrl untouched instead of
        # prefixing the host a second time.
        detail = p.get("detailUrl", "")
        if not isinstance(detail, str):
            detail = ""

        results.append({
            "source": "Zillow",
            "address": p.get("address", ""),
            "price": price,
            "beds": p.get("beds", 0),
            "baths": p.get("baths", 0),
            "sqft": p.get("area", 0),
            "city": city_val,
            "state": p.get("addressState", "FL"),
            "county": county,
            "zipcode": str(p.get("addressZipcode", "")),
            "status": p.get("statusType", "For Sale"),
            "url": urljoin("https://www.zillow.com", detail),
            "image_url": p.get("imgSrc", ""),
            "property_type": home_type,
        })
    return results
|
||||
|
||||
|
||||
# ── Zillow via Playwright + Chrome profile ────────────────────────────────────
|
||||
def scrape_zillow(cities: list, max_price: int) -> list:
    """Collect Zillow for-sale listings for each city through a visible Chrome window.

    For every city the function opens zillow.com, types "<city>, FL" into the
    search box (or, when no search box is found, loads a direct search URL
    carrying the price filter), scrolls the results and parses the page with
    ``_parse_zillow_html``. When the page looks like an anti-bot challenge it
    waits up to 90 seconds for the user to solve it in the browser.

    Args:
        cities: City names to search.
        max_price: Upper price bound; the lower bound is fixed at 40000.

    Returns:
        Listings de-duplicated by address, or [] when Playwright is not
        installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(" Playwright no instalado — saltando Zillow")
        return []

    def _blocked(title, html):
        """Return True while the page still shows an anti-bot challenge."""
        return ("denied" in title.lower()
                or "px-captcha" in html
                or "cf-browser-verification" in html)

    ensure_chrome_profile()
    all_results = []
    min_price = 40000

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
                "--no-default-browser-check",
            ],
            viewport={"width": 1366, "height": 768},
        )
        try:
            page = ctx.new_page()

            for city in cities:
                city_q = f"{city}, FL"
                print(f" Zillow: buscando {city_q}...")
                try:
                    page.goto("https://www.zillow.com", wait_until="load", timeout=30000)
                    _hd(1.5, 2.5)

                    search = page.query_selector(
                        "#search-box-input, input[id*='search'], "
                        "input[placeholder*='address'], input[placeholder*='city']"
                    )
                    if search:
                        search.click()
                        _hd(0.3, 0.6)
                        # Select and clear whatever is already in the box.
                        page.keyboard.down("Control")
                        page.keyboard.press("a")
                        page.keyboard.up("Control")
                        page.keyboard.press("Delete")
                        _hd(0.2, 0.4)
                        _type_human(page, city_q)
                        _hd(0.8, 1.5)
                        page.keyboard.press("Enter")
                        page.wait_for_load_state("load", timeout=30000)
                        _hd(2, 3)
                        _scroll(page, 4)
                        _hd(1, 2)
                    else:
                        # Fallback: go straight to the search URL.
                        slug = re.sub(r"[^a-z0-9\s-]", "", city.lower()).strip().replace(" ", "-")
                        url = (f"https://www.zillow.com/homes/for_sale/{slug}-fl/"
                               f"?searchQueryState=%7B%22filterState%22%3A%7B"
                               f"%22price%22%3A%7B%22max%22%3A{max_price}%2C%22min%22%3A{min_price}%7D%7D%7D")
                        page.goto(url, wait_until="load", timeout=45000)
                        _hd(2, 3)
                        _scroll(page, 4)

                    html = page.content()
                    title = page.title()
                    # If a challenge page was served, wait (90s max) until the
                    # user solves it. The same predicate decides both "blocked"
                    # and "resolved" so a challenge is never reported as solved
                    # while one of its markers is still on the page.
                    if _blocked(title, html):
                        print(f" >> Cloudflare en {city}: resuelve el challenge en el browser (90s max)...")
                        deadline = time.time() + 90
                        while time.time() < deadline:
                            time.sleep(4)
                            html = page.content()
                            if not _blocked(page.title(), html):
                                print(" Challenge resuelto!")
                                break
                        else:
                            print(f" Timeout esperando challenge - saltando {city}")
                            continue

                    listings = _parse_zillow_html(html, min_price, max_price)
                    print(f" -> {len(listings)} encontradas")
                    all_results.extend(listings)

                except Exception as e:
                    # Best effort: a failure on one city must not stop the rest.
                    print(f" ERROR {city}: {e}")

                _hd(4, 7)  # pause between cities to reduce the chance of being blocked
        finally:
            # Always release the browser, even if the loop is interrupted.
            ctx.close()

    # De-duplicate by normalized address; listings without one are dropped.
    seen, unique = set(), []
    for r in all_results:
        key = str(r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique
|
||||
|
||||
|
||||
# ── Realtor.com via Playwright + Chrome profile ───────────────────────────────
|
||||
def scrape_realtor(cities: list, max_price: int) -> list:
    """Collect Realtor.com listings for each city through a visible Chrome window.

    Loads the Realtor.com search page for every city, scrolls it, and reads
    the listings from the ``__NEXT_DATA__`` JSON embedded in the page.

    Args:
        cities: City names to search.
        max_price: Upper price bound; listings under 40000 are also dropped.

    Returns:
        Listings de-duplicated by address, or [] when Playwright is not
        installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(" Playwright no instalado — saltando Realtor.com")
        return []

    from urllib.parse import urljoin

    def _as_dict(value):
        """Return *value* when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    ensure_chrome_profile()
    all_results = []

    with sync_playwright() as p:
        ctx = p.chromium.launch_persistent_context(
            user_data_dir=TEMP_PROFILE,
            executable_path=CHROME_PATH,
            headless=False,
            args=[
                "--profile-directory=Default",
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--no-first-run",
            ],
            viewport={"width": 1366, "height": 768},
        )
        try:
            page = ctx.new_page()

            for city in cities:
                # NOTE(review): Realtor.com search URLs may expect dashes
                # between the words of a city name — confirm this slug format.
                city_slug = re.sub(r"[^a-z0-9\s]", "", city.lower()).strip().replace(" ", "_")
                url = f"https://www.realtor.com/realestateandhomes-search/{city_slug}_FL/price-na-{max_price}"
                print(f" Realtor.com: buscando {city}...")
                try:
                    page.goto(url, wait_until="load", timeout=45000)
                    _hd(2, 3)
                    _scroll(page, 4)
                    _hd(1, 2)

                    html = page.content()
                    m = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
                    if not m:
                        continue
                    data = json.loads(m.group(1))
                    page_props = _as_dict(_as_dict(_as_dict(data).get("props")).get("pageProps"))
                    properties = page_props.get("properties")
                    if not isinstance(properties, list):
                        properties = []

                    for item in properties:
                        if not isinstance(item, dict):
                            continue
                        price = item.get("list_price", 0)
                        if not isinstance(price, (int, float)):
                            continue
                        price = int(price)
                        if not (40000 <= price <= max_price):
                            continue

                        # Any of these sub-objects may be null in the payload;
                        # treat null as "no data" rather than failing the city.
                        loc = _as_dict(_as_dict(item.get("location")).get("address"))
                        desc = _as_dict(item.get("description"))
                        photo = _as_dict(item.get("primary_photo"))
                        city_val = loc.get("city") or city

                        permalink = item.get("permalink")
                        if not isinstance(permalink, str):
                            permalink = ""
                        # urljoin supplies the missing "/" and leaves absolute
                        # URLs alone.
                        # NOTE(review): a bare slug may additionally need the
                        # "/realestateandhomes-detail/" prefix — confirm.
                        detail_url = urljoin("https://www.realtor.com/", permalink) if permalink else "https://www.realtor.com"

                        all_results.append({
                            "source": "Realtor.com",
                            "address": loc.get("line", ""),
                            "city": city_val,
                            "county": get_county_for_city(city_val) if isinstance(city_val, str) else "",
                            "zipcode": str(loc.get("postal_code", "")),
                            "price": price,
                            "beds": desc.get("beds", 0),
                            "baths": desc.get("baths_consolidated", 0),
                            "sqft": desc.get("sqft", 0),
                            "status": item.get("status", "For Sale"),
                            "url": detail_url,
                            "image_url": photo.get("href", ""),
                            "property_type": desc.get("type", ""),
                        })
                    print(f" -> {len(properties)} revisadas")

                except Exception as e:
                    # Best effort: a failure on one city must not stop the rest.
                    print(f" ERROR {city}: {e}")

                _hd(3, 5)
        finally:
            # Always release the browser, even if the loop is interrupted.
            ctx.close()

    # De-duplicate by normalized address; listings without one are dropped.
    seen, unique = set(), []
    for r in all_results:
        key = str(r.get("address") or "").lower().strip()
        if key and key not in seen:
            seen.add(key)
            unique.append(r)
    return unique
|
||||
|
||||
|
||||
# ── HUD Homes (requests - government site, no anti-bot) ───────────────────────
|
||||
def scrape_hud(cities: list, max_price: int) -> list:
    """Scrape HUD Home Store result pages for the counties of the given cities.

    Cities are mapped to counties with ``get_county_for_city``; when none
    maps, a fixed default set of counties is used. At most six counties are
    queried and at most eight rows are read per county.

    Args:
        cities: City names used to choose counties and to score results.
        max_price: Upper price bound sent to the site and re-checked locally.

    Returns:
        A list of scored property dicts. Each one carries the county name in
        both "city" and "county", and the search URL in "url".
    """
    results = []
    counties = {c for c in (get_county_for_city(city) for city in cities) if c}
    if not counties:
        counties = {"Brevard", "Indian River", "Duval", "St. Johns", "Volusia"}

    # Sort before slicing so the counties that get queried do not depend on
    # set iteration order.
    for county in sorted(counties)[:6]:
        code = COUNTY_CODES.get(county, "")
        if not code:
            continue
        url = (f"https://www.hudhomestore.gov/HudHomes/Index.aspx"
               f"?sState=FL&sCounty={code}&sPriceMax={max_price}&sPriceMin=30000")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            # Surface HTTP failures instead of parsing an error page as
            # "no results".
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            rows = soup.select("tr.propRow, .property-row, tr[id^='prop']")
            for row in rows[:8]:
                text = row.get_text(" ", strip=True)
                price_m = re.search(r'\$[\d,]+', text)
                if not price_m:
                    continue
                digits = re.sub(r'[^\d]', '', price_m.group())
                if not digits:
                    # e.g. "$," — matched the pattern but holds no number.
                    continue
                price = int(digits)
                if 0 < price <= max_price:
                    addr_el = row.select_one("a")
                    prop = {
                        "source": "HUD Homes",
                        "address": addr_el.get_text(strip=True) if addr_el else text[:80],
                        "city": county, "county": county,
                        "price": price, "url": url,
                        "status": "HUD Foreclosure",
                    }
                    prop["score"] = score_property(prop, cities, max_price)
                    results.append(prop)
        except Exception as e:
            # Best effort: report the county that failed and move on.
            print(f" HUD {county}: {e}")
    return results
|
||||
|
||||
|
||||
# ── Fannie Mae HomePath (REO) ──────────────────────────────────────────────────
|
||||
def scrape_homepath(cities: list, max_price: int) -> list:
    """Query Fannie Mae HomePath for listings near each of the first eight cities.

    Only JSON responses are used; at most six listings are read per city.

    Args:
        cities: City names; each is searched as "<city> FL".
        max_price: Upper price bound sent to the site and re-checked locally.

    Returns:
        A list of scored property dicts.
    """
    results = []
    for term in cities[:8]:
        url = (f"https://www.homepath.fanniemae.com/listings"
               f"?searchTerm={requests.utils.quote(term + ' FL')}"
               f"&maxPrice={max_price}&state=FL")
        try:
            r = requests.get(url, headers=HEADERS, timeout=15)
            # Surface HTTP failures instead of treating them as "no results".
            r.raise_for_status()
            data = r.json() if "json" in r.headers.get("content-type", "") else {}
            if not isinstance(data, dict):
                data = {}
            listings = data.get("listings", data.get("results", []))
            if not isinstance(listings, list):
                listings = []

            for item in listings[:6]:
                # Check each entry on its own so a null or malformed price
                # does not discard the listings after it.
                if not isinstance(item, dict):
                    continue
                price = item.get("listPrice", item.get("price", 0))
                if not isinstance(price, (int, float)):
                    continue
                if 0 < price <= max_price:
                    city_val = item.get("city") or term
                    prop = {
                        "source": "Fannie Mae HomePath",
                        "address": item.get("address", item.get("streetAddress", "")),
                        "city": city_val,
                        "county": get_county_for_city(city_val) if isinstance(city_val, str) else "",
                        "zipcode": str(item.get("postalCode", "")),
                        "price": price,
                        "beds": item.get("bedrooms", 0),
                        "baths": item.get("bathrooms", 0),
                        "sqft": item.get("squareFeet", 0),
                        "url": f"https://www.homepath.fanniemae.com/listings/{item.get('id','')}",
                        "status": "Fannie Mae REO",
                    }
                    prop["score"] = score_property(prop, cities, max_price)
                    results.append(prop)
        except Exception as e:
            # Best effort: report the search term that failed and move on.
            print(f" HomePath {term}: {e}")
    return results
|
||||
|
||||
|
||||
# ── Main runner ───────────────────────────────────────────────────────────────
|
||||
def run_all_scrapers(cities: list = None, max_price: int = 230000) -> dict:
    """Run every scraper, score the results and return them best-first.

    Each source runs inside its own try/except so one failing site does not
    stop the others. Within a source, listings are de-duplicated by
    (address, price) and listings without an address are dropped.

    Args:
        cities: City names to search; a default list is used when empty.
        max_price: Upper price bound passed to every scraper.

    Returns:
        A dict with "properties" (sorted by descending score), "log"
        (per-source count and status), "cities_searched", "max_price" and
        "ran_at" (UTC timestamp, ISO format without offset).
    """
    from datetime import timezone

    if not cities:
        cities = ["Vero Beach", "Jacksonville", "Melbourne", "St. Augustine"]

    all_props = []
    log = {}

    sources = {
        "Zillow (Browser)": lambda: scrape_zillow(cities, max_price),
        "Realtor.com (Browser)": lambda: scrape_realtor(cities, max_price),
        "HUD Homes": lambda: scrape_hud(cities, max_price),
        "Fannie Mae HomePath": lambda: scrape_homepath(cities, max_price),
    }

    for name, fn in sources.items():
        try:
            print(f"\n[{name}]")
            props = fn()
            seen, unique = set(), []
            for p in props:
                if not isinstance(p, dict):
                    continue
                key = ((p.get("address") or "").lower().strip(), p.get("price", 0))
                if key[0] and key not in seen:
                    seen.add(key)
                    p["score"] = score_property(p, cities, max_price)
                    unique.append(p)
            all_props.extend(unique)
            log[name] = {"found": len(unique), "status": "ok"}
            print(f" -> {len(unique)} propiedades validas")
        except Exception as e:
            # Top-level boundary: record the failure and keep going.
            log[name] = {"found": 0, "status": f"error: {e}"}
            print(f" ERROR {name}: {e}")

    all_props.sort(key=lambda x: x.get("score", 0), reverse=True)

    # datetime.utcnow() is deprecated; take an aware UTC time and drop the
    # tzinfo so the emitted string keeps its previous offset-less format.
    ran_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    return {
        "properties": all_props,
        "log": log,
        "cities_searched": cities,
        "max_price": max_price,
        "ran_at": ran_at,
    }
|
||||
Reference in New Issue
Block a user