enrichment: Haiku flat details + image gallery on expand
apply service
- POST /internal/fetch-listing: headless Playwright fetch of a listing URL,
returns {html, image_urls[], final_url}. Uses the same browser
fingerprint/profile as the apply run so bot guards don't kick in
web service
- New enrichment pipeline (web/enrichment.py):
/internal/flats → upsert → kick() enrichment in a background thread
1. POST /internal/fetch-listing on apply
2. llm.extract_flat_details(html, url) — Haiku tool-use call returns
structured JSON (address, rooms, rent, description, pros/cons, etc.)
3. Download each image directly to /data/flats/<slug>/NN.<ext>
4. Persist enrichment_json + image_count + enrichment_status on the flat
- llm.py: minimal Anthropic /v1/messages wrapper, no SDK
- DB migration v5 adds enrichment_json/_status/_updated_at + image_count
- Admin "Altbestand anreichern" button (POST /actions/enrich-all) queues
backfill for all pending/failed rows; runs in a detached task
- GET /partials/wohnung/<id> renders _wohnung_detail.html
- GET /flat-images/<slug>/<n> serves the downloaded image
UI
- Chevron on each list row toggles an inline detail pane (HTMX fetch on
first open, hx-preserve keeps it open across the 3–30 s polls)
- CSS .flat-gallery normalises image tiles to a 4/3 aspect with object-fit:
cover so different source sizes align cleanly
- "analysiert…" / "?" chips on the list reflect enrichment_status
Config
- ANTHROPIC_API_KEY + ANTHROPIC_MODEL wired into docker-compose's web
service (default model: claude-haiku-4-5-20251001)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2609d3504a
commit
eb66284172
11 changed files with 688 additions and 44 deletions
168
web/enrichment.py
Normal file
168
web/enrichment.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
"""Flat-enrichment pipeline.
|
||||
|
||||
For each new flat we:
|
||||
1. Ask the apply service to fetch the listing via Playwright (bypasses bot guards)
|
||||
2. Feed the HTML to Haiku via `llm.extract_flat_details` → structured dict
|
||||
3. Download each image URL directly into /data/flats/<slug>/NN.<ext>
|
||||
4. Persist result on the flat row (enrichment_json + image_count + status)
|
||||
|
||||
Kicked as a detached asyncio task from /internal/flats so scraping stays fast.
|
||||
A small queue cap + per-call lock would be next steps if we ever need them.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import logging
|
||||
import mimetypes
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
import db
|
||||
import llm
|
||||
from settings import DATA_DIR, INTERNAL_API_KEY
|
||||
|
||||
logger = logging.getLogger("web.enrichment")


# Apply-service endpoint that renders a listing with Playwright and returns
# {html, image_urls[], final_url}. Base URL is overridable for local dev.
APPLY_FETCH_URL = os.environ.get("APPLY_URL", "http://apply:8000") + "/internal/fetch-listing"
# Root directory for downloaded listing photos: /data/flats/<slug>/NN.<ext>
IMAGES_DIR = DATA_DIR / "flats"
IMAGES_DIR.mkdir(parents=True, exist_ok=True)

# Hard caps so a single listing can't flood the disk or stall the pipeline.
MAX_IMAGES = 12
MAX_IMAGE_BYTES = 3_000_000  # 3 MB per image
IMAGE_TIMEOUT = 15  # seconds, per image download
|
||||
|
||||
|
||||
def flat_slug(flat_id: str) -> str:
    """Filesystem-safe short identifier for a flat (IDs are URLs)."""
    digest = hashlib.sha1(flat_id.encode("utf-8")).hexdigest()
    return digest[:16]
|
||||
|
||||
|
||||
def flat_image_dir(flat_id: str) -> Path:
    """Return (and create if needed) the image directory for one flat."""
    target = IMAGES_DIR / flat_slug(flat_id)
    target.mkdir(parents=True, exist_ok=True)
    return target
|
||||
|
||||
|
||||
def _fetch_listing(url: str) -> Optional[dict]:
    """Ask the apply service to fetch a listing page via Playwright.

    Returns the decoded {html, image_urls[], final_url} payload, or None on
    any transport, HTTP, or JSON-decode failure (callers treat None as
    "enrichment failed" and set the flat's status accordingly).
    """
    try:
        r = requests.post(
            APPLY_FETCH_URL,
            headers={"X-Internal-Api-Key": INTERNAL_API_KEY},
            json={"url": url},
            timeout=90,  # a headless Playwright fetch can legitimately be slow
        )
    except requests.RequestException as e:
        logger.warning("fetch-listing request failed for %s: %s", url, e)
        return None
    if r.status_code >= 400:
        logger.warning("fetch-listing %s: %s", r.status_code, r.text[:300])
        return None
    try:
        return r.json()
    except ValueError:
        # A 2xx with a non-JSON body (proxy error page, truncated reply)
        # would otherwise raise out of the whole enrichment run.
        logger.warning("fetch-listing returned non-JSON for %s", url)
        return None
|
||||
|
||||
|
||||
def _ext_from_response(resp: requests.Response, url: str) -> str:
|
||||
ct = resp.headers.get("content-type", "").split(";")[0].strip().lower()
|
||||
if ct:
|
||||
ext = mimetypes.guess_extension(ct) or ""
|
||||
if ext:
|
||||
return ext.replace(".jpe", ".jpg")
|
||||
path = urlparse(url).path
|
||||
_, ext = os.path.splitext(path)
|
||||
return ext.lower() or ".jpg"
|
||||
|
||||
|
||||
def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
    """Download up to MAX_IMAGES listing photos into the flat's image dir.

    Files are named 01.<ext>, 02.<ext>, ... so the web layer can address
    them by index. Returns the number of images actually saved. Responses
    that aren't images, fail, come back empty, or exceed MAX_IMAGE_BYTES
    are skipped.
    """
    d = flat_image_dir(flat_id)
    # Clear any previous attempts so re-enrichment doesn't pile up dupes.
    for old in d.iterdir():
        try:
            old.unlink()
        except OSError:
            pass

    saved = 0
    for raw_url in urls[:MAX_IMAGES]:
        try:
            # stream=True lets us bail out early on oversized files; the
            # with-block guarantees the pooled connection is released
            # (otherwise each streamed response leaks a socket).
            with requests.get(
                raw_url,
                headers={
                    "Referer": referer,
                    "User-Agent": "Mozilla/5.0 (lazyflat enricher)",
                },
                timeout=IMAGE_TIMEOUT,
                stream=True,
            ) as r:
                if r.status_code >= 400:
                    continue
                ct = r.headers.get("content-type", "").split(";")[0].strip().lower()
                if not ct.startswith("image/"):
                    continue
                ext = _ext_from_response(r, raw_url)
                path = d / f"{saved + 1:02d}{ext}"
                total = 0
                truncated = False
                with open(path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=65_536):
                        if not chunk:
                            continue
                        total += len(chunk)
                        if total > MAX_IMAGE_BYTES:
                            truncated = True
                            break
                        f.write(chunk)
                if total == 0 or truncated:
                    # An empty download is useless, and a partially written
                    # over-cap file would be a corrupt image — drop both.
                    path.unlink(missing_ok=True)
                    continue
                saved += 1
        except requests.RequestException as e:
            logger.info("image download failed %s: %s", raw_url, e)
            continue
    return saved
|
||||
|
||||
|
||||
def enrich_flat_sync(flat_id: str) -> None:
    """Run the full enrichment pipeline for one flat. Blocking.

    Pipeline: fetch listing via the apply service → extract structured
    details with the LLM → download images → persist everything on the
    flat row. Always ends with enrichment_status set to "ok" or "failed":
    kick() runs this in a fire-and-forget thread, so an uncaught exception
    would vanish and leave the row stuck in "pending" forever.
    """
    flat = db.get_flat(flat_id)
    if not flat:
        return
    url = flat["link"]
    logger.info("enrich start flat=%s url=%s", flat_id, url)
    try:
        listing = _fetch_listing(url)
        if not listing:
            db.set_flat_enrichment(flat_id, "failed")
            return

        details = llm.extract_flat_details(listing.get("html") or "",
                                           listing.get("final_url") or url)
        if details is None:
            db.set_flat_enrichment(flat_id, "failed")
            return

        image_urls = listing.get("image_urls") or []
        image_count = _download_images(flat_id, image_urls, referer=url)
    except Exception:
        # Anything unexpected (LLM client error, disk error, ...) must be
        # recorded as a failure rather than silently dropped.
        logger.exception("enrich failed flat=%s", flat_id)
        db.set_flat_enrichment(flat_id, "failed")
        return

    db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count)
    logger.info("enrich done flat=%s images=%d", flat_id, image_count)
|
||||
|
||||
|
||||
# Strong references to in-flight enrichment tasks: the event loop keeps only
# weak references, so an un-referenced fire-and-forget task can be
# garbage-collected before (or while) it runs.
_KICK_TASKS: set = set()


def kick(flat_id: str) -> None:
    """Fire-and-forget enrichment for one flat in a worker thread.

    Must be called from within a running event loop; the blocking pipeline
    runs via asyncio.to_thread so the caller returns immediately.
    """
    task = asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id))
    _KICK_TASKS.add(task)
    task.add_done_callback(_KICK_TASKS.discard)
|
||||
|
||||
|
||||
async def _backfill_runner() -> None:
    """Sequentially enrich every flat that still needs it (capped at 200)."""
    pending = db.flats_needing_enrichment(limit=200)
    logger.info("enrich backfill: %d flats queued", len(pending))
    for flat in pending:
        try:
            await asyncio.to_thread(enrich_flat_sync, flat["id"])
        except Exception:
            # One bad flat must not abort the rest of the backfill.
            logger.exception("backfill step failed flat=%s", flat["id"])
|
||||
|
||||
|
||||
# Strong reference holder for detached backfill tasks — asyncio tasks are
# only weakly referenced by the loop, so a dropped task object can be
# garbage-collected before it finishes.
_BACKFILL_TASKS: set = set()


def kick_backfill() -> int:
    """Queue enrichment for every flat still pending/failed.

    Returns how many flats are queued; the actual work happens in a
    detached task so the admin UI doesn't block for minutes. Must be
    called from within a running event loop.
    """
    pending = db.flats_needing_enrichment(limit=200)
    task = asyncio.create_task(_backfill_runner())
    _BACKFILL_TASKS.add(task)
    task.add_done_callback(_BACKFILL_TASKS.discard)
    return len(pending)
|
||||
Loading…
Add table
Add a link
Reference in a new issue