enrichment: Haiku flat details + image gallery on expand

apply service
- POST /internal/fetch-listing: headless Playwright fetch of a listing URL,
  returns {html, image_urls[], final_url}. Uses the same browser
  fingerprint/profile as the apply run so bot guards don't kick in

web service
- New enrichment pipeline (web/enrichment.py):
  /internal/flats → upsert → kick() enrichment in a background thread
    1. POST /internal/fetch-listing on apply
    2. llm.extract_flat_details(html, url) — Haiku tool-use call returns
       structured JSON (address, rooms, rent, description, pros/cons, etc.)
    3. Download each image directly to /data/flats/<slug>/NN.<ext>
    4. Persist enrichment_json + image_count + enrichment_status on the flat
- llm.py: minimal Anthropic /v1/messages wrapper, no SDK
- DB migration v5 adds enrichment_json/_status/_updated_at + image_count
- Admin "Altbestand anreichern" button (POST /actions/enrich-all) queues
  backfill for all pending/failed rows; runs in a detached task
- GET /partials/wohnung/<id> renders _wohnung_detail.html
- GET /flat-images/<slug>/<n> serves the downloaded image

UI
- Chevron on each list row toggles an inline detail pane (HTMX fetch on
  first open, hx-preserve keeps it open across the 3–30 s polls)
- CSS .flat-gallery normalises image tiles to a 4/3 aspect with object-fit:
  cover so different source sizes align cleanly
- "analysiert…" / "?" chips on the list reflect enrichment_status

Config
- ANTHROPIC_API_KEY + ANTHROPIC_MODEL wired into docker-compose's web
  service (default model: claude-haiku-4-5-20251001)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
EiSiMo 2026-04-21 14:46:12 +02:00
parent 2609d3504a
commit eb66284172
11 changed files with 688 additions and 44 deletions

168
web/enrichment.py Normal file
View file

@ -0,0 +1,168 @@
"""Flat-enrichment pipeline.
For each new flat we:
1. Ask the apply service to fetch the listing via Playwright (bypasses bot guards)
2. Feed the HTML to Haiku via `llm.extract_flat_details`, which returns a structured dict
3. Download each image URL directly into /data/flats/<slug>/NN.<ext>
4. Persist result on the flat row (enrichment_json + image_count + status)
Kicked as a detached asyncio task from /internal/flats so scraping stays fast.
A small queue cap + per-call lock would be next steps if we ever need them.
"""
from __future__ import annotations
import asyncio
import hashlib
import logging
import mimetypes
import os
from pathlib import Path
from typing import Optional
from urllib.parse import urlparse
import requests
import db
import llm
from settings import DATA_DIR, INTERNAL_API_KEY
logger = logging.getLogger("web.enrichment")
APPLY_FETCH_URL = os.environ.get("APPLY_URL", "http://apply:8000") + "/internal/fetch-listing"
IMAGES_DIR = DATA_DIR / "flats"
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
MAX_IMAGES = 12
MAX_IMAGE_BYTES = 3_000_000 # 3 MB per image
IMAGE_TIMEOUT = 15
def flat_slug(flat_id: str) -> str:
    """Return a short, filesystem-safe identifier for a flat (IDs are URLs)."""
    digest = hashlib.sha1(flat_id.encode("utf-8")).hexdigest()
    return digest[:16]
def flat_image_dir(flat_id: str) -> Path:
    """Create (if needed) and return the on-disk image directory for *flat_id*."""
    path = IMAGES_DIR / flat_slug(flat_id)
    path.mkdir(parents=True, exist_ok=True)
    return path
def _fetch_listing(url: str) -> Optional[dict]:
    """Ask the apply service to fetch *url* with Playwright.

    Returns the parsed {html, image_urls, final_url} payload, or None on
    network failure, HTTP error, or a malformed (non-JSON) response body.
    """
    try:
        r = requests.post(
            APPLY_FETCH_URL,
            headers={"X-Internal-Api-Key": INTERNAL_API_KEY},
            json={"url": url},
            timeout=90,  # Playwright fetch can legitimately take a while
        )
    except requests.RequestException as e:
        logger.warning("fetch-listing request failed for %s: %s", url, e)
        return None
    if r.status_code >= 400:
        logger.warning("fetch-listing %s: %s", r.status_code, r.text[:300])
        return None
    try:
        return r.json()
    except ValueError:
        # A 2xx with a non-JSON body would otherwise raise out of the
        # enrichment thread and leave the flat stuck in "pending".
        logger.warning("fetch-listing returned non-JSON body for %s", url)
        return None
def _ext_from_response(resp: requests.Response, url: str) -> str:
ct = resp.headers.get("content-type", "").split(";")[0].strip().lower()
if ct:
ext = mimetypes.guess_extension(ct) or ""
if ext:
return ext.replace(".jpe", ".jpg")
path = urlparse(url).path
_, ext = os.path.splitext(path)
return ext.lower() or ".jpg"
def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
    """Download up to MAX_IMAGES listing images into the flat's directory.

    Returns the number of images actually saved. Downloads that fail, are
    not images, or exceed MAX_IMAGE_BYTES are skipped entirely — an
    oversized download is aborted and its partial file removed, so we
    never keep (or count) a truncated image.
    """
    d = flat_image_dir(flat_id)
    # Clear any previous attempts so re-enrichment doesn't pile up dupes.
    for old in d.iterdir():
        try:
            old.unlink()
        except OSError:
            pass
    saved = 0
    for raw_url in urls[:MAX_IMAGES]:
        try:
            # stream=True + `with` so the connection is released even on the
            # early bail-outs (bad status, non-image, oversized body).
            with requests.get(
                raw_url,
                headers={"Referer": referer,
                         "User-Agent": "Mozilla/5.0 (lazyflat enricher)"},
                timeout=IMAGE_TIMEOUT,
                stream=True,
            ) as r:
                if r.status_code >= 400:
                    continue
                ct = r.headers.get("content-type", "").split(";")[0].strip().lower()
                if not ct.startswith("image/"):
                    continue
                ext = _ext_from_response(r, raw_url)
                path = d / f"{saved + 1:02d}{ext}"
                total = 0
                truncated = False
                with open(path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=65_536):
                        if not chunk:
                            continue
                        total += len(chunk)
                        if total > MAX_IMAGE_BYTES:
                            truncated = True
                            break
                        f.write(chunk)
                if total == 0 or truncated:
                    # Empty or over the size cap: drop the partial file.
                    path.unlink(missing_ok=True)
                    continue
                saved += 1
        except requests.RequestException as e:
            logger.info("image download failed %s: %s", raw_url, e)
            continue
    return saved
def enrich_flat_sync(flat_id: str) -> None:
    """Blocking end-to-end enrichment for a single flat.

    Fetches the listing through the apply service, extracts structured
    details via the LLM, downloads the images, then persists everything.
    Marks the flat "failed" when fetching or extraction comes back empty.
    """
    flat = db.get_flat(flat_id)
    if not flat:
        return
    listing_url = flat["link"]
    logger.info("enrich start flat=%s url=%s", flat_id, listing_url)

    listing = _fetch_listing(listing_url)
    if not listing:
        db.set_flat_enrichment(flat_id, "failed")
        return

    html = listing.get("html") or ""
    final_url = listing.get("final_url") or listing_url
    details = llm.extract_flat_details(html, final_url)
    if details is None:
        db.set_flat_enrichment(flat_id, "failed")
        return

    count = _download_images(flat_id, listing.get("image_urls") or [],
                             referer=listing_url)
    db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=count)
    logger.info("enrich done flat=%s images=%d", flat_id, count)
# Strong references to in-flight enrichment tasks: the event loop keeps only
# a weak reference to tasks, so a bare create_task() fire-and-forget can be
# garbage-collected before it finishes (see asyncio.create_task docs).
_enrich_tasks: set = set()

def kick(flat_id: str) -> None:
    """Fire-and-forget enrichment of one flat in a background thread.

    Must be called from within a running event loop.
    """
    task = asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id))
    _enrich_tasks.add(task)
    task.add_done_callback(_enrich_tasks.discard)
async def _backfill_runner() -> None:
    """Worker coroutine: sequentially enrich every flat still pending/failed."""
    rows = db.flats_needing_enrichment(limit=200)
    logger.info("enrich backfill: %d flats queued", len(rows))
    for row in rows:
        flat_id = row["id"]
        try:
            # Run off-thread so the event loop stays responsive per flat.
            await asyncio.to_thread(enrich_flat_sync, flat_id)
        except Exception:
            logger.exception("backfill step failed flat=%s", flat_id)
# Strong reference to the detached backfill task: asyncio only keeps a weak
# reference to tasks, so without this the runner could be garbage-collected
# mid-backfill (see asyncio.create_task docs).
_backfill_tasks: set = set()

def kick_backfill() -> int:
    """Queue enrichment for every flat still pending/failed.

    Returns how many flats are queued; the actual work happens in a
    detached task so the admin UI doesn't block for minutes. Must be
    called from within a running event loop.
    """
    pending = db.flats_needing_enrichment(limit=200)
    task = asyncio.create_task(_backfill_runner())
    _backfill_tasks.add(task)
    task.add_done_callback(_backfill_tasks.discard)
    return len(pending)