enrichment: Haiku flat details + image gallery on expand

apply service
- POST /internal/fetch-listing: headless Playwright fetch of a listing URL,
  returns {html, image_urls[], final_url}. Uses the same browser
  fingerprint/profile as the apply run so bot guards don't kick in

web service
- New enrichment pipeline (web/enrichment.py):
  /internal/flats → upsert → kick() enrichment in a background thread
    1. POST /internal/fetch-listing on apply
    2. llm.extract_flat_details(html, url) — Haiku tool-use call returns
       structured JSON (address, rooms, rent, description, pros/cons, etc.)
    3. Download each image directly to /data/flats/<slug>/NN.<ext>
    4. Persist enrichment_json + image_count + enrichment_status on the flat
- llm.py: minimal Anthropic /v1/messages wrapper, no SDK
- DB migration v5 adds enrichment_json/_status/_updated_at + image_count
- Admin "Altbestand anreichern" button (POST /actions/enrich-all) queues
  backfill for all pending/failed rows; runs in a detached task
- GET /partials/wohnung/<id> renders _wohnung_detail.html
- GET /flat-images/<slug>/<n> serves the downloaded image

UI
- Chevron on each list row toggles an inline detail pane (HTMX fetch on
  first open, hx-preserve keeps it open across the 3–30 s polls)
- CSS .flat-gallery normalises image tiles to a 4/3 aspect with object-fit:
  cover so different source sizes align cleanly
- "analysiert…" / "?" chips on the list reflect enrichment_status

Config
- ANTHROPIC_API_KEY + ANTHROPIC_MODEL wired into docker-compose's web
  service (default model: claude-haiku-4-5-20251001)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
EiSiMo 2026-04-21 14:46:12 +02:00
parent 2609d3504a
commit eb66284172
11 changed files with 688 additions and 44 deletions

119
web/llm.py Normal file
View file

@ -0,0 +1,119 @@
"""Minimal Anthropic Messages API wrapper for flat enrichment.
Uses tool-use forced output so Haiku returns structured JSON instead of free
text we'd have to regex. No SDK — plain `requests` is enough here.
"""
from __future__ import annotations
import logging
from typing import Any, Optional
import requests
from settings import ANTHROPIC_API_KEY, ANTHROPIC_MODEL
# Module logger; the name is hard-coded to "web.llm" rather than derived
# from __name__.
logger = logging.getLogger("web.llm")
# Endpoint that extract_flat_details() POSTs the request body to.
API_URL = "https://api.anthropic.com/v1/messages"
# Sent verbatim as the `anthropic-version` request header.
API_VERSION = "2023-06-01"
# Name of the single tool offered to the model. It is also used to force the
# tool choice and to pick the matching tool_use block out of the response.
TOOL_NAME = "record_flat_details"
# JSON Schema passed as the tool's `input_schema`. Every scalar field is
# nullable and nothing is required, so the model can leave out (or null) any
# value it cannot find in the listing HTML instead of inventing one.
TOOL_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        # --- location and size ---
        "address": {
            "type": ["string", "null"],
            "description": "Full street address incl. postcode+city if present",
        },
        "rooms": {"type": ["number", "null"], "description": "Number of rooms (decimal ok)"},
        "size_sqm": {"type": ["number", "null"], "description": "Size in m²"},
        # --- costs ---
        "rent_cold": {"type": ["number", "null"], "description": "Kaltmiete in €"},
        "rent_total": {"type": ["number", "null"], "description": "Warm/Gesamtmiete in €"},
        "utilities": {"type": ["number", "null"], "description": "Nebenkosten in €"},
        "deposit": {"type": ["number", "null"], "description": "Kaution in €"},
        # --- free-text facts, kept as strings ---
        "available_from": {"type": ["string", "null"], "description": "Bezugsfrei ab (text)"},
        "floor": {"type": ["string", "null"], "description": "Etage (text, z.B. '3. OG')"},
        "heating": {"type": ["string", "null"]},
        "energy_certificate": {"type": ["string", "null"]},
        "energy_value": {"type": ["string", "null"]},
        "year_built": {"type": ["string", "null"]},
        # --- WBS (Wohnberechtigungsschein) ---
        "wbs_required": {"type": ["boolean", "null"]},
        "wbs_type": {"type": ["string", "null"], "description": "WBS-Typ, z.B. '160' oder null"},
        # --- generated summary ---
        # The ranges below are "2–3" and "2–4"; without the dash the model
        # would be asked for 23 sentences / 24 bullet points.
        "description": {
            "type": ["string", "null"],
            "description": "Kurze 2–3-Satz-Beschreibung der Wohnung auf Deutsch. Fakten, keine Werbesprache.",
        },
        "features": {
            "type": "array", "items": {"type": "string"},
            "description": "Ausstattungsmerkmale (z.B. 'Balkon', 'Einbauküche', 'Parkett')",
        },
        "pros": {
            "type": "array", "items": {"type": "string"},
            "description": "2–4 konkrete Vorteile aus Bewerbersicht (keine Werbung)",
        },
        "cons": {
            "type": "array", "items": {"type": "string"},
            "description": "2–4 mögliche Nachteile / Punkte zum Beachten",
        },
    },
    "required": [],
    "additionalProperties": False,
}
# System prompt for the extraction call (German, because the listings are).
# Written one sentence per element; the sentences are joined with single
# spaces into one line of text.
SYSTEM_PROMPT = " ".join([
    (
        "Du extrahierst strukturierte Wohnungsdaten aus deutschem HTML-Quelltext von"
        " Berliner Wohnungsbaugesellschaften (howoge, gewobag, degewo, gesobau, wbm,"
        " stadt-und-land)."
    ),
    "Antworte AUSSCHLIESSLICH über den bereitgestellten Tool-Call.",
    "Fehlende Werte → null.",
    "Keine Erfindungen — wenn etwas nicht klar aus dem HTML hervorgeht, lass das Feld null.",
    "Zahlen bitte als Zahlen (nicht als String), Beschreibung/Pros/Cons auf Deutsch.",
])
def extract_flat_details(html: str, url: str,
                         max_html_chars: int = 60_000,
                         timeout: int = 60) -> Optional[dict]:
    """Extract structured flat details from listing HTML via the Messages API.

    Sends one request that forces the model to answer through the
    ``TOOL_NAME`` tool, so the result arrives as a JSON object shaped by
    ``TOOL_SCHEMA`` rather than as free text.

    Args:
        html: Listing page source. Only the first ``max_html_chars``
            characters are sent.
        url: Listing URL, included in the prompt alongside the HTML.
        max_html_chars: Upper bound on the number of HTML characters sent.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The ``input`` dict of the matching ``tool_use`` block (``{}`` when
        that input is missing or empty), or ``None`` when the API key is not
        configured, the request fails, the API answers with a status >= 400,
        the body is not a JSON object, or no matching block is present.
        Every failure is logged; none is raised to the caller.
    """
    if not ANTHROPIC_API_KEY:
        logger.info("skipping enrichment: ANTHROPIC_API_KEY not set")
        return None

    user_content = (
        f"URL: {url}\n\n"
        f"HTML-Quellcode (ggf. gekürzt):\n---\n{html[:max_html_chars]}\n---"
    )
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 1500,
        "system": SYSTEM_PROMPT,
        "tools": [{
            "name": TOOL_NAME,
            "description": "Persist the extracted flat details.",
            "input_schema": TOOL_SCHEMA,
        }],
        # Force the tool call so the reply is structured, never free text.
        "tool_choice": {"type": "tool", "name": TOOL_NAME},
        "messages": [{"role": "user", "content": user_content}],
    }

    try:
        r = requests.post(
            API_URL,
            headers={
                "x-api-key": ANTHROPIC_API_KEY,
                "anthropic-version": API_VERSION,
                "content-type": "application/json",
            },
            json=body,
            timeout=timeout,
        )
    except requests.RequestException as e:
        logger.warning("anthropic request failed: %s", e)
        return None

    if r.status_code >= 400:
        logger.warning("anthropic %s: %s", r.status_code, r.text[:300])
        return None

    # A 2xx response can still carry a non-JSON body (e.g. a proxy error
    # page). Treat that as a failure instead of letting the decode error
    # escape to the caller.
    try:
        data = r.json()
    except ValueError as e:
        logger.warning("anthropic returned non-JSON body: %s", e)
        return None

    # Be defensive about the payload shape: a non-dict body or a null /
    # non-list "content" must not raise while we look for the tool call.
    content = data.get("content") if isinstance(data, dict) else None
    blocks = content if isinstance(content, list) else []
    for block in blocks:
        if not isinstance(block, dict):
            continue
        if block.get("type") == "tool_use" and block.get("name") == TOOL_NAME:
            return block.get("input") or {}

    logger.warning("anthropic returned no tool_use block: %s", data)
    return None