enrichment: Haiku flat details + image gallery on expand
apply service
- POST /internal/fetch-listing: headless Playwright fetch of a listing URL,
returns {html, image_urls[], final_url}. Uses the same browser
fingerprint/profile as the apply run so bot guards don't kick in
web service
- New enrichment pipeline (web/enrichment.py):
/internal/flats → upsert → kick() enrichment in a background thread
1. POST /internal/fetch-listing on apply
2. llm.extract_flat_details(html, url) — Haiku tool-use call returns
structured JSON (address, rooms, rent, description, pros/cons, etc.)
3. Download each image directly to /data/flats/<slug>/NN.<ext>
4. Persist enrichment_json + image_count + enrichment_status on the flat
- llm.py: minimal Anthropic /v1/messages wrapper, no SDK
- DB migration v5 adds enrichment_json/_status/_updated_at + image_count
- Admin "Altbestand anreichern" button (POST /actions/enrich-all) queues
backfill for all pending/failed rows; runs in a detached task
- GET /partials/wohnung/<id> renders _wohnung_detail.html
- GET /flat-images/<slug>/<n> serves the downloaded image
UI
- Chevron on each list row toggles an inline detail pane (HTMX fetch on
first open, hx-preserve keeps it open across the 3–30 s polls)
- CSS .flat-gallery normalises image tiles to a 4/3 aspect with object-fit:
cover so different source sizes align cleanly
- "analysiert…" / "?" chips on the list reflect enrichment_status
Config
- ANTHROPIC_API_KEY + ANTHROPIC_MODEL wired into docker-compose's web
service (default model: claude-haiku-4-5-20251001)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2609d3504a
commit
eb66284172
11 changed files with 688 additions and 44 deletions
119
web/llm.py
Normal file
119
web/llm.py
Normal file
|
|
@ -0,0 +1,119 @@
|
|||
"""Minimal Anthropic Messages API wrapper for flat enrichment.
|
||||
|
||||
Uses tool-use forced output so Haiku returns structured JSON instead of free
|
||||
text we'd have to regex. No SDK — plain `requests` is enough here.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests
|
||||
|
||||
from settings import ANTHROPIC_API_KEY, ANTHROPIC_MODEL
|
||||
|
||||
# Anthropic Messages endpoint, plus the API version that is sent along in the
# "anthropic-version" request header.
API_URL = "https://api.anthropic.com/v1/messages"
API_VERSION = "2023-06-01"

# Name of the single tool offered to the model; the same name is used to force
# the tool call and to pick the matching block out of the response.
TOOL_NAME = "record_flat_details"

# Module logger under the fixed "web.llm" name.
logger = logging.getLogger("web.llm")
|
||||
def _nullable(json_type: str, description: Optional[str] = None) -> dict[str, Any]:
    """Return a property schema that accepts ``json_type`` or null.

    The ``description`` key is only emitted when a description is given.
    """
    prop: dict[str, Any] = {"type": [json_type, "null"]}
    if description is not None:
        prop["description"] = description
    return prop


def _string_list(description: str) -> dict[str, Any]:
    """Return a property schema for an array of strings."""
    return {
        "type": "array",
        "items": {"type": "string"},
        "description": description,
    }


# JSON Schema handed to the API as the tool's ``input_schema``. Every scalar
# field may be null, no field is required, and unknown keys are rejected.
TOOL_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "address": _nullable(
            "string", "Full street address incl. postcode+city if present"
        ),
        "rooms": _nullable("number", "Number of rooms (decimal ok)"),
        "size_sqm": _nullable("number", "Size in m²"),
        "rent_cold": _nullable("number", "Kaltmiete in €"),
        "rent_total": _nullable("number", "Warm/Gesamtmiete in €"),
        "utilities": _nullable("number", "Nebenkosten in €"),
        "deposit": _nullable("number", "Kaution in €"),
        "available_from": _nullable("string", "Bezugsfrei ab (text)"),
        "floor": _nullable("string", "Etage (text, z.B. '3. OG')"),
        "heating": _nullable("string"),
        "energy_certificate": _nullable("string"),
        "energy_value": _nullable("string"),
        "year_built": _nullable("string"),
        "wbs_required": _nullable("boolean"),
        "wbs_type": _nullable("string", "WBS-Typ, z.B. '160' oder null"),
        "description": _nullable(
            "string",
            "Kurze 2–3-Satz-Beschreibung der Wohnung auf Deutsch. Fakten, keine Werbesprache.",
        ),
        "features": _string_list(
            "Ausstattungsmerkmale (z.B. 'Balkon', 'Einbauküche', 'Parkett')"
        ),
        "pros": _string_list(
            "2–4 konkrete Vorteile aus Bewerbersicht (keine Werbung)"
        ),
        "cons": _string_list("2–4 mögliche Nachteile / Punkte zum Beachten"),
    },
    "required": [],
    "additionalProperties": False,
}
|
||||
|
||||
# System prompt (German, sent verbatim as the request's "system" field):
# extract structured flat data from listing HTML, answer only through the tool
# call, use null for anything missing or unclear, and emit numbers as numbers.
# Kept as one sentence per entry; the sentences are joined by single spaces.
SYSTEM_PROMPT = " ".join((
    "Du extrahierst strukturierte Wohnungsdaten aus deutschem HTML-Quelltext von"
    " Berliner Wohnungsbaugesellschaften (howoge, gewobag, degewo, gesobau, wbm,"
    " stadt-und-land).",
    "Antworte AUSSCHLIESSLICH über den bereitgestellten Tool-Call.",
    "Fehlende Werte → null.",
    "Keine Erfindungen — wenn etwas nicht klar aus dem HTML hervorgeht,"
    " lass das Feld null.",
    "Zahlen bitte als Zahlen (nicht als String),"
    " Beschreibung/Pros/Cons auf Deutsch.",
))
|
||||
|
||||
|
||||
def extract_flat_details(html: str, url: str,
                         max_html_chars: int = 60_000,
                         timeout: int = 60) -> Optional[dict]:
    """Ask the model to extract structured flat details from listing HTML.

    Sends one Messages API request that forces a call to the ``TOOL_NAME``
    tool, so the answer arrives as structured tool input rather than free text.

    Args:
        html: Raw HTML of the listing page. Only the first ``max_html_chars``
            characters are sent.
        url: Listing URL, included in the prompt as context for the model.
        max_html_chars: Upper bound on the number of HTML characters sent.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The tool-call input as a dict (an empty dict if the model supplied an
        empty input), or ``None`` when the API key is not configured, the
        request fails, the API answers with a status >= 400, the body is not
        valid JSON, or no matching ``tool_use`` block is present.
    """
    if not ANTHROPIC_API_KEY:
        logger.info("skipping enrichment: ANTHROPIC_API_KEY not set")
        return None

    user_content = (
        f"URL: {url}\n\n"
        f"HTML-Quellcode (ggf. gekürzt):\n---\n{html[:max_html_chars]}\n---"
    )
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 1500,
        "system": SYSTEM_PROMPT,
        "tools": [{
            "name": TOOL_NAME,
            "description": "Persist the extracted flat details.",
            "input_schema": TOOL_SCHEMA,
        }],
        # Force the model to answer through the tool instead of plain text.
        "tool_choice": {"type": "tool", "name": TOOL_NAME},
        "messages": [{"role": "user", "content": user_content}],
    }
    try:
        r = requests.post(
            API_URL,
            headers={
                "x-api-key": ANTHROPIC_API_KEY,
                "anthropic-version": API_VERSION,
                "content-type": "application/json",
            },
            json=body,
            timeout=timeout,
        )
    except requests.RequestException as e:
        logger.warning("anthropic request failed: %s", e)
        return None

    if r.status_code >= 400:
        logger.warning("anthropic %s: %s", r.status_code, r.text[:300])
        return None

    # A 2xx answer is not guaranteed to carry JSON (e.g. an intermediary's
    # HTML page). Decoding errors are ValueError subclasses; treat them like
    # any other failure instead of letting them escape to the caller.
    try:
        data = r.json()
    except ValueError as e:
        logger.warning("anthropic returned invalid JSON: %s", e)
        return None

    # Tolerate unexpected payload shapes: only walk "content" when it really
    # is a list, and only inspect entries that are dicts.
    content = data.get("content") if isinstance(data, dict) else None
    if isinstance(content, list):
        for block in content:
            if (isinstance(block, dict)
                    and block.get("type") == "tool_use"
                    and block.get("name") == TOOL_NAME):
                return block.get("input") or {}
    logger.warning("anthropic returned no tool_use block: %s", data)
    return None
|
||||
Loading…
Add table
Add a link
Reference in a new issue