enrichment: drop LLM for structured info, dedup images by sha + phash

Per user request, the LLM is no longer asked to extract rooms/size/rent/WBS —
those come from the inberlinwohnen.de scraper which is reliable. Haiku is now
used for one narrow job: pick which <img> URLs from the listing page are
actual flat photos (vs. logos, badges, ads, employee portraits). On any LLM
failure the unfiltered candidate list passes through.

Image dedup runs in two tiers:
1. SHA256 of bytes — drops different URLs that point to byte-identical files
2. Perceptual hash (Pillow + imagehash, Hamming distance ≤ 5) — drops the
   "same image at a different resolution" duplicates from srcset / CDN
   variants that were filling galleries with 2–4× copies

UI:
- Wohnungsliste falls back to scraper-only display (rooms/size/rent/wbs)
- Detail panel only shows images + "Zur Original-Anzeige →"; description /
  features / pros & cons / kv table are gone
- Per-row "erneut versuchen" link + the "analysiert…/?" status chips were
  tied to LLM extraction and are removed; the header "Bilder nachladen (N)"
  button still surfaces pending/failed batches for admins

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
EiSiMo 2026-04-21 15:29:55 +02:00
parent 374368e4af
commit 0aa4c6c2bb
6 changed files with 137 additions and 233 deletions

View file

@ -1,12 +1,14 @@
"""Minimal Anthropic Messages API wrapper for flat enrichment.
"""Anthropic Haiku helper — used only to pick which `<img>` URLs on a
listing page are actual photos of the flat (vs. nav icons, badges, ads).
Uses tool-use forced output so Haiku returns structured JSON instead of free
text we'd have to regex. No SDK — plain `requests` is enough here.
If the API key is missing or the call fails, the caller passes the original
candidates straight through, so this is a soft enhancement, not a
dependency.
"""
from __future__ import annotations
import logging
from typing import Any, Optional
from typing import Optional
import requests
@ -17,68 +19,41 @@ logger = logging.getLogger("web.llm")
API_URL = "https://api.anthropic.com/v1/messages"
API_VERSION = "2023-06-01"
TOOL_NAME = "record_flat_details"
TOOL_SCHEMA: dict[str, Any] = {
TOOL_NAME = "select_flat_images"
TOOL_SCHEMA = {
"type": "object",
"properties": {
"address": {"type": ["string", "null"],
"description": "Full street address incl. postcode+city if present"},
"rooms": {"type": ["number", "null"], "description": "Number of rooms (decimal ok)"},
"size_sqm": {"type": ["number", "null"], "description": "Size in m²"},
"rent_cold": {"type": ["number", "null"], "description": "Kaltmiete in €"},
"rent_total": {"type": ["number", "null"], "description": "Warm/Gesamtmiete in €"},
"utilities": {"type": ["number", "null"], "description": "Nebenkosten in €"},
"deposit": {"type": ["number", "null"], "description": "Kaution in €"},
"available_from": {"type": ["string", "null"], "description": "Bezugsfrei ab (text)"},
"floor": {"type": ["string", "null"], "description": "Etage (text, z.B. '3. OG')"},
"heating": {"type": ["string", "null"]},
"energy_certificate": {"type": ["string", "null"]},
"energy_value": {"type": ["string", "null"]},
"year_built": {"type": ["string", "null"]},
"wbs_required": {"type": ["boolean", "null"]},
"wbs_type": {"type": ["string", "null"], "description": "WBS-Typ, z.B. '160' oder null"},
"description": {
"type": ["string", "null"],
"description": "Kurze 2–3-Satz-Beschreibung der Wohnung auf Deutsch. Fakten, keine Werbesprache.",
},
"features": {
"type": "array", "items": {"type": "string"},
"description": "Ausstattungsmerkmale (z.B. 'Balkon', 'Einbauküche', 'Parkett')",
},
"pros": {
"type": "array", "items": {"type": "string"},
"description": "2–4 konkrete Vorteile aus Bewerbersicht (keine Werbung)",
},
"cons": {
"type": "array", "items": {"type": "string"},
"description": "2–4 mögliche Nachteile / Punkte zum Beachten",
"urls": {
"type": "array",
"items": {"type": "string"},
"description": "Subset of the candidate URLs that show the actual flat — "
"interior, exterior, floorplan. Keep ordering of input.",
},
},
"required": [],
"required": ["urls"],
"additionalProperties": False,
}
SYSTEM_PROMPT = (
"Du extrahierst strukturierte Wohnungsdaten aus deutschem HTML-Quelltext von "
"Berliner Wohnungsbaugesellschaften (howoge, gewobag, degewo, gesobau, wbm, "
"stadt-und-land). Antworte AUSSCHLIESSLICH über den bereitgestellten Tool-Call. "
"Fehlende Werte → null. Keine Erfindungen — wenn etwas nicht klar aus dem HTML "
"hervorgeht, lass das Feld null. Zahlen bitte als Zahlen (nicht als String), "
"Beschreibung/Pros/Cons auf Deutsch."
"Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur "
"die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht "
"des Gebäudes, Grundriss). Verwerfe Logos, Icons, Banner, Ads, "
"Bewertungs-Sterne, Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-"
"Pixel. Behalte die Reihenfolge der Input-Liste bei. Antworte "
"ausschließlich über den Tool-Call."
)
def extract_flat_details(html: str, url: str,
max_html_chars: int = 60_000,
timeout: int = 60) -> Optional[dict]:
"""Call Haiku; return the structured dict or None on failure."""
if not ANTHROPIC_API_KEY:
logger.info("skipping enrichment: ANTHROPIC_API_KEY not set")
return None
def select_flat_image_urls(candidates: list[str], page_url: str,
timeout: int = 30) -> list[str]:
"""Return the LLM-filtered subset, or the original list on any failure."""
if not ANTHROPIC_API_KEY or not candidates:
return candidates
user_content = (
f"URL: {url}\n\n"
f"HTML-Quellcode (ggf. gekürzt):\n---\n{html[:max_html_chars]}\n---"
user_text = (
f"Seite: {page_url}\n\n"
"Kandidaten-URLs (nummeriert):\n"
+ "\n".join(f"{i+1}. {u}" for i, u in enumerate(candidates))
)
body = {
"model": ANTHROPIC_MODEL,
@ -86,11 +61,11 @@ def extract_flat_details(html: str, url: str,
"system": SYSTEM_PROMPT,
"tools": [{
"name": TOOL_NAME,
"description": "Persist the extracted flat details.",
"description": "Persist the selected flat-photo URLs.",
"input_schema": TOOL_SCHEMA,
}],
"tool_choice": {"type": "tool", "name": TOOL_NAME},
"messages": [{"role": "user", "content": user_content}],
"messages": [{"role": "user", "content": user_text}],
}
try:
r = requests.post(
@ -104,16 +79,18 @@ def extract_flat_details(html: str, url: str,
timeout=timeout,
)
except requests.RequestException as e:
logger.warning("anthropic request failed: %s", e)
return None
logger.warning("anthropic image-select request failed: %s", e)
return candidates
if r.status_code >= 400:
logger.warning("anthropic %s: %s", r.status_code, r.text[:300])
return None
logger.warning("anthropic image-select %s: %s", r.status_code, r.text[:300])
return candidates
data = r.json()
for block in data.get("content", []):
if block.get("type") == "tool_use" and block.get("name") == TOOL_NAME:
return block.get("input") or {}
logger.warning("anthropic returned no tool_use block: %s", data)
return None
urls = (block.get("input") or {}).get("urls") or []
# Constrain to the original candidate set so the model can't
# invent URLs (it sometimes lightly rewrites them otherwise).
allowed = set(candidates)
return [u for u in urls if u in allowed]
return candidates