lazyflat/web/llm.py
EiSiMo 0aa4c6c2bb enrichment: drop LLM for structured info, dedup images by sha + phash
Per user request, the LLM is no longer asked to extract rooms/size/rent/WBS —
those come from the inberlinwohnen.de scraper which is reliable. Haiku is now
used for one narrow job: pick which <img> URLs from the listing page are
actual flat photos (vs. logos, badges, ads, employee portraits). On any LLM
failure the unfiltered candidate list passes through.

Image dedup runs in two tiers:
1. SHA256 of bytes — drops different URLs that point to byte-identical files
2. Perceptual hash (Pillow + imagehash, Hamming distance ≤ 5) — drops the
   "same image at a different resolution" duplicates from srcset / CDN
   variants that were filling galleries with 2–4× copies

UI:
- Wohnungsliste falls back to scraper-only display (rooms/size/rent/wbs)
- Detail panel only shows images + "Zur Original-Anzeige →"; description /
  features / pros & cons / kv table are gone
- Per-row "erneut versuchen" link + the "analysiert…/?" status chips were
  tied to LLM extraction and are removed; the header "Bilder nachladen (N)"
  button still surfaces pending/failed batches for admins

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 15:29:55 +02:00

96 lines
3.3 KiB
Python

"""Anthropic Haiku helper — used only to pick which `<img>` URLs on a
listing page are actual photos of the flat (vs. nav icons, badges, ads…).
If the API key is missing or the call fails, the original candidates are
returned unchanged, so this is a soft enhancement, not a dependency.
"""
from __future__ import annotations
import logging
from typing import Optional
import requests
from settings import ANTHROPIC_API_KEY, ANTHROPIC_MODEL
# Module logger; the name is fixed rather than derived from __name__.
logger = logging.getLogger("web.llm")

# Anthropic Messages endpoint and the value sent in the `anthropic-version`
# request header.
API_URL = "https://api.anthropic.com/v1/messages"
API_VERSION = "2023-06-01"

# The single tool offered to the model. The request forces the model to call
# it, so the answer always arrives as structured tool input.
TOOL_NAME = "select_flat_images"

# JSON schema of the tool input: one required array of URL strings and
# nothing else.
TOOL_SCHEMA = {
    "type": "object",
    "properties": {
        "urls": {
            "type": "array",
            "items": {"type": "string"},
            "description": (
                "Subset of the candidate URLs that show the actual flat — "
                "interior, exterior, floorplan. Keep ordering of input."
            ),
        },
    },
    "required": ["urls"],
    "additionalProperties": False,
}

# System prompt (German): keep only photos of the flat (interior, building
# exterior, floorplan), discard logos/icons/banners/ads/rating stars/maps/
# staff portraits/tracking pixels, preserve input order, and answer only via
# the tool call.
SYSTEM_PROMPT = (
    "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. "
    "Wähle nur die URLs aus, die ein Foto der Wohnung zeigen "
    "(Innenraum, Außenansicht des Gebäudes, Grundriss). "
    "Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, "
    "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel. "
    "Behalte die Reihenfolge der Input-Liste bei. "
    "Antworte ausschließlich über den Tool-Call."
)
def select_flat_image_urls(candidates: list[str], page_url: str,
                           timeout: int = 30) -> list[str]:
    """Ask the LLM which candidate image URLs show the actual flat.

    The candidates are sent as a numbered list together with the page URL,
    and the model is forced to answer through the ``TOOL_NAME`` tool.

    Args:
        candidates: Image URLs collected from the listing page.
        page_url: URL of the listing page, passed to the model as context.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The URLs the model selected, in the order it returned them and
        restricted to members of ``candidates``. ``candidates`` itself is
        returned unchanged when no API key is configured, the list is empty,
        the request raises, the API answers with a status >= 400, the body is
        not valid JSON, the payload has an unexpected shape, or the response
        contains no matching tool call.
    """
    if not ANTHROPIC_API_KEY or not candidates:
        return candidates

    user_text = (
        f"Seite: {page_url}\n\n"
        "Kandidaten-URLs (nummeriert):\n"
        + "\n".join(f"{i+1}. {u}" for i, u in enumerate(candidates))
    )
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 1500,
        "system": SYSTEM_PROMPT,
        "tools": [{
            "name": TOOL_NAME,
            "description": "Persist the selected flat-photo URLs.",
            "input_schema": TOOL_SCHEMA,
        }],
        # Force the tool call so the answer is structured, never free text.
        "tool_choice": {"type": "tool", "name": TOOL_NAME},
        "messages": [{"role": "user", "content": user_text}],
    }

    try:
        r = requests.post(
            API_URL,
            headers={
                "x-api-key": ANTHROPIC_API_KEY,
                "anthropic-version": API_VERSION,
                "content-type": "application/json",
            },
            json=body,
            timeout=timeout,
        )
    except requests.RequestException as e:
        logger.warning("anthropic image-select request failed: %s", e)
        return candidates

    if r.status_code >= 400:
        logger.warning("anthropic image-select %s: %s", r.status_code, r.text[:300])
        return candidates

    # A 2xx response can still carry a non-JSON body (e.g. an intermediary's
    # HTML page). Every JSON decode error requests may raise is a ValueError.
    try:
        data = r.json()
    except ValueError as e:
        logger.warning("anthropic image-select returned invalid JSON: %s", e)
        return candidates

    content = data.get("content") if isinstance(data, dict) else None
    if not isinstance(content, list):
        # Missing, null or malformed content: nothing to pick from.
        return candidates

    # Constrain to the original candidate set so the model can't invent
    # URLs (it sometimes lightly rewrites them otherwise).
    allowed = set(candidates)
    for block in content:
        if not isinstance(block, dict):
            continue
        if block.get("type") != "tool_use" or block.get("name") != TOOL_NAME:
            continue
        tool_input = block.get("input") or {}
        if not isinstance(tool_input, dict):
            logger.warning("anthropic image-select: malformed tool input")
            return candidates
        urls = tool_input.get("urls") or []
        if not isinstance(urls, list):
            logger.warning("anthropic image-select: 'urls' is not a list")
            return candidates
        # Skip non-string entries; unhashable ones would otherwise raise on
        # the set membership test.
        return [u for u in urls if isinstance(u, str) and u in allowed]

    return candidates