enrichment: drop LLM for structured info, dedup images by sha + phash
Per user request, the LLM is no longer asked to extract rooms/size/rent/WBS — those come from the inberlinwohnen.de scraper, which is reliable. Haiku is now used for one narrow job: picking which <img> URLs from the listing page are actual flat photos (vs. logos, badges, ads, employee portraits). On any LLM failure the unfiltered candidate list passes through.

Image dedup runs in two tiers:
1. SHA256 of bytes — drops different URLs that point to byte-identical files
2. Perceptual hash (Pillow + imagehash, Hamming distance ≤ 5) — drops the "same image at a different resolution" duplicates from srcset / CDN variants that were filling galleries with 2–4× copies

UI:
- Wohnungsliste falls back to scraper-only display (rooms/size/rent/wbs)
- Detail panel only shows images + "Zur Original-Anzeige →"; description / features / pros & cons / kv table are gone
- Per-row "erneut versuchen" link + the "analysiert…/?" status chips were tied to LLM extraction and are removed; the header "Bilder nachladen (N)" button still surfaces pending/failed batches for admins

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
374368e4af
commit
0aa4c6c2bb
6 changed files with 137 additions and 233 deletions
15
web/app.py
15
web/app.py
|
|
@ -403,13 +403,7 @@ def _wohnungen_context(user) -> dict:
|
||||||
}, filters):
|
}, filters):
|
||||||
continue
|
continue
|
||||||
last = db.last_application_for_flat(uid, f["id"])
|
last = db.last_application_for_flat(uid, f["id"])
|
||||||
enrichment_data = None
|
flats_view.append({"row": f, "last": last})
|
||||||
if f["enrichment_json"]:
|
|
||||||
try:
|
|
||||||
enrichment_data = json.loads(f["enrichment_json"])
|
|
||||||
except Exception:
|
|
||||||
enrichment_data = None
|
|
||||||
flats_view.append({"row": f, "last": last, "enrichment": enrichment_data})
|
|
||||||
|
|
||||||
rejected_view = db.rejected_flats(uid)
|
rejected_view = db.rejected_flats(uid)
|
||||||
enrichment_counts = db.enrichment_counts()
|
enrichment_counts = db.enrichment_counts()
|
||||||
|
|
@ -489,12 +483,6 @@ def partial_wohnung_detail(request: Request, flat_id: str, user=Depends(require_
|
||||||
flat = db.get_flat(flat_id)
|
flat = db.get_flat(flat_id)
|
||||||
if not flat:
|
if not flat:
|
||||||
raise HTTPException(404)
|
raise HTTPException(404)
|
||||||
enrichment_data = None
|
|
||||||
if flat["enrichment_json"]:
|
|
||||||
try:
|
|
||||||
enrichment_data = json.loads(flat["enrichment_json"])
|
|
||||||
except Exception:
|
|
||||||
enrichment_data = None
|
|
||||||
slug = enrichment.flat_slug(flat_id)
|
slug = enrichment.flat_slug(flat_id)
|
||||||
image_urls = [
|
image_urls = [
|
||||||
f"/flat-images/{slug}/{i}"
|
f"/flat-images/{slug}/{i}"
|
||||||
|
|
@ -503,7 +491,6 @@ def partial_wohnung_detail(request: Request, flat_id: str, user=Depends(require_
|
||||||
ctx = {
|
ctx = {
|
||||||
"request": request,
|
"request": request,
|
||||||
"flat": flat,
|
"flat": flat,
|
||||||
"enrichment": enrichment_data,
|
|
||||||
"enrichment_status": flat["enrichment_status"],
|
"enrichment_status": flat["enrichment_status"],
|
||||||
"image_urls": image_urls,
|
"image_urls": image_urls,
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,19 +1,20 @@
|
||||||
"""Flat-enrichment pipeline.
|
"""Image enrichment pipeline.
|
||||||
|
|
||||||
For each new flat we:
|
For each new flat we:
|
||||||
1. Ask the apply service to fetch the listing via Playwright (bypasses bot guards)
|
1. Ask the apply service to fetch the listing via Playwright (bypasses bot guards)
|
||||||
2. Feed the HTML to Haiku via `llm.extract_flat_details` → structured dict
|
2. Optionally let Haiku narrow the <img> URL list down to actual flat photos
|
||||||
3. Download each image URL directly into /data/flats/<slug>/NN.<ext>
|
3. Download the images into /data/flats/<slug>/NN.<ext>, deduplicating by
|
||||||
4. Persist result on the flat row (enrichment_json + image_count + status)
|
exact bytes (SHA256) and visual similarity (perceptual hash)
|
||||||
|
|
||||||
Kicked as a detached asyncio task from /internal/flats so scraping stays fast.
|
Structured fields (rooms / size / rent / WBS) come from the inberlinwohnen.de
|
||||||
Every failure is caught, stashed in enrichment_json as {"_error": "...", ...}
|
scraper and are not re-derived by the LLM. Failures land in
|
||||||
and mirrored into the errors log so /logs/protokoll explains what went wrong.
|
enrichment_status='failed' with the reason in enrichment_json._error.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import hashlib
|
import hashlib
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
|
|
@ -33,12 +34,12 @@ IMAGES_DIR = DATA_DIR / "flats"
|
||||||
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
|
IMAGES_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
MAX_IMAGES = 12
|
MAX_IMAGES = 12
|
||||||
MAX_IMAGE_BYTES = 3_000_000 # 3 MB per image
|
MAX_IMAGE_BYTES = 5_000_000
|
||||||
IMAGE_TIMEOUT = 15
|
IMAGE_TIMEOUT = 15
|
||||||
|
PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → considered the same picture
|
||||||
|
|
||||||
|
|
||||||
class EnrichmentError(Exception):
|
class EnrichmentError(Exception):
|
||||||
"""Raised by each pipeline step with a human-readable reason."""
|
|
||||||
def __init__(self, step: str, reason: str):
|
def __init__(self, step: str, reason: str):
|
||||||
self.step = step
|
self.step = step
|
||||||
self.reason = reason
|
self.reason = reason
|
||||||
|
|
@ -46,7 +47,6 @@ class EnrichmentError(Exception):
|
||||||
|
|
||||||
|
|
||||||
def flat_slug(flat_id: str) -> str:
    """Return a short, fixed-length hex identifier derived from ``flat_id``.

    The result is the first 16 hex characters of the SHA-1 digest of the
    UTF-8 encoded id, so it contains only ``0-9a-f`` regardless of which
    characters the id itself holds.
    """
    digest = hashlib.sha1(flat_id.encode("utf-8")).hexdigest()
    return digest[:16]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -77,25 +77,50 @@ def _fetch_listing(url: str) -> dict:
|
||||||
raise EnrichmentError("fetch", "apply returned non-JSON response")
|
raise EnrichmentError("fetch", "apply returned non-JSON response")
|
||||||
|
|
||||||
|
|
||||||
def _ext_from_response(resp: requests.Response, url: str) -> str:
|
def _ext_for(content_type: str, url: str) -> str:
    """Pick a file extension (including the leading dot) for a downloaded image.

    Args:
        content_type: Raw Content-Type header value; anything after the first
            ``;`` (e.g. a charset parameter) is ignored.
        url: URL the image was fetched from; the extension of its path is the
            fallback when the content type yields nothing.

    Returns:
        The extension ``mimetypes`` knows for the content type, otherwise the
        lower-cased extension of the URL path, otherwise ``".jpg"``.
    """
    mime = content_type.split(";")[0].strip().lower()
    if mime:
        guessed = mimetypes.guess_extension(mime)
        if guessed:
            # Some mimetypes tables answer ".jpe" for image/jpeg. Compare the
            # whole extension: a substring replace would also rewrite ".jpeg"
            # into the bogus ".jpgg".
            return ".jpg" if guessed == ".jpe" else guessed
    ext = os.path.splitext(urlparse(url).path)[1]
    return ext.lower() or ".jpg"
|
||||||
|
|
||||||
|
|
||||||
|
def _phash(data: bytes):
    """Compute the perceptual hash of an encoded image.

    Returns the ``imagehash.ImageHash`` for ``data``, or ``None`` when Pillow
    or imagehash cannot be imported, or when opening, loading or hashing the
    image raises.
    """
    # Imported lazily so a missing optional dependency yields None here
    # instead of an ImportError from this function.
    try:
        from PIL import Image
        import imagehash
    except ImportError:
        return None

    try:
        with Image.open(io.BytesIO(data)) as picture:
            picture.load()
            digest = imagehash.phash(picture)
    except Exception:
        # Best effort: any failure is reported to the caller as None.
        return None
    return digest
|
||||||
|
|
||||||
|
|
||||||
def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
|
def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
|
||||||
|
"""Download up to MAX_IMAGES distinct flat photos. Dedup tiers:
|
||||||
|
1. Skip URLs that fail / aren't images
|
||||||
|
2. Skip exact byte-equal duplicates (different URL pointing to same file)
|
||||||
|
3. Skip visually-equivalent images (same picture re-encoded at a
|
||||||
|
different size — common with srcset/CDN variants)
|
||||||
|
"""
|
||||||
d = flat_image_dir(flat_id)
|
d = flat_image_dir(flat_id)
|
||||||
for old in d.iterdir():
|
for old in d.iterdir():
|
||||||
try: old.unlink()
|
try: old.unlink()
|
||||||
except OSError: pass
|
except OSError: pass
|
||||||
|
|
||||||
|
seen_sha: set[str] = set()
|
||||||
|
seen_phash: list = [] # list of imagehash objects
|
||||||
|
|
||||||
saved = 0
|
saved = 0
|
||||||
for raw_url in urls[:MAX_IMAGES]:
|
for raw_url in urls:
|
||||||
|
if saved >= MAX_IMAGES:
|
||||||
|
break
|
||||||
try:
|
try:
|
||||||
r = requests.get(
|
r = requests.get(
|
||||||
raw_url,
|
raw_url,
|
||||||
|
|
@ -109,48 +134,53 @@ def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
|
||||||
ct = r.headers.get("content-type", "").split(";")[0].strip().lower()
|
ct = r.headers.get("content-type", "").split(";")[0].strip().lower()
|
||||||
if not ct.startswith("image/"):
|
if not ct.startswith("image/"):
|
||||||
continue
|
continue
|
||||||
ext = _ext_from_response(r, raw_url)
|
data = bytearray()
|
||||||
path = d / f"{saved + 1:02d}{ext}"
|
|
||||||
total = 0
|
|
||||||
with open(path, "wb") as f:
|
|
||||||
for chunk in r.iter_content(chunk_size=65_536):
|
for chunk in r.iter_content(chunk_size=65_536):
|
||||||
if not chunk:
|
if not chunk:
|
||||||
continue
|
continue
|
||||||
total += len(chunk)
|
data.extend(chunk)
|
||||||
if total > MAX_IMAGE_BYTES:
|
if len(data) > MAX_IMAGE_BYTES:
|
||||||
break
|
break
|
||||||
f.write(chunk)
|
if not data:
|
||||||
if total == 0:
|
|
||||||
path.unlink(missing_ok=True)
|
|
||||||
continue
|
continue
|
||||||
saved += 1
|
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
logger.info("image download failed %s: %s", raw_url, e)
|
logger.info("image download failed %s: %s", raw_url, e)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
data_bytes = bytes(data)
|
||||||
|
|
||||||
|
sha = hashlib.sha256(data_bytes).hexdigest()
|
||||||
|
if sha in seen_sha:
|
||||||
|
continue
|
||||||
|
seen_sha.add(sha)
|
||||||
|
|
||||||
|
ph = _phash(data_bytes)
|
||||||
|
if ph is not None:
|
||||||
|
if any((ph - prev) <= PHASH_DUPLICATE_THRESHOLD for prev in seen_phash):
|
||||||
|
continue
|
||||||
|
seen_phash.append(ph)
|
||||||
|
|
||||||
|
ext = _ext_for(ct, raw_url)
|
||||||
|
path = d / f"{saved + 1:02d}{ext}"
|
||||||
|
path.write_bytes(data_bytes)
|
||||||
|
saved += 1
|
||||||
|
|
||||||
return saved
|
return saved
|
||||||
|
|
||||||
|
|
||||||
def enrich_flat_sync(flat_id: str) -> None:
|
def enrich_flat_sync(flat_id: str) -> None:
|
||||||
"""Run the full enrichment pipeline for one flat. Blocking."""
|
|
||||||
flat = db.get_flat(flat_id)
|
flat = db.get_flat(flat_id)
|
||||||
if not flat:
|
if not flat:
|
||||||
return
|
return
|
||||||
url = flat["link"]
|
url = flat["link"]
|
||||||
logger.info("enrich start flat=%s url=%s", flat_id, url)
|
logger.info("enrich start flat=%s", flat_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
listing = _fetch_listing(url)
|
listing = _fetch_listing(url)
|
||||||
html = listing.get("html") or ""
|
candidates = listing.get("image_urls") or []
|
||||||
final_url = listing.get("final_url") or url
|
if candidates:
|
||||||
if not html.strip():
|
candidates = llm.select_flat_image_urls(candidates, listing.get("final_url") or url)
|
||||||
raise EnrichmentError("fetch", "apply returned empty HTML")
|
image_count = _download_images(flat_id, candidates, referer=url)
|
||||||
|
|
||||||
details = llm.extract_flat_details(html, final_url)
|
|
||||||
if details is None:
|
|
||||||
raise EnrichmentError("llm", "model returned no tool_use or call failed (see web logs)")
|
|
||||||
|
|
||||||
image_urls = listing.get("image_urls") or []
|
|
||||||
image_count = _download_images(flat_id, image_urls, referer=url)
|
|
||||||
except EnrichmentError as e:
|
except EnrichmentError as e:
|
||||||
_record_failure(flat_id, e.step, e.reason)
|
_record_failure(flat_id, e.step, e.reason)
|
||||||
return
|
return
|
||||||
|
|
@ -159,7 +189,7 @@ def enrich_flat_sync(flat_id: str) -> None:
|
||||||
_record_failure(flat_id, "crash", f"{type(e).__name__}: {e}")
|
_record_failure(flat_id, "crash", f"{type(e).__name__}: {e}")
|
||||||
return
|
return
|
||||||
|
|
||||||
db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count)
|
db.set_flat_enrichment(flat_id, "ok", enrichment=None, image_count=image_count)
|
||||||
logger.info("enrich done flat=%s images=%d", flat_id, image_count)
|
logger.info("enrich done flat=%s images=%d", flat_id, image_count)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -179,7 +209,6 @@ def _record_failure(flat_id: str, step: str, reason: str) -> None:
|
||||||
|
|
||||||
|
|
||||||
# Strong references to in-flight enrichment tasks; entries remove themselves
# once the task is done.
_BACKGROUND_TASKS: set = set()


def kick(flat_id: str) -> None:
    """Start enrichment for ``flat_id`` in a worker thread without waiting.

    Must be called while an asyncio event loop is running, because
    ``asyncio.create_task`` schedules onto the running loop (it raises
    ``RuntimeError`` otherwise).
    """
    task = asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id))
    # The event loop keeps only weak references to tasks, so hold one here;
    # otherwise the task may be garbage-collected before it finishes.
    _BACKGROUND_TASKS.add(task)
    task.add_done_callback(_BACKGROUND_TASKS.discard)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
105
web/llm.py
105
web/llm.py
|
|
@ -1,12 +1,14 @@
|
||||||
"""Minimal Anthropic Messages API wrapper for flat enrichment.
|
"""Anthropic Haiku helper — used only to pick which `<img>` URLs on a
|
||||||
|
listing page are actual photos of the flat (vs. nav icons, badges, ads…).
|
||||||
|
|
||||||
Uses tool-use forced output so Haiku returns structured JSON instead of free
|
If the API key is missing or the call fails, the caller passes the original
|
||||||
text we'd have to regex. No SDK — plain `requests` is enough here.
|
candidates straight through, so this is a soft enhancement, not a
|
||||||
|
dependency.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Any, Optional
|
from typing import Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
@ -17,68 +19,41 @@ logger = logging.getLogger("web.llm")
|
||||||
API_URL = "https://api.anthropic.com/v1/messages"
|
API_URL = "https://api.anthropic.com/v1/messages"
|
||||||
API_VERSION = "2023-06-01"
|
API_VERSION = "2023-06-01"
|
||||||
|
|
||||||
TOOL_NAME = "record_flat_details"
|
# Name of the single tool the model is forced to call.
TOOL_NAME = "select_flat_images"

# JSON schema for the tool input: one required array of URL strings.
_URLS_PROPERTY = {
    "type": "array",
    "items": {"type": "string"},
    "description": (
        "Subset of the candidate URLs that show the actual flat — interior, "
        "exterior, floorplan. Keep ordering of input."
    ),
}
TOOL_SCHEMA = {
    "type": "object",
    "properties": {"urls": _URLS_PROPERTY},
    "required": ["urls"],
    "additionalProperties": False,
}

# German system prompt: keep only real flat photos, preserve input order,
# answer exclusively through the tool call.
SYSTEM_PROMPT = (
    "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. "
    "Wähle nur die URLs aus, die ein Foto der Wohnung zeigen "
    "(Innenraum, Außenansicht des Gebäudes, Grundriss). "
    "Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, "
    "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel. "
    "Behalte die Reihenfolge der Input-Liste bei. "
    "Antworte ausschließlich über den Tool-Call."
)
|
||||||
|
|
||||||
|
|
||||||
def extract_flat_details(html: str, url: str,
|
def select_flat_image_urls(candidates: list[str], page_url: str,
|
||||||
max_html_chars: int = 60_000,
|
timeout: int = 30) -> list[str]:
|
||||||
timeout: int = 60) -> Optional[dict]:
|
"""Return the LLM-filtered subset, or the original list on any failure."""
|
||||||
"""Call Haiku; return the structured dict or None on failure."""
|
if not ANTHROPIC_API_KEY or not candidates:
|
||||||
if not ANTHROPIC_API_KEY:
|
return candidates
|
||||||
logger.info("skipping enrichment: ANTHROPIC_API_KEY not set")
|
|
||||||
return None
|
|
||||||
|
|
||||||
user_content = (
|
user_text = (
|
||||||
f"URL: {url}\n\n"
|
f"Seite: {page_url}\n\n"
|
||||||
f"HTML-Quellcode (ggf. gekürzt):\n---\n{html[:max_html_chars]}\n---"
|
"Kandidaten-URLs (nummeriert):\n"
|
||||||
|
+ "\n".join(f"{i+1}. {u}" for i, u in enumerate(candidates))
|
||||||
)
|
)
|
||||||
body = {
|
body = {
|
||||||
"model": ANTHROPIC_MODEL,
|
"model": ANTHROPIC_MODEL,
|
||||||
|
|
@ -86,11 +61,11 @@ def extract_flat_details(html: str, url: str,
|
||||||
"system": SYSTEM_PROMPT,
|
"system": SYSTEM_PROMPT,
|
||||||
"tools": [{
|
"tools": [{
|
||||||
"name": TOOL_NAME,
|
"name": TOOL_NAME,
|
||||||
"description": "Persist the extracted flat details.",
|
"description": "Persist the selected flat-photo URLs.",
|
||||||
"input_schema": TOOL_SCHEMA,
|
"input_schema": TOOL_SCHEMA,
|
||||||
}],
|
}],
|
||||||
"tool_choice": {"type": "tool", "name": TOOL_NAME},
|
"tool_choice": {"type": "tool", "name": TOOL_NAME},
|
||||||
"messages": [{"role": "user", "content": user_content}],
|
"messages": [{"role": "user", "content": user_text}],
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
r = requests.post(
|
r = requests.post(
|
||||||
|
|
@ -104,16 +79,18 @@ def extract_flat_details(html: str, url: str,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
)
|
)
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
logger.warning("anthropic request failed: %s", e)
|
logger.warning("anthropic image-select request failed: %s", e)
|
||||||
return None
|
return candidates
|
||||||
|
|
||||||
if r.status_code >= 400:
|
if r.status_code >= 400:
|
||||||
logger.warning("anthropic %s: %s", r.status_code, r.text[:300])
|
logger.warning("anthropic image-select %s: %s", r.status_code, r.text[:300])
|
||||||
return None
|
return candidates
|
||||||
|
|
||||||
data = r.json()
|
data = r.json()
|
||||||
for block in data.get("content", []):
|
for block in data.get("content", []):
|
||||||
if block.get("type") == "tool_use" and block.get("name") == TOOL_NAME:
|
if block.get("type") == "tool_use" and block.get("name") == TOOL_NAME:
|
||||||
return block.get("input") or {}
|
urls = (block.get("input") or {}).get("urls") or []
|
||||||
logger.warning("anthropic returned no tool_use block: %s", data)
|
# Constrain to the original candidate set so the model can't
|
||||||
return None
|
# invent URLs (it sometimes lightly rewrites them otherwise).
|
||||||
|
allowed = set(candidates)
|
||||||
|
return [u for u in urls if u in allowed]
|
||||||
|
return candidates
|
||||||
|
|
|
||||||
|
|
@ -6,3 +6,5 @@ itsdangerous==2.2.0
|
||||||
python-multipart==0.0.17
|
python-multipart==0.0.17
|
||||||
python-dotenv==1.0.1
|
python-dotenv==1.0.1
|
||||||
requests==2.32.5
|
requests==2.32.5
|
||||||
|
Pillow==11.0.0
|
||||||
|
ImageHash==4.3.1
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,13 @@
|
||||||
{# Expanded detail for a single flat, loaded into #flat-detail-<id> via HTMX. #}
|
{# Expanded detail for a single flat — only shows downloaded images. #}
|
||||||
{% if enrichment_status == 'pending' %}
|
{% if enrichment_status == 'pending' %}
|
||||||
<div class="px-4 py-5 text-sm text-slate-500">Analyse läuft – kommt in wenigen Augenblicken zurück…</div>
|
<div class="px-4 py-5 text-sm text-slate-500">Bilder werden abgerufen…</div>
|
||||||
{% elif enrichment_status == 'failed' %}
|
{% elif enrichment_status == 'failed' %}
|
||||||
<div class="px-4 py-5 text-sm text-slate-500">
|
<div class="px-4 py-5 text-sm text-slate-500">
|
||||||
Detail-Analyse konnte nicht abgerufen werden.
|
Bilder konnten nicht geladen werden.
|
||||||
<a href="{{ flat.link }}" target="_blank" rel="noopener">Zur Original-Anzeige →</a>
|
<a href="{{ flat.link }}" target="_blank" rel="noopener" class="ml-1">Zur Original-Anzeige →</a>
|
||||||
</div>
|
</div>
|
||||||
{% else %}
|
{% elif image_urls %}
|
||||||
<div class="px-4 py-4 space-y-4">
|
<div class="px-4 py-4 space-y-3">
|
||||||
{% if image_urls %}
|
|
||||||
<div class="flat-gallery">
|
<div class="flat-gallery">
|
||||||
{% for src in image_urls %}
|
{% for src in image_urls %}
|
||||||
<a class="flat-gallery-tile" href="{{ src }}" target="_blank" rel="noopener">
|
<a class="flat-gallery-tile" href="{{ src }}" target="_blank" rel="noopener">
|
||||||
|
|
@ -16,67 +15,13 @@
|
||||||
</a>
|
</a>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
{% if enrichment and enrichment.description %}
|
|
||||||
<p class="text-sm text-slate-700">{{ enrichment.description }}</p>
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
{% if enrichment %}
|
|
||||||
<div class="grid grid-cols-2 md:grid-cols-3 gap-x-6 gap-y-1.5 text-xs">
|
|
||||||
{% macro kv(label, value) %}
|
|
||||||
{% if value is not none and value != '' %}
|
|
||||||
<div class="flex justify-between gap-3 border-b border-soft py-1">
|
|
||||||
<span class="text-slate-500">{{ label }}</span>
|
|
||||||
<span class="text-slate-800 text-right">{{ value }}</span>
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
{% endmacro %}
|
|
||||||
{{ kv('Adresse', enrichment.address) }}
|
|
||||||
{{ kv('Zimmer', enrichment.rooms) }}
|
|
||||||
{{ kv('Größe', enrichment.size_sqm ~ ' m²' if enrichment.size_sqm else none) }}
|
|
||||||
{{ kv('Kaltmiete', enrichment.rent_cold ~ ' €' if enrichment.rent_cold else none) }}
|
|
||||||
{{ kv('Nebenkosten', enrichment.utilities ~ ' €' if enrichment.utilities else none) }}
|
|
||||||
{{ kv('Gesamtmiete', enrichment.rent_total ~ ' €' if enrichment.rent_total else none) }}
|
|
||||||
{{ kv('Kaution', enrichment.deposit ~ ' €' if enrichment.deposit else none) }}
|
|
||||||
{{ kv('Bezugsfrei ab', enrichment.available_from) }}
|
|
||||||
{{ kv('Etage', enrichment.floor) }}
|
|
||||||
{{ kv('Heizung', enrichment.heating) }}
|
|
||||||
{{ kv('Energieausweis', enrichment.energy_certificate) }}
|
|
||||||
{{ kv('Energiewert', enrichment.energy_value) }}
|
|
||||||
{{ kv('Baujahr', enrichment.year_built) }}
|
|
||||||
{{ kv('WBS', 'erforderlich' if enrichment.wbs_required else ('nicht erforderlich' if enrichment.wbs_required == false else none)) }}
|
|
||||||
{{ kv('WBS-Typ', enrichment.wbs_type) }}
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
{% if enrichment and enrichment.features %}
|
|
||||||
<div class="flex flex-wrap gap-1.5">
|
|
||||||
{% for f in enrichment.features %}<span class="chip chip-info">{{ f }}</span>{% endfor %}
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
|
|
||||||
<div class="grid grid-cols-1 md:grid-cols-2 gap-4">
|
|
||||||
{% if enrichment and enrichment.pros %}
|
|
||||||
<div>
|
|
||||||
<div class="text-xs uppercase tracking-wide text-slate-500 mb-1">Pro</div>
|
|
||||||
<ul class="text-sm space-y-1">
|
|
||||||
{% for p in enrichment.pros %}<li>+ {{ p }}</li>{% endfor %}
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
{% if enrichment and enrichment.cons %}
|
|
||||||
<div>
|
|
||||||
<div class="text-xs uppercase tracking-wide text-slate-500 mb-1">Contra</div>
|
|
||||||
<ul class="text-sm space-y-1">
|
|
||||||
{% for c in enrichment.cons %}<li>− {{ c }}</li>{% endfor %}
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="text-xs">
|
<div class="text-xs">
|
||||||
<a href="{{ flat.link }}" target="_blank" rel="noopener">Zur Original-Anzeige →</a>
|
<a href="{{ flat.link }}" target="_blank" rel="noopener">Zur Original-Anzeige →</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
{% else %}
|
||||||
|
<div class="px-4 py-5 text-sm text-slate-500">
|
||||||
|
Keine Bilder gefunden.
|
||||||
|
<a href="{{ flat.link }}" target="_blank" rel="noopener" class="ml-1">Zur Original-Anzeige →</a>
|
||||||
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
|
||||||
|
|
@ -90,8 +90,8 @@
|
||||||
hx-post="/actions/enrich-all" hx-target="#wohnungen-body" hx-swap="outerHTML">
|
hx-post="/actions/enrich-all" hx-target="#wohnungen-body" hx-swap="outerHTML">
|
||||||
<input type="hidden" name="csrf" value="{{ csrf }}">
|
<input type="hidden" name="csrf" value="{{ csrf }}">
|
||||||
<button class="btn btn-ghost text-xs" type="submit"
|
<button class="btn btn-ghost text-xs" type="submit"
|
||||||
hx-confirm="Altbestand jetzt durch Haiku nachträglich anreichern? Kann einige Minuten dauern.">
|
hx-confirm="Bilder für ausstehende Wohnungen nachladen? Kann einige Minuten dauern.">
|
||||||
Anreichern ({{ enrichment_counts.pending + enrichment_counts.failed }})
|
Bilder nachladen ({{ enrichment_counts.pending + enrichment_counts.failed }})
|
||||||
</button>
|
</button>
|
||||||
</form>
|
</form>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
|
@ -133,52 +133,16 @@
|
||||||
{% elif item.last and item.last.success == 1 %}<span class="chip chip-ok">beworben</span>
|
{% elif item.last and item.last.success == 1 %}<span class="chip chip-ok">beworben</span>
|
||||||
{% elif item.last and item.last.success == 0 %}<span class="chip chip-bad">fehlgeschlagen</span>
|
{% elif item.last and item.last.success == 0 %}<span class="chip chip-bad">fehlgeschlagen</span>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if f.enrichment_status == 'pending' %}<span class="chip">analysiert…</span>
|
|
||||||
{% elif f.enrichment_status == 'failed' %}<span class="chip chip-warn" title="Detail-Analyse fehlgeschlagen">?</span>
|
|
||||||
{% endif %}
|
|
||||||
</div>
|
</div>
|
||||||
<div class="text-xs text-slate-500 mt-0.5">
|
<div class="text-xs text-slate-500 mt-0.5">
|
||||||
{% if f.enrichment_status == 'pending' %}
|
|
||||||
Infos werden abgerufen…
|
|
||||||
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
|
||||||
{% elif f.enrichment_status == 'failed' %}
|
|
||||||
{% set err = (item.enrichment or {}).get('_error') or 'unbekannt' %}
|
|
||||||
<span title="{{ err }}">Fehler beim Abrufen der Infos</span>
|
|
||||||
{% if is_admin %}
|
|
||||||
<form method="post" action="/actions/enrich-flat" class="inline"
|
|
||||||
hx-post="/actions/enrich-flat" hx-target="#wohnungen-body" hx-swap="outerHTML">
|
|
||||||
<input type="hidden" name="csrf" value="{{ csrf }}">
|
|
||||||
<input type="hidden" name="flat_id" value="{{ f.id }}">
|
|
||||||
<button type="submit" class="underline text-slate-600 hover:text-slate-900 ml-1">erneut versuchen</button>
|
|
||||||
</form>
|
|
||||||
{% endif %}
|
|
||||||
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
|
||||||
{% else %}
|
|
||||||
{# LLM first, scraper as fallback. The scraper data
|
|
||||||
from inberlinwohnen.de is reliable; we only
|
|
||||||
replace it when the LLM has a concrete value. #}
|
|
||||||
{% set e = item.enrichment or {} %}
|
|
||||||
{% set rooms = e.rooms if e.rooms is not none else f.rooms %}
|
|
||||||
{% set size = e.size_sqm if e.size_sqm is not none else f.size %}
|
|
||||||
{% set rent = e.rent_total or e.rent_cold or f.total_rent %}
|
|
||||||
{% if e.wbs_required is sameas true %}
|
|
||||||
{% set wbs_label = 'WBS: ' ~ (e.wbs_type or 'erforderlich') %}
|
|
||||||
{% elif e.wbs_required is sameas false %}
|
|
||||||
{% set wbs_label = 'ohne WBS' %}
|
|
||||||
{% elif f.wbs == 'erforderlich' %}
|
|
||||||
{% set wbs_label = 'WBS: erforderlich' %}
|
|
||||||
{% elif f.wbs == 'nicht erforderlich' %}
|
|
||||||
{% set wbs_label = 'ohne WBS' %}
|
|
||||||
{% else %}
|
|
||||||
{% set wbs_label = '' %}
|
|
||||||
{% endif %}
|
|
||||||
{% set parts = [] %}
|
{% set parts = [] %}
|
||||||
{% if rooms %}{% set _ = parts.append('%g Z'|format(rooms)) %}{% endif %}
|
{% if f.rooms %}{% set _ = parts.append('%g Z'|format(f.rooms)) %}{% endif %}
|
||||||
{% if size %}{% set _ = parts.append('%.0f m²'|format(size)) %}{% endif %}
|
{% if f.size %}{% set _ = parts.append('%.0f m²'|format(f.size)) %}{% endif %}
|
||||||
{% if rent %}{% set _ = parts.append('%.0f €'|format(rent)) %}{% endif %}
|
{% if f.total_rent %}{% set _ = parts.append('%.0f €'|format(f.total_rent)) %}{% endif %}
|
||||||
{% if wbs_label %}{% set _ = parts.append(wbs_label) %}{% endif %}
|
{% if f.wbs == 'erforderlich' %}{% set _ = parts.append('WBS: erforderlich') %}
|
||||||
{{ parts|join(' · ') }}{% if parts %} · {% endif %}<span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
{% elif f.wbs == 'nicht erforderlich' %}{% set _ = parts.append('ohne WBS') %}
|
||||||
{% endif %}
|
{% endif %}
|
||||||
|
{{ parts|join(' · ') }}{% if parts %} · {% endif %}<span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="flex gap-2 items-center">
|
<div class="flex gap-2 items-center">
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue