diff --git a/web/app.py b/web/app.py index 2999224..7ce4dd9 100644 --- a/web/app.py +++ b/web/app.py @@ -403,13 +403,7 @@ def _wohnungen_context(user) -> dict: }, filters): continue last = db.last_application_for_flat(uid, f["id"]) - enrichment_data = None - if f["enrichment_json"]: - try: - enrichment_data = json.loads(f["enrichment_json"]) - except Exception: - enrichment_data = None - flats_view.append({"row": f, "last": last, "enrichment": enrichment_data}) + flats_view.append({"row": f, "last": last}) rejected_view = db.rejected_flats(uid) enrichment_counts = db.enrichment_counts() @@ -489,12 +483,6 @@ def partial_wohnung_detail(request: Request, flat_id: str, user=Depends(require_ flat = db.get_flat(flat_id) if not flat: raise HTTPException(404) - enrichment_data = None - if flat["enrichment_json"]: - try: - enrichment_data = json.loads(flat["enrichment_json"]) - except Exception: - enrichment_data = None slug = enrichment.flat_slug(flat_id) image_urls = [ f"/flat-images/{slug}/{i}" @@ -503,7 +491,6 @@ def partial_wohnung_detail(request: Request, flat_id: str, user=Depends(require_ ctx = { "request": request, "flat": flat, - "enrichment": enrichment_data, "enrichment_status": flat["enrichment_status"], "image_urls": image_urls, } diff --git a/web/enrichment.py b/web/enrichment.py index 56f7f30..fb9bc42 100644 --- a/web/enrichment.py +++ b/web/enrichment.py @@ -1,19 +1,20 @@ -"""Flat-enrichment pipeline. +"""Image enrichment pipeline. For each new flat we: 1. Ask the apply service to fetch the listing via Playwright (bypasses bot guards) -2. Feed the HTML to Haiku via `llm.extract_flat_details` → structured dict -3. Download each image URL directly into /data/flats//NN. -4. Persist result on the flat row (enrichment_json + image_count + status) +2. Optionally let Haiku narrow the URL list down to actual flat photos +3. 
Download the images into /data/flats/<slug>/NN.<ext>, deduplicating by + exact bytes (SHA256) and visual similarity (perceptual hash) -Kicked as a detached asyncio task from /internal/flats so scraping stays fast. -Every failure is caught, stashed in enrichment_json as {"_error": "...", ...} -and mirrored into the errors log so /logs/protokoll explains what went wrong. +Structured fields (rooms / size / rent / WBS) come from the inberlinwohnen.de +scraper and are not re-derived by the LLM. Failures land in +enrichment_status='failed' with the reason in enrichment_json._error. """ from __future__ import annotations import asyncio import hashlib +import io import logging import mimetypes import os @@ -33,12 +34,12 @@ IMAGES_DIR = DATA_DIR / "flats" IMAGES_DIR.mkdir(parents=True, exist_ok=True) MAX_IMAGES = 12 -MAX_IMAGE_BYTES = 3_000_000 # 3 MB per image +MAX_IMAGE_BYTES = 5_000_000 IMAGE_TIMEOUT = 15 +PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → considered the same picture class EnrichmentError(Exception): - """Raised by each pipeline step with a human-readable reason.""" def __init__(self, step: str, reason: str): self.step = step self.reason = reason @@ -46,7 +47,6 @@ class EnrichmentError(Exception): def flat_slug(flat_id: str) -> str: - """Filesystem-safe short identifier for a flat (IDs are URLs).""" return hashlib.sha1(flat_id.encode("utf-8")).hexdigest()[:16] @@ -77,25 +77,50 @@ def _fetch_listing(url: str) -> dict: raise EnrichmentError("fetch", "apply returned non-JSON response") -def _ext_from_response(resp: requests.Response, url: str) -> str: - ct = resp.headers.get("content-type", "").split(";")[0].strip().lower() +def _ext_for(content_type: str, url: str) -> str: + ct = content_type.split(";")[0].strip().lower() if ct: ext = mimetypes.guess_extension(ct) or "" if ext: return ext.replace(".jpe", ".jpg") - path = urlparse(url).path - _, ext = os.path.splitext(path) + _, ext = os.path.splitext(urlparse(url).path) return ext.lower() or ".jpg" +def 
_phash(data: bytes): + """Return an imagehash.ImageHash for the bytes, or None if unsupported.""" + try: + from PIL import Image + import imagehash + except ImportError: + return None + try: + with Image.open(io.BytesIO(data)) as img: + img.load() + return imagehash.phash(img) + except Exception: + return None + + def _download_images(flat_id: str, urls: list[str], referer: str) -> int: + """Download up to MAX_IMAGES distinct flat photos. Dedup tiers: + 1. Skip URLs that fail / aren't images + 2. Skip exact byte-equal duplicates (different URL pointing to same file) + 3. Skip visually-equivalent images (same picture re-encoded at a + different size — common with srcset/CDN variants) + """ d = flat_image_dir(flat_id) for old in d.iterdir(): try: old.unlink() except OSError: pass + seen_sha: set[str] = set() + seen_phash: list = [] # list of imagehash objects + saved = 0 - for raw_url in urls[:MAX_IMAGES]: + for raw_url in urls: + if saved >= MAX_IMAGES: + break try: r = requests.get( raw_url, @@ -109,48 +134,53 @@ def _download_images(flat_id: str, urls: list[str], referer: str) -> int: ct = r.headers.get("content-type", "").split(";")[0].strip().lower() if not ct.startswith("image/"): continue - ext = _ext_from_response(r, raw_url) - path = d / f"{saved + 1:02d}{ext}" - total = 0 - with open(path, "wb") as f: - for chunk in r.iter_content(chunk_size=65_536): - if not chunk: - continue - total += len(chunk) - if total > MAX_IMAGE_BYTES: - break - f.write(chunk) - if total == 0: - path.unlink(missing_ok=True) + data = bytearray() + for chunk in r.iter_content(chunk_size=65_536): + if not chunk: + continue + data.extend(chunk) + if len(data) > MAX_IMAGE_BYTES: + break + if not data: continue - saved += 1 except requests.RequestException as e: logger.info("image download failed %s: %s", raw_url, e) continue + + data_bytes = bytes(data) + + sha = hashlib.sha256(data_bytes).hexdigest() + if sha in seen_sha: + continue + seen_sha.add(sha) + + ph = _phash(data_bytes) + if 
ph is not None: + if any((ph - prev) <= PHASH_DUPLICATE_THRESHOLD for prev in seen_phash): + continue + seen_phash.append(ph) + + ext = _ext_for(ct, raw_url) + path = d / f"{saved + 1:02d}{ext}" + path.write_bytes(data_bytes) + saved += 1 + return saved def enrich_flat_sync(flat_id: str) -> None: - """Run the full enrichment pipeline for one flat. Blocking.""" flat = db.get_flat(flat_id) if not flat: return url = flat["link"] - logger.info("enrich start flat=%s url=%s", flat_id, url) + logger.info("enrich start flat=%s", flat_id) try: listing = _fetch_listing(url) - html = listing.get("html") or "" - final_url = listing.get("final_url") or url - if not html.strip(): - raise EnrichmentError("fetch", "apply returned empty HTML") - - details = llm.extract_flat_details(html, final_url) - if details is None: - raise EnrichmentError("llm", "model returned no tool_use or call failed (see web logs)") - - image_urls = listing.get("image_urls") or [] - image_count = _download_images(flat_id, image_urls, referer=url) + candidates = listing.get("image_urls") or [] + if candidates: + candidates = llm.select_flat_image_urls(candidates, listing.get("final_url") or url) + image_count = _download_images(flat_id, candidates, referer=url) except EnrichmentError as e: _record_failure(flat_id, e.step, e.reason) return @@ -159,7 +189,7 @@ def enrich_flat_sync(flat_id: str) -> None: _record_failure(flat_id, "crash", f"{type(e).__name__}: {e}") return - db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count) + db.set_flat_enrichment(flat_id, "ok", enrichment=None, image_count=image_count) logger.info("enrich done flat=%s images=%d", flat_id, image_count) @@ -179,7 +209,6 @@ def _record_failure(flat_id: str, step: str, reason: str) -> None: def kick(flat_id: str) -> None: - """Fire-and-forget enrichment in a background thread.""" asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id)) diff --git a/web/llm.py b/web/llm.py index 491f0d3..73afa88 100644 
--- a/web/llm.py +++ b/web/llm.py @@ -1,12 +1,14 @@ -"""Minimal Anthropic Messages API wrapper for flat enrichment. +"""Anthropic Haiku helper — used only to pick which `<img>` URLs on a +listing page are actual photos of the flat (vs. nav icons, badges, ads…). -Uses tool-use forced output so Haiku returns structured JSON instead of free -text we'd have to regex. No SDK — plain `requests` is enough here. +If the API key is missing or the call fails, the caller passes the original +candidates straight through, so this is a soft enhancement, not a +dependency. """ from __future__ import annotations import logging -from typing import Any, Optional +from typing import Optional import requests @@ -17,68 +19,41 @@ logger = logging.getLogger("web.llm") API_URL = "https://api.anthropic.com/v1/messages" API_VERSION = "2023-06-01" -TOOL_NAME = "record_flat_details" -TOOL_SCHEMA: dict[str, Any] = { +TOOL_NAME = "select_flat_images" +TOOL_SCHEMA = { "type": "object", "properties": { - "address": {"type": ["string", "null"], - "description": "Full street address incl. postcode+city if present"}, - "rooms": {"type": ["number", "null"], "description": "Number of rooms (decimal ok)"}, - "size_sqm": {"type": ["number", "null"], "description": "Size in m²"}, - "rent_cold": {"type": ["number", "null"], "description": "Kaltmiete in €"}, - "rent_total": {"type": ["number", "null"], "description": "Warm/Gesamtmiete in €"}, - "utilities": {"type": ["number", "null"], "description": "Nebenkosten in €"}, - "deposit": {"type": ["number", "null"], "description": "Kaution in €"}, - "available_from": {"type": ["string", "null"], "description": "Bezugsfrei ab (text)"}, - "floor": {"type": ["string", "null"], "description": "Etage (text, z.B. '3. 
OG')"}, - "heating": {"type": ["string", "null"]}, - "energy_certificate": {"type": ["string", "null"]}, - "energy_value": {"type": ["string", "null"]}, - "year_built": {"type": ["string", "null"]}, - "wbs_required": {"type": ["boolean", "null"]}, - "wbs_type": {"type": ["string", "null"], "description": "WBS-Typ, z.B. '160' oder null"}, - "description": { - "type": ["string", "null"], - "description": "Kurze 2–3-Satz-Beschreibung der Wohnung auf Deutsch. Fakten, keine Werbesprache.", - }, - "features": { - "type": "array", "items": {"type": "string"}, - "description": "Ausstattungsmerkmale (z.B. 'Balkon', 'Einbauküche', 'Parkett')", - }, - "pros": { - "type": "array", "items": {"type": "string"}, - "description": "2–4 konkrete Vorteile aus Bewerbersicht (keine Werbung)", - }, - "cons": { - "type": "array", "items": {"type": "string"}, - "description": "2–4 mögliche Nachteile / Punkte zum Beachten", + "urls": { + "type": "array", + "items": {"type": "string"}, + "description": "Subset of the candidate URLs that show the actual flat — " + "interior, exterior, floorplan. Keep ordering of input.", }, }, - "required": [], + "required": ["urls"], "additionalProperties": False, } SYSTEM_PROMPT = ( - "Du extrahierst strukturierte Wohnungsdaten aus deutschem HTML-Quelltext von " - "Berliner Wohnungsbaugesellschaften (howoge, gewobag, degewo, gesobau, wbm, " - "stadt-und-land). Antworte AUSSCHLIESSLICH über den bereitgestellten Tool-Call. " - "Fehlende Werte → null. Keine Erfindungen — wenn etwas nicht klar aus dem HTML " - "hervorgeht, lass das Feld null. Zahlen bitte als Zahlen (nicht als String), " - "Beschreibung/Pros/Cons auf Deutsch." + "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur " + "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht " + "des Gebäudes, Grundriss). Verwerfe Logos, Icons, Banner, Ads, " + "Bewertungs-Sterne, Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-" + "Pixel. 
Behalte die Reihenfolge der Input-Liste bei. Antworte " + "ausschließlich über den Tool-Call." ) -def extract_flat_details(html: str, url: str, - max_html_chars: int = 60_000, - timeout: int = 60) -> Optional[dict]: - """Call Haiku; return the structured dict or None on failure.""" - if not ANTHROPIC_API_KEY: - logger.info("skipping enrichment: ANTHROPIC_API_KEY not set") - return None +def select_flat_image_urls(candidates: list[str], page_url: str, + timeout: int = 30) -> list[str]: + """Return the LLM-filtered subset, or the original list on any failure.""" + if not ANTHROPIC_API_KEY or not candidates: + return candidates - user_content = ( - f"URL: {url}\n\n" - f"HTML-Quellcode (ggf. gekürzt):\n---\n{html[:max_html_chars]}\n---" + user_text = ( + f"Seite: {page_url}\n\n" + "Kandidaten-URLs (nummeriert):\n" + + "\n".join(f"{i+1}. {u}" for i, u in enumerate(candidates)) ) body = { "model": ANTHROPIC_MODEL, @@ -86,11 +61,11 @@ def extract_flat_details(html: str, url: str, "system": SYSTEM_PROMPT, "tools": [{ "name": TOOL_NAME, - "description": "Persist the extracted flat details.", + "description": "Persist the selected flat-photo URLs.", "input_schema": TOOL_SCHEMA, }], "tool_choice": {"type": "tool", "name": TOOL_NAME}, - "messages": [{"role": "user", "content": user_content}], + "messages": [{"role": "user", "content": user_text}], } try: r = requests.post( @@ -104,16 +79,18 @@ def extract_flat_details(html: str, url: str, timeout=timeout, ) except requests.RequestException as e: - logger.warning("anthropic request failed: %s", e) - return None - + logger.warning("anthropic image-select request failed: %s", e) + return candidates if r.status_code >= 400: - logger.warning("anthropic %s: %s", r.status_code, r.text[:300]) - return None + logger.warning("anthropic image-select %s: %s", r.status_code, r.text[:300]) + return candidates data = r.json() for block in data.get("content", []): if block.get("type") == "tool_use" and block.get("name") == TOOL_NAME: - return 
block.get("input") or {} - logger.warning("anthropic returned no tool_use block: %s", data) - return None + urls = (block.get("input") or {}).get("urls") or [] + # Constrain to the original candidate set so the model can't + # invent URLs (it sometimes lightly rewrites them otherwise). + allowed = set(candidates) + return [u for u in urls if u in allowed] + return candidates diff --git a/web/requirements.txt b/web/requirements.txt index 56293ef..a54f003 100644 --- a/web/requirements.txt +++ b/web/requirements.txt @@ -6,3 +6,5 @@ itsdangerous==2.2.0 python-multipart==0.0.17 python-dotenv==1.0.1 requests==2.32.5 +Pillow==11.0.0 +ImageHash==4.3.1 diff --git a/web/templates/_wohnung_detail.html b/web/templates/_wohnung_detail.html index b8d9152..e3ea633 100644 --- a/web/templates/_wohnung_detail.html +++ b/web/templates/_wohnung_detail.html @@ -1,14 +1,13 @@ -{# Expanded detail for a single flat, loaded into #flat-detail- via HTMX. #} +{# Expanded detail for a single flat — only shows downloaded images. #} {% if enrichment_status == 'pending' %} -
Analyse läuft – kommt in wenigen Augenblicken zurück…
+
Bilder werden abgerufen…
{% elif enrichment_status == 'failed' %}
- Detail-Analyse konnte nicht abgerufen werden. - Zur Original-Anzeige → + Bilder konnten nicht geladen werden. + Zur Original-Anzeige →
-{% else %} -
- {% if image_urls %} +{% elif image_urls %} +
- {% endif %} - - {% if enrichment and enrichment.description %} -

{{ enrichment.description }}

- {% endif %} - - {% if enrichment %} -
- {% macro kv(label, value) %} - {% if value is not none and value != '' %} -
- {{ label }} - {{ value }} -
- {% endif %} - {% endmacro %} - {{ kv('Adresse', enrichment.address) }} - {{ kv('Zimmer', enrichment.rooms) }} - {{ kv('Größe', enrichment.size_sqm ~ ' m²' if enrichment.size_sqm else none) }} - {{ kv('Kaltmiete', enrichment.rent_cold ~ ' €' if enrichment.rent_cold else none) }} - {{ kv('Nebenkosten', enrichment.utilities ~ ' €' if enrichment.utilities else none) }} - {{ kv('Gesamtmiete', enrichment.rent_total ~ ' €' if enrichment.rent_total else none) }} - {{ kv('Kaution', enrichment.deposit ~ ' €' if enrichment.deposit else none) }} - {{ kv('Bezugsfrei ab', enrichment.available_from) }} - {{ kv('Etage', enrichment.floor) }} - {{ kv('Heizung', enrichment.heating) }} - {{ kv('Energieausweis', enrichment.energy_certificate) }} - {{ kv('Energiewert', enrichment.energy_value) }} - {{ kv('Baujahr', enrichment.year_built) }} - {{ kv('WBS', 'erforderlich' if enrichment.wbs_required else ('nicht erforderlich' if enrichment.wbs_required == false else none)) }} - {{ kv('WBS-Typ', enrichment.wbs_type) }} -
- {% endif %} - - {% if enrichment and enrichment.features %} -
- {% for f in enrichment.features %}{{ f }}{% endfor %} -
- {% endif %} - -
- {% if enrichment and enrichment.pros %} -
-
Pro
-
    - {% for p in enrichment.pros %}
  • + {{ p }}
  • {% endfor %} -
-
- {% endif %} - {% if enrichment and enrichment.cons %} -
-
Contra
-
    - {% for c in enrichment.cons %}
  • − {{ c }}
  • {% endfor %} -
-
- {% endif %} -
-
+{% else %} +
+ Keine Bilder gefunden. + Zur Original-Anzeige → +
{% endif %} diff --git a/web/templates/_wohnungen_body.html b/web/templates/_wohnungen_body.html index fda8748..cc14ba6 100644 --- a/web/templates/_wohnungen_body.html +++ b/web/templates/_wohnungen_body.html @@ -90,8 +90,8 @@ hx-post="/actions/enrich-all" hx-target="#wohnungen-body" hx-swap="outerHTML"> {% endif %} @@ -133,52 +133,16 @@ {% elif item.last and item.last.success == 1 %}beworben {% elif item.last and item.last.success == 0 %}fehlgeschlagen {% endif %} - {% if f.enrichment_status == 'pending' %}analysiert… - {% elif f.enrichment_status == 'failed' %}? - {% endif %}
- {% if f.enrichment_status == 'pending' %} - Infos werden abgerufen… - · - {% elif f.enrichment_status == 'failed' %} - {% set err = (item.enrichment or {}).get('_error') or 'unbekannt' %} - Fehler beim Abrufen der Infos - {% if is_admin %} -
- - - -
- {% endif %} - · - {% else %} - {# LLM first, scraper as fallback. The scraper data - from inberlinwohnen.de is reliable; we only - replace it when the LLM has a concrete value. #} - {% set e = item.enrichment or {} %} - {% set rooms = e.rooms if e.rooms is not none else f.rooms %} - {% set size = e.size_sqm if e.size_sqm is not none else f.size %} - {% set rent = e.rent_total or e.rent_cold or f.total_rent %} - {% if e.wbs_required is sameas true %} - {% set wbs_label = 'WBS: ' ~ (e.wbs_type or 'erforderlich') %} - {% elif e.wbs_required is sameas false %} - {% set wbs_label = 'ohne WBS' %} - {% elif f.wbs == 'erforderlich' %} - {% set wbs_label = 'WBS: erforderlich' %} - {% elif f.wbs == 'nicht erforderlich' %} - {% set wbs_label = 'ohne WBS' %} - {% else %} - {% set wbs_label = '' %} - {% endif %} - {% set parts = [] %} - {% if rooms %}{% set _ = parts.append('%g Z'|format(rooms)) %}{% endif %} - {% if size %}{% set _ = parts.append('%.0f m²'|format(size)) %}{% endif %} - {% if rent %}{% set _ = parts.append('%.0f €'|format(rent)) %}{% endif %} - {% if wbs_label %}{% set _ = parts.append(wbs_label) %}{% endif %} - {{ parts|join(' · ') }}{% if parts %} · {% endif %} + {% set parts = [] %} + {% if f.rooms %}{% set _ = parts.append('%g Z'|format(f.rooms)) %}{% endif %} + {% if f.size %}{% set _ = parts.append('%.0f m²'|format(f.size)) %}{% endif %} + {% if f.total_rent %}{% set _ = parts.append('%.0f €'|format(f.total_rent)) %}{% endif %} + {% if f.wbs == 'erforderlich' %}{% set _ = parts.append('WBS: erforderlich') %} + {% elif f.wbs == 'nicht erforderlich' %}{% set _ = parts.append('ohne WBS') %} {% endif %} + {{ parts|join(' · ') }}{% if parts %} · {% endif %}