From 83db8cd9020e8ac1ba835a3f69dbf1fec025e801 Mon Sep 17 00:00:00 2001 From: EiSiMo Date: Tue, 21 Apr 2026 16:44:47 +0200 Subject: [PATCH] enrichment: size floor + LLM fallback for opaque CDN URLs Two issues surfaced on HOWOGE and similar sites: 1. Tiny icons/1x1 tracking pixels leaked through (e.g. image #5, 1.8 KB). Added MIN_IMAGE_BYTES = 15_000 and MIN_IMAGE_DIMENSION = 400 px on the short side; files below either threshold are dropped before saving. Pillow already gives us the dims as part of the phash pass, so the check is free. 2. Listings whose image URLs are opaque CDN hashes (.../fileadmin/_processed_/2/3/xcsm_abc123def.webp.pagespeed.ic.xyz.webp) caused the LLM URL picker to reject every candidate, yielding 0 images for legit flats. Fixes: (a) prompt now explicitly instructs Haiku to keep same-host /fileadmin/_processed_/ style URLs even when the filename is illegible, (b) if the model still returns an empty set we fall back to the unfiltered Playwright candidates, trusting the pre-filter instead of erasing the gallery. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- web/enrichment.py | 24 +++++++++++++++++------- web/llm.py | 27 +++++++++++++++++++++------ 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/web/enrichment.py b/web/enrichment.py index fb9bc42..7e8b103 100644 --- a/web/enrichment.py +++ b/web/enrichment.py @@ -35,8 +35,10 @@ IMAGES_DIR.mkdir(parents=True, exist_ok=True) MAX_IMAGES = 12 MAX_IMAGE_BYTES = 5_000_000 +MIN_IMAGE_BYTES = 15_000 # Below this, it's an icon / tracking pixel +MIN_IMAGE_DIMENSION = 400 # Shortest side in pixels — filters thumbs IMAGE_TIMEOUT = 15 -PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → considered the same picture +PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → same picture class EnrichmentError(Exception): @@ -87,19 +89,19 @@ def _ext_for(content_type: str, url: str) -> str: return ext.lower() or ".jpg" -def _phash(data: bytes): - """Return an imagehash.ImageHash for the bytes, or None if unsupported.""" +def _image_info(data: bytes): + """Return (phash, (width, height)) for the bytes, or (None, None) on failure.""" try: from PIL import Image import imagehash except ImportError: - return None + return None, None try: with Image.open(io.BytesIO(data)) as img: img.load() - return imagehash.phash(img) + return imagehash.phash(img), img.size except Exception: - return None + return None, None def _download_images(flat_id: str, urls: list[str], referer: str) -> int: @@ -149,12 +151,20 @@ def _download_images(flat_id: str, urls: list[str], referer: str) -> int: data_bytes = bytes(data) + # Size filter: files under ~15 KB are icons/tracking pixels. + if len(data_bytes) < MIN_IMAGE_BYTES: + continue + sha = hashlib.sha256(data_bytes).hexdigest() if sha in seen_sha: continue seen_sha.add(sha) - ph = _phash(data_bytes) + ph, dims = _image_info(data_bytes) + # Pixel filter: anything smaller than 400 px on the short side is a + # thumbnail/avatar, not a real flat photo. 
+ if dims is not None and min(dims) < MIN_IMAGE_DIMENSION: + continue if ph is not None: if any((ph - prev) <= PHASH_DUPLICATE_THRESHOLD for prev in seen_phash): continue diff --git a/web/llm.py b/web/llm.py index 73afa88..de9299b 100644 --- a/web/llm.py +++ b/web/llm.py @@ -36,11 +36,18 @@ TOOL_SCHEMA = { SYSTEM_PROMPT = ( "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur " - "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht " - "des Gebäudes, Grundriss). Verwerfe Logos, Icons, Banner, Ads, " - "Bewertungs-Sterne, Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-" - "Pixel. Behalte die Reihenfolge der Input-Liste bei. Antworte " - "ausschließlich über den Tool-Call." + "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht, " + "Grundriss). Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, " + "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel.\n\n" + "WICHTIG: Viele Wohnungsbaugesellschaften liefern ihre Fotos über ein " + "CDN mit kryptischen Dateinamen aus (z.B. " + "fileadmin/_processed_/2/3/xcsm_abc123def.webp.pagespeed.ic.xyz.webp). " + "Solche URLs sind IMMER zu behalten, wenn sie vom selben Host wie die " + "Anzeige kommen und in /fileadmin/_processed_/ oder /assets/media/ " + "oder ähnlichen Media-Pfaden liegen. Lehne nichts nur wegen eines " + "unleserlichen Dateinamens ab.\n\n" + "Behalte die Reihenfolge der Input-Liste bei. Antworte ausschließlich " + "über den Tool-Call." ) @@ -92,5 +99,13 @@ def select_flat_image_urls(candidates: list[str], page_url: str, # Constrain to the original candidate set so the model can't # invent URLs (it sometimes lightly rewrites them otherwise). allowed = set(candidates) - return [u for u in urls if u in allowed] + kept = [u for u in urls if u in allowed] + if not kept and candidates: + # The model rejected everything — usually because the URLs + # are opaque CDN hashes it couldn't identify positively. 
+ # Trust the Playwright pre-filter instead of returning []. + logger.warning("image-select returned 0 of %d — falling back to unfiltered list", + len(candidates)) + return candidates + return kept return candidates