diff --git a/web/enrichment.py b/web/enrichment.py index fb9bc42..7e8b103 100644 --- a/web/enrichment.py +++ b/web/enrichment.py @@ -35,8 +35,10 @@ IMAGES_DIR.mkdir(parents=True, exist_ok=True) MAX_IMAGES = 12 MAX_IMAGE_BYTES = 5_000_000 +MIN_IMAGE_BYTES = 15_000 # Below this, it's an icon / tracking pixel +MIN_IMAGE_DIMENSION = 400 # Shortest side in pixels — filters thumbs IMAGE_TIMEOUT = 15 -PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → considered the same picture +PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → same picture class EnrichmentError(Exception): @@ -87,19 +89,19 @@ def _ext_for(content_type: str, url: str) -> str: return ext.lower() or ".jpg" -def _phash(data: bytes): - """Return an imagehash.ImageHash for the bytes, or None if unsupported.""" +def _image_info(data: bytes): + """Return (phash, (width, height)) for the bytes, or (None, None) on failure.""" try: from PIL import Image import imagehash except ImportError: - return None + return None, None try: with Image.open(io.BytesIO(data)) as img: img.load() - return imagehash.phash(img) + return imagehash.phash(img), img.size except Exception: - return None + return None, None def _download_images(flat_id: str, urls: list[str], referer: str) -> int: @@ -149,12 +151,20 @@ def _download_images(flat_id: str, urls: list[str], referer: str) -> int: data_bytes = bytes(data) + # Size filter: files under ~15 KB are icons/tracking pixels. + if len(data_bytes) < MIN_IMAGE_BYTES: + continue + sha = hashlib.sha256(data_bytes).hexdigest() if sha in seen_sha: continue seen_sha.add(sha) - ph = _phash(data_bytes) + ph, dims = _image_info(data_bytes) + # Pixel filter: anything smaller than 400 px on the short side is a + # thumbnail/avatar, not a real flat photo. + if dims is not None and min(dims) < MIN_IMAGE_DIMENSION: + continue if ph is not None: if any((ph - prev) <= PHASH_DUPLICATE_THRESHOLD for prev in seen_phash): continue diff --git a/web/llm.py b/web/llm.py index 73afa88..de9299b 100644 --- a/web/llm.py +++ b/web/llm.py @@ -36,11 +36,18 @@ TOOL_SCHEMA = { SYSTEM_PROMPT = ( "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur " - "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht " - "des Gebäudes, Grundriss). Verwerfe Logos, Icons, Banner, Ads, " - "Bewertungs-Sterne, Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-" - "Pixel. Behalte die Reihenfolge der Input-Liste bei. Antworte " - "ausschließlich über den Tool-Call." + "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht, " + "Grundriss). Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, " + "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel.\n\n" + "WICHTIG: Viele Wohnungsbaugesellschaften liefern ihre Fotos über ein " + "CDN mit kryptischen Dateinamen aus (z.B. " + "fileadmin/_processed_/2/3/xcsm_abc123def.webp.pagespeed.ic.xyz.webp). " + "Solche URLs sind IMMER zu behalten, wenn sie vom selben Host wie die " + "Anzeige kommen und in /fileadmin/_processed_/ oder /assets/media/ " + "oder ähnlichen Media-Pfaden liegen. Lehne nichts nur wegen eines " + "unleserlichen Dateinamens ab.\n\n" + "Behalte die Reihenfolge der Input-Liste bei. Antworte ausschließlich " + "über den Tool-Call." ) @@ -92,5 +99,13 @@ def select_flat_image_urls(candidates: list[str], page_url: str, # Constrain to the original candidate set so the model can't # invent URLs (it sometimes lightly rewrites them otherwise). allowed = set(candidates) - return [u for u in urls if u in allowed] + kept = [u for u in urls if u in allowed] + if not kept and candidates: + # The model rejected everything — usually because the URLs + # are opaque CDN hashes it couldn't identify positively. + # Trust the Playwright pre-filter instead of returning []. + logger.warning("image-select returned 0 of %d — falling back to unfiltered list", + len(candidates)) + return candidates + return kept return candidates