enrichment: size floor + LLM fallback for opaque CDN URLs

Two issues surfaced on HOWOGE and similar sites:

1. Tiny icons and 1x1 tracking pixels leaked through (e.g. image #5, 1.8 KB).
   Added MIN_IMAGE_BYTES = 15_000 and MIN_IMAGE_DIMENSION = 400 (px, shortest
   side); files below either threshold are dropped before saving. The
   dimension check is skipped when Pillow cannot decode the image, so such
   files are only subject to the byte floor. Pillow already opens the image
   for the phash pass, so reading the dimensions adds no extra decode.

2. Listings whose image URLs are opaque CDN hashes
   (.../fileadmin/_processed_/2/3/xcsm_<hash>.webp.pagespeed.ic.<hash>.webp)
   caused the LLM URL picker to reject every candidate, yielding 0 images
   for legitimate flats. Fixes: (a) the prompt now explicitly instructs
   Haiku to keep same-host URLs under /fileadmin/_processed_/,
   /assets/media/ or similar media paths even when the filename is
   illegible; (b) if none of the URLs the model returns is in the
   candidate list (including an empty answer), we fall back to the
   unfiltered Playwright candidates, trusting the pre-filter instead of
   erasing the gallery.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
EiSiMo 2026-04-21 16:44:47 +02:00
parent 0aa4c6c2bb
commit 83db8cd902
2 changed files with 38 additions and 13 deletions

View file

@ -35,8 +35,10 @@ IMAGES_DIR.mkdir(parents=True, exist_ok=True)
MAX_IMAGES = 12 MAX_IMAGES = 12
MAX_IMAGE_BYTES = 5_000_000 MAX_IMAGE_BYTES = 5_000_000
MIN_IMAGE_BYTES = 15_000 # Below this, it's an icon / tracking pixel
MIN_IMAGE_DIMENSION = 400 # Shortest side in pixels — filters thumbs
IMAGE_TIMEOUT = 15 IMAGE_TIMEOUT = 15
PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → considered the same picture PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → same picture
class EnrichmentError(Exception): class EnrichmentError(Exception):
@ -87,19 +89,19 @@ def _ext_for(content_type: str, url: str) -> str:
return ext.lower() or ".jpg" return ext.lower() or ".jpg"
def _phash(data: bytes): def _image_info(data: bytes):
"""Return an imagehash.ImageHash for the bytes, or None if unsupported.""" """Return (phash, (width, height)) for the bytes, or (None, None) on failure."""
try: try:
from PIL import Image from PIL import Image
import imagehash import imagehash
except ImportError: except ImportError:
return None return None, None
try: try:
with Image.open(io.BytesIO(data)) as img: with Image.open(io.BytesIO(data)) as img:
img.load() img.load()
return imagehash.phash(img) return imagehash.phash(img), img.size
except Exception: except Exception:
return None return None, None
def _download_images(flat_id: str, urls: list[str], referer: str) -> int: def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
@ -149,12 +151,20 @@ def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
data_bytes = bytes(data) data_bytes = bytes(data)
# Size filter: files under ~15 KB are icons/tracking pixels.
if len(data_bytes) < MIN_IMAGE_BYTES:
continue
sha = hashlib.sha256(data_bytes).hexdigest() sha = hashlib.sha256(data_bytes).hexdigest()
if sha in seen_sha: if sha in seen_sha:
continue continue
seen_sha.add(sha) seen_sha.add(sha)
ph = _phash(data_bytes) ph, dims = _image_info(data_bytes)
# Pixel filter: anything smaller than 400 px on the short side is a
# thumbnail/avatar, not a real flat photo.
if dims is not None and min(dims) < MIN_IMAGE_DIMENSION:
continue
if ph is not None: if ph is not None:
if any((ph - prev) <= PHASH_DUPLICATE_THRESHOLD for prev in seen_phash): if any((ph - prev) <= PHASH_DUPLICATE_THRESHOLD for prev in seen_phash):
continue continue

View file

@ -36,11 +36,18 @@ TOOL_SCHEMA = {
SYSTEM_PROMPT = ( SYSTEM_PROMPT = (
"Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur " "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur "
"die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht " "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht, "
"des Gebäudes, Grundriss). Verwerfe Logos, Icons, Banner, Ads, " "Grundriss). Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, "
"Bewertungs-Sterne, Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-" "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel.\n\n"
"Pixel. Behalte die Reihenfolge der Input-Liste bei. Antworte " "WICHTIG: Viele Wohnungsbaugesellschaften liefern ihre Fotos über ein "
"ausschließlich über den Tool-Call." "CDN mit kryptischen Dateinamen aus (z.B. "
"fileadmin/_processed_/2/3/xcsm_abc123def.webp.pagespeed.ic.xyz.webp). "
"Solche URLs sind IMMER zu behalten, wenn sie vom selben Host wie die "
"Anzeige kommen und in /fileadmin/_processed_/ oder /assets/media/ "
"oder ähnlichen Media-Pfaden liegen. Lehne nichts nur wegen eines "
"unleserlichen Dateinamens ab.\n\n"
"Behalte die Reihenfolge der Input-Liste bei. Antworte ausschließlich "
"über den Tool-Call."
) )
@ -92,5 +99,13 @@ def select_flat_image_urls(candidates: list[str], page_url: str,
# Constrain to the original candidate set so the model can't # Constrain to the original candidate set so the model can't
# invent URLs (it sometimes lightly rewrites them otherwise). # invent URLs (it sometimes lightly rewrites them otherwise).
allowed = set(candidates) allowed = set(candidates)
return [u for u in urls if u in allowed] kept = [u for u in urls if u in allowed]
if not kept and candidates:
# The model rejected everything — usually because the URLs
# are opaque CDN hashes it couldn't identify positively.
# Trust the Playwright pre-filter instead of returning [].
logger.warning("image-select returned 0 of %d — falling back to unfiltered list",
len(candidates))
return candidates
return kept
return candidates return candidates