enrichment: size floor + LLM fallback for opaque CDN URLs
Two issues surfaced on HOWOGE and similar sites:

1. Tiny icons/1x1 tracking pixels leaked through (e.g. image #5, 1.8 KB). Added MIN_IMAGE_BYTES = 15_000 and MIN_IMAGE_DIMENSION = 400 px on the short side; files below either threshold are dropped before saving. Pillow already gives us the dims as part of the phash pass, so the check is free.

2. Listings whose image URLs are opaque CDN hashes (.../fileadmin/_processed_/2/3/xcsm_<hash>.webp.pagespeed.ic.<hash>.webp) caused the LLM URL picker to reject every candidate, yielding 0 images for legit flats. Fixes: (a) prompt now explicitly instructs Haiku to keep same-host /fileadmin/_processed_/ style URLs even when the filename is illegible, (b) if the model still returns an empty set we fall back to the unfiltered Playwright candidates, trusting the pre-filter instead of erasing the gallery.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
0aa4c6c2bb
commit
83db8cd902
2 changed files with 38 additions and 13 deletions
|
|
@ -35,8 +35,10 @@ IMAGES_DIR.mkdir(parents=True, exist_ok=True)
|
|||
|
||||
MAX_IMAGES = 12
|
||||
MAX_IMAGE_BYTES = 5_000_000
|
||||
MIN_IMAGE_BYTES = 15_000 # Below this, it's an icon / tracking pixel
|
||||
MIN_IMAGE_DIMENSION = 400 # Shortest side in pixels — filters thumbs
|
||||
IMAGE_TIMEOUT = 15
|
||||
PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → considered the same picture
|
||||
PHASH_DUPLICATE_THRESHOLD = 5 # Hamming distance ≤ N → same picture
|
||||
|
||||
|
||||
class EnrichmentError(Exception):
|
||||
|
|
@ -87,19 +89,19 @@ def _ext_for(content_type: str, url: str) -> str:
|
|||
return ext.lower() or ".jpg"
|
||||
|
||||
|
||||
def _phash(data: bytes):
|
||||
"""Return an imagehash.ImageHash for the bytes, or None if unsupported."""
|
||||
def _image_info(data: bytes):
|
||||
"""Return (phash, (width, height)) for the bytes, or (None, None) on failure."""
|
||||
try:
|
||||
from PIL import Image
|
||||
import imagehash
|
||||
except ImportError:
|
||||
return None
|
||||
return None, None
|
||||
try:
|
||||
with Image.open(io.BytesIO(data)) as img:
|
||||
img.load()
|
||||
return imagehash.phash(img)
|
||||
return imagehash.phash(img), img.size
|
||||
except Exception:
|
||||
return None
|
||||
return None, None
|
||||
|
||||
|
||||
def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
|
||||
|
|
@ -149,12 +151,20 @@ def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
|
|||
|
||||
data_bytes = bytes(data)
|
||||
|
||||
# Size filter: files under ~15 KB are icons/tracking pixels.
|
||||
if len(data_bytes) < MIN_IMAGE_BYTES:
|
||||
continue
|
||||
|
||||
sha = hashlib.sha256(data_bytes).hexdigest()
|
||||
if sha in seen_sha:
|
||||
continue
|
||||
seen_sha.add(sha)
|
||||
|
||||
ph = _phash(data_bytes)
|
||||
ph, dims = _image_info(data_bytes)
|
||||
# Pixel filter: anything smaller than 400 px on the short side is a
|
||||
# thumbnail/avatar, not a real flat photo.
|
||||
if dims is not None and min(dims) < MIN_IMAGE_DIMENSION:
|
||||
continue
|
||||
if ph is not None:
|
||||
if any((ph - prev) <= PHASH_DUPLICATE_THRESHOLD for prev in seen_phash):
|
||||
continue
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue