Two issues surfaced on HOWOGE and similar sites:

1. Tiny icons/1x1 tracking pixels leaked through (e.g. image #5, 1.8 KB).
   Added MIN_IMAGE_BYTES = 15_000 and MIN_IMAGE_DIMENSION = 400 px on the
   short side; files below either threshold are dropped before saving.
   Pillow already gives us the dims as part of the phash pass, so the check
   is free.

2. Listings whose image URLs are opaque CDN hashes
   (.../fileadmin/_processed_/2/3/xcsm_<hash>.webp.pagespeed.ic.<hash>.webp)
   caused the LLM URL picker to reject every candidate, yielding 0 images
   for legit flats. Fixes:
   (a) the prompt now explicitly instructs Haiku to keep same-host
       /fileadmin/_processed_/ style URLs even when the filename is
       illegible;
   (b) if the model still returns an empty set, we fall back to the
       unfiltered Playwright candidates, trusting the pre-filter instead of
       erasing the gallery.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
111 lines
4.2 KiB
Python
111 lines
4.2 KiB
Python
"""Anthropic Haiku helper — used only to pick which `<img>` URLs on a
listing page are actual photos of the flat (vs. nav icons, badges, ads…).

If the API key is missing or the call fails, the original candidates are
returned unchanged, so this is a soft enhancement, not a dependency.
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
from settings import ANTHROPIC_API_KEY, ANTHROPIC_MODEL
|
|
|
|
# Module logger; the name is fixed (not __name__) so log routing configured
# for "web.llm" keeps working regardless of how this module is imported.
logger = logging.getLogger("web.llm")

# Anthropic Messages API endpoint and the value sent in the
# `anthropic-version` request header.
API_URL = "https://api.anthropic.com/v1/messages"
API_VERSION = "2023-06-01"
|
|
|
|
# Name of the single tool the model is forced to call; the same name is used
# to recognise the matching `tool_use` block in the response.
TOOL_NAME = "select_flat_images"

# JSON Schema for the tool input: exactly one required property, `urls`,
# holding the selected subset of candidate URLs as strings.
TOOL_SCHEMA = {
    "type": "object",
    "properties": {
        "urls": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Subset of the candidate URLs that show the actual flat — "
                           "interior, exterior, floorplan. Keep ordering of input.",
        },
    },
    "required": ["urls"],
    "additionalProperties": False,
}
|
|
|
|
# System prompt (German, matching the listing sites). Three paragraphs:
#   1. the task: keep only photos of the flat, discard logos, icons, ads, etc.;
#   2. an explicit rule to keep same-host CDN/media URLs with opaque,
#      hash-like filenames rather than rejecting them as unidentifiable;
#   3. preserve the input order and answer only through the tool call.
SYSTEM_PROMPT = (
    "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur "
    "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht, "
    "Grundriss). Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, "
    "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel.\n\n"
    "WICHTIG: Viele Wohnungsbaugesellschaften liefern ihre Fotos über ein "
    "CDN mit kryptischen Dateinamen aus (z.B. "
    "fileadmin/_processed_/2/3/xcsm_abc123def.webp.pagespeed.ic.xyz.webp). "
    "Solche URLs sind IMMER zu behalten, wenn sie vom selben Host wie die "
    "Anzeige kommen und in /fileadmin/_processed_/ oder /assets/media/ "
    "oder ähnlichen Media-Pfaden liegen. Lehne nichts nur wegen eines "
    "unleserlichen Dateinamens ab.\n\n"
    "Behalte die Reihenfolge der Input-Liste bei. Antworte ausschließlich "
    "über den Tool-Call."
)
|
|
|
|
|
|
def select_flat_image_urls(candidates: list[str], page_url: str,
                           timeout: int = 30) -> list[str]:
    """Return the LLM-filtered subset of *candidates*, or *candidates* on any failure.

    The numbered candidate list and ``page_url`` are posted to the Anthropic
    Messages API with a forced ``TOOL_NAME`` tool call, so the selection
    comes back as structured tool input rather than free text.

    Args:
        candidates: Image URLs collected from the listing page.
        page_url: URL of the listing page; included in the prompt text.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The URLs chosen by the model, restricted to strings that appear in
        ``candidates`` and kept in the order the model returned them.
        ``candidates`` itself is returned unchanged when it is empty, no API
        key is configured, the request fails, the API answers with a status
        of 400 or above, the body is not JSON or has an unexpected shape, no
        matching tool call is present, or the model kept none of the
        candidates.
    """
    # Empty input needs no filtering; check it first so the trivial case
    # never depends on configuration.
    if not candidates or not ANTHROPIC_API_KEY:
        return candidates

    numbered = "\n".join(f"{i}. {u}" for i, u in enumerate(candidates, 1))
    user_text = (
        f"Seite: {page_url}\n\n"
        "Kandidaten-URLs (nummeriert):\n"
        + numbered
    )
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 1500,
        "system": SYSTEM_PROMPT,
        "tools": [{
            "name": TOOL_NAME,
            "description": "Persist the selected flat-photo URLs.",
            "input_schema": TOOL_SCHEMA,
        }],
        # Force the tool call so the reply is structured, never prose.
        "tool_choice": {"type": "tool", "name": TOOL_NAME},
        "messages": [{"role": "user", "content": user_text}],
    }
    try:
        r = requests.post(
            API_URL,
            headers={
                "x-api-key": ANTHROPIC_API_KEY,
                "anthropic-version": API_VERSION,
                "content-type": "application/json",
            },
            json=body,
            timeout=timeout,
        )
    except requests.RequestException as e:
        logger.warning("anthropic image-select request failed: %s", e)
        return candidates
    if r.status_code >= 400:
        logger.warning("anthropic image-select %s: %s", r.status_code, r.text[:300])
        return candidates

    # A 2xx response can still carry a non-JSON body (proxy error page,
    # truncated reply). The decode errors raised by `Response.json()` are
    # ValueError subclasses; treat them like any other failure.
    try:
        data = r.json()
    except ValueError as e:
        logger.warning("anthropic image-select returned a non-JSON body: %s", e)
        return candidates

    # Validate the payload shape instead of assuming it, so a malformed
    # response degrades to the pass-through path rather than raising.
    content = data.get("content") if isinstance(data, dict) else None
    if not isinstance(content, list):
        return candidates

    allowed = set(candidates)
    for block in content:
        if not isinstance(block, dict):
            continue
        if block.get("type") != "tool_use" or block.get("name") != TOOL_NAME:
            continue

        tool_input = block.get("input")
        urls = tool_input.get("urls") if isinstance(tool_input, dict) else None
        if not isinstance(urls, list):
            urls = []

        # Constrain to the original candidate set so the model can't
        # invent URLs (it sometimes lightly rewrites them otherwise).
        # The isinstance guard also keeps unhashable junk entries from
        # raising in the set lookup.
        kept = [u for u in urls if isinstance(u, str) and u in allowed]
        if not kept:
            # The model rejected everything — usually because the URLs
            # are opaque CDN hashes it couldn't identify positively.
            # Trust the Playwright pre-filter instead of returning [].
            logger.warning("image-select returned 0 of %d — falling back to unfiltered list",
                           len(candidates))
            return candidates
        return kept

    # No matching tool call in the response: pass the input through.
    return candidates
|