1. Admin → Geheimnisse sub-tab lets you edit ANTHROPIC_API_KEY and BERLIN_WOHNEN_USERNAME/PASSWORD at runtime. Migration v7 adds a secrets(key, value, updated_at) table; startup seeds missing keys from env (idempotent). web reads secrets DB-first (env fallback) via llm._api_key(); alert fetches them from web's /internal/secrets on each scan and passes them into Scraper(). Rotating creds no longer needs a redeploy. Masked display: 6 leading + 4 trailing chars, with "…" in the middle. Blank form fields leave the stored value untouched.
2. Drop the max_morning_commute filter from the UI, the server, FILTER_KEYS, and the filter summary (the underlying Maps.calculate_score code stays for a potential future re-enable).
3. /static/didi.webp is wired up as the favicon via <link rel="icon"> in base.html.
4. apply.open_page wraps page.goto in try/except so a failed load still produces a "goto.failed" step + screenshot instead of returning an empty forensics blob. The networkidle wait and the post-submission sleep are also made best-effort. The error ZIP export already writes screenshot + HTML per step, plus final_html — with this change every apply run leaves a reconstructable trail even when the listing is already offline.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
124 lines
4.4 KiB
Python
124 lines
4.4 KiB
Python
"""Anthropic Haiku helper — used only to pick which `<img>` URLs on a
|
|
listing page are actual photos of the flat (vs. nav icons, badges, ads…).
|
|
|
|
If the API key is missing or the call fails, the caller passes the original
|
|
candidates straight through, so this is a soft enhancement, not a
|
|
dependency.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Optional
|
|
|
|
import requests
|
|
|
|
import db
|
|
from settings import ANTHROPIC_API_KEY, ANTHROPIC_MODEL
|
|
|
|
# Module-level logger, registered under the fixed name "web.llm".
logger = logging.getLogger("web.llm")
|
|
|
|
|
|
def _api_key() -> str:
    """Return the Anthropic API key, preferring the value stored in the DB.

    DB-first so the admin UI can rotate the key without a redeploy. The
    ``ANTHROPIC_API_KEY`` value imported from ``settings`` is used when the
    DB holds no (or an empty) value, or when the lookup itself raises.

    Returns:
        The stored secret if it is truthy, otherwise the settings value.
    """
    try:
        stored = db.get_secret("ANTHROPIC_API_KEY")
    except Exception as e:
        # Deliberately best-effort: a failing secrets lookup must not break
        # the caller. Log it, though, so a broken lookup is distinguishable
        # from "no secret stored".
        logger.warning(
            "ANTHROPIC_API_KEY lookup in secrets DB failed, "
            "falling back to settings value: %s",
            e,
        )
        return ANTHROPIC_API_KEY
    return stored or ANTHROPIC_API_KEY
|
|
|
|
# Anthropic Messages endpoint and the value sent in the
# "anthropic-version" request header.
API_URL = "https://api.anthropic.com/v1/messages"
API_VERSION = "2023-06-01"

# Name of the single tool offered to the model, and the JSON schema of the
# input that tool accepts: an object with exactly one required key, "urls",
# holding a list of strings.
TOOL_NAME = "select_flat_images"
TOOL_SCHEMA = {
    "type": "object",
    "properties": {
        "urls": {
            "type": "array",
            "items": {"type": "string"},
            "description": (
                "Subset of the candidate URLs that show the actual flat — "
                "interior, exterior, floorplan. "
                "Keep ordering of input."
            ),
        },
    },
    "required": ["urls"],
    "additionalProperties": False,
}
|
|
|
|
# System prompt (German, sent verbatim to the model). Three paragraphs
# separated by blank lines:
#   1. the task: keep only real photos of the flat, drop logos/icons/ads/etc.
#   2. a warning not to reject opaque CDN file names on the listing's host
#   3. keep the input order and answer only through the tool call
SYSTEM_PROMPT = "\n\n".join((
    (
        "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. "
        "Wähle nur die URLs aus, die ein Foto der Wohnung zeigen "
        "(Innenraum, Außenansicht, Grundriss). "
        "Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, "
        "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel."
    ),
    (
        "WICHTIG: Viele Wohnungsbaugesellschaften liefern ihre Fotos über "
        "ein CDN mit kryptischen Dateinamen aus (z.B. "
        "fileadmin/_processed_/2/3/xcsm_abc123def.webp.pagespeed.ic.xyz.webp). "
        "Solche URLs sind IMMER zu behalten, wenn sie vom selben Host wie "
        "die Anzeige kommen und in /fileadmin/_processed_/ oder "
        "/assets/media/ oder ähnlichen Media-Pfaden liegen. "
        "Lehne nichts nur wegen eines unleserlichen Dateinamens ab."
    ),
    (
        "Behalte die Reihenfolge der Input-Liste bei. "
        "Antworte ausschließlich über den Tool-Call."
    ),
))
|
|
|
|
|
|
def _selected_urls_from_response(data: object,
                                 candidates: list[str]) -> Optional[list[str]]:
    """Pull the model's URL selection out of a decoded Messages API payload.

    Only the first ``tool_use`` block named ``TOOL_NAME`` is considered.

    Returns:
        The URLs from that block that are also in ``candidates`` (in the
        order the model listed them), or ``None`` when the payload has no
        such block or is not shaped like a dict with a ``content`` list.
    """
    if not isinstance(data, dict):
        return None
    content = data.get("content")
    if not isinstance(content, list):
        return None

    # Constrain to the original candidate set so the model can't invent
    # URLs (it sometimes lightly rewrites them otherwise).
    allowed = set(candidates)
    for block in content:
        if not isinstance(block, dict):
            continue
        if block.get("type") != "tool_use" or block.get("name") != TOOL_NAME:
            continue
        tool_input = block.get("input")
        urls = tool_input.get("urls") if isinstance(tool_input, dict) else None
        if not isinstance(urls, list):
            return []
        # The isinstance guard also keeps unhashable junk away from the
        # set membership test, which would otherwise raise TypeError.
        return [u for u in urls if isinstance(u, str) and u in allowed]
    return None


def select_flat_image_urls(candidates: list[str], page_url: str,
                           timeout: int = 30) -> list[str]:
    """Return the LLM-filtered subset, or the original list on any failure.

    Args:
        candidates: Image URLs found on the listing page.
        page_url: URL of the listing page; included in the prompt.
        timeout: Timeout in seconds handed to ``requests.post``.

    Returns:
        The URLs the model selected, restricted to ``candidates``. The
        unmodified ``candidates`` list is returned instead when it is empty,
        no API key is available, the request fails, the API answers with a
        status >= 400, the body is not valid JSON, the payload carries no
        usable tool call, or the model selected nothing.
    """
    # Nothing to filter: skip the secret lookup and the API round-trip.
    if not candidates:
        return candidates
    key = _api_key()
    if not key:
        return candidates

    user_text = (
        f"Seite: {page_url}\n\n"
        "Kandidaten-URLs (nummeriert):\n"
        + "\n".join(f"{i}. {u}" for i, u in enumerate(candidates, start=1))
    )
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 1500,
        "system": SYSTEM_PROMPT,
        "tools": [{
            "name": TOOL_NAME,
            "description": "Persist the selected flat-photo URLs.",
            "input_schema": TOOL_SCHEMA,
        }],
        # Force the model to answer through the tool rather than free text.
        "tool_choice": {"type": "tool", "name": TOOL_NAME},
        "messages": [{"role": "user", "content": user_text}],
    }
    try:
        r = requests.post(
            API_URL,
            headers={
                "x-api-key": key,
                "anthropic-version": API_VERSION,
                "content-type": "application/json",
            },
            json=body,
            timeout=timeout,
        )
    except requests.RequestException as e:
        logger.warning("anthropic image-select request failed: %s", e)
        return candidates
    if r.status_code >= 400:
        logger.warning("anthropic image-select %s: %s", r.status_code, r.text[:300])
        return candidates

    try:
        data = r.json()
    except ValueError as e:
        # A 2xx response with a non-JSON body must not escape as an
        # exception: the contract is "original list on any failure".
        logger.warning("anthropic image-select returned invalid JSON: %s", e)
        return candidates

    kept = _selected_urls_from_response(data, candidates)
    if kept is None:
        # No usable tool call in the response.
        return candidates
    if not kept:
        # The model rejected everything — usually because the URLs
        # are opaque CDN hashes it couldn't identify positively.
        # Trust the Playwright pre-filter instead of returning [].
        logger.warning("image-select returned 0 of %d — falling back to unfiltered list",
                       len(candidates))
        return candidates
    return kept
|