lazyflat/web/llm.py
EiSiMo 3bb04210c4 secrets tab, drop commute filter, favicon, robust error reports
1. Admin → Geheimnisse sub-tab lets you edit ANTHROPIC_API_KEY +
   BERLIN_WOHNEN_USERNAME/PASSWORD at runtime. Migration v7 adds a
   secrets(key,value,updated_at) table; startup seeds missing keys from
   env (idempotent). web reads secrets DB-first (env fallback) via
   llm._api_key(); alert fetches them from web /internal/secrets on each
   scan, passes them into Scraper(). Rotating creds no longer needs a
   redeploy.
   Masked display: 6 leading + 4 trailing chars, "…" in the middle.
   Blank form fields leave the stored value untouched.

2. Drop the max_morning_commute filter from UI + server + FILTER_KEYS +
   filter summary (the underlying Maps.calculate_score code stays for
   potential future re-enable).

3. /static/didi.webp wired as favicon via <link rel="icon"> in base.html.

4. apply.open_page wraps page.goto in try/except so a failed load still
   produces a "goto.failed" step + screenshot instead of returning an
   empty forensics blob. networkidle + post-submission sleep are also
   made best-effort. The error ZIP export already writes screenshot+HTML
   per step and final_html — with this change every apply run leaves a
   reconstructable trail even when the listing is already offline.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 17:56:57 +02:00

124 lines
4.4 KiB
Python

"""Anthropic Haiku helper — used only to pick which `<img>` URLs on a
listing page are actual photos of the flat (vs. nav icons, badges, ads…).
If the API key is missing or the call fails, the caller passes the original
candidates straight through, so this is a soft enhancement, not a
dependency.
"""
from __future__ import annotations
import logging
from typing import Optional
import requests
import db
from settings import ANTHROPIC_API_KEY, ANTHROPIC_MODEL
logger = logging.getLogger("web.llm")
def _api_key() -> str:
    """Resolve the Anthropic API key, preferring the runtime-editable secret.

    The admin UI can rotate the key in the secrets table without a redeploy;
    the env-derived setting remains the fallback so fresh deployments work
    before any secret has been saved (or if the DB lookup fails).
    """
    try:
        stored = db.get_secret("ANTHROPIC_API_KEY")
    except Exception:
        # Best-effort: any DB problem just means "use the env value".
        stored = None
    return stored or ANTHROPIC_API_KEY
# Anthropic Messages API endpoint and the pinned `anthropic-version` header.
API_URL = "https://api.anthropic.com/v1/messages"
API_VERSION = "2023-06-01"

# The model is forced to answer via this tool (see "tool_choice" below),
# so we never have to parse free-form text out of the response.
TOOL_NAME = "select_flat_images"

# Input schema for the tool call: the model hands back the subset of
# candidate URLs it considers genuine flat photos.
TOOL_SCHEMA = {
    "type": "object",
    "properties": {
        "urls": {
            "type": "array",
            "items": {"type": "string"},
            "description": "Subset of the candidate URLs that show the actual flat — "
            "interior, exterior, floorplan. Keep ordering of input.",
        },
    },
    "required": ["urls"],
    "additionalProperties": False,
}

# System prompt is in German (the listings are German). It tells the model
# to keep only real flat photos and — importantly — NOT to reject images
# just because housing-association CDNs serve them under opaque filenames
# from media paths like /fileadmin/_processed_/.
SYSTEM_PROMPT = (
    "Du bekommst eine Liste von Bild-URLs einer Wohnungsanzeige. Wähle nur "
    "die URLs aus, die ein Foto der Wohnung zeigen (Innenraum, Außenansicht, "
    "Grundriss). Verwerfe Logos, Icons, Banner, Ads, Bewertungs-Sterne, "
    "Karten/Stadtpläne, Mitarbeiter-Portraits, Tracking-Pixel.\n\n"
    "WICHTIG: Viele Wohnungsbaugesellschaften liefern ihre Fotos über ein "
    "CDN mit kryptischen Dateinamen aus (z.B. "
    "fileadmin/_processed_/2/3/xcsm_abc123def.webp.pagespeed.ic.xyz.webp). "
    "Solche URLs sind IMMER zu behalten, wenn sie vom selben Host wie die "
    "Anzeige kommen und in /fileadmin/_processed_/ oder /assets/media/ "
    "oder ähnlichen Media-Pfaden liegen. Lehne nichts nur wegen eines "
    "unleserlichen Dateinamens ab.\n\n"
    "Behalte die Reihenfolge der Input-Liste bei. Antworte ausschließlich "
    "über den Tool-Call."
)
def select_flat_image_urls(candidates: list[str], page_url: str,
                           timeout: int = 30) -> list[str]:
    """Ask the LLM which candidate image URLs are real photos of the flat.

    Args:
        candidates: image URLs scraped from the listing page, in page order.
        page_url: the listing page URL, passed to the model as context.
        timeout: HTTP timeout (seconds) for the Anthropic API call.

    Returns:
        The model-selected subset of ``candidates``, or the original list
        unchanged on ANY failure: no API key, empty input, network error,
        HTTP error, malformed response body, or an empty selection.
    """
    key = _api_key()
    if not key or not candidates:
        # Soft dependency: no key configured (or nothing to filter) means
        # we simply pass the candidates through.
        return candidates

    user_text = (
        f"Seite: {page_url}\n\n"
        "Kandidaten-URLs (nummeriert):\n"
        + "\n".join(f"{i+1}. {u}" for i, u in enumerate(candidates))
    )
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 1500,
        "system": SYSTEM_PROMPT,
        "tools": [{
            "name": TOOL_NAME,
            "description": "Persist the selected flat-photo URLs.",
            "input_schema": TOOL_SCHEMA,
        }],
        # Force the tool call so the answer is always structured.
        "tool_choice": {"type": "tool", "name": TOOL_NAME},
        "messages": [{"role": "user", "content": user_text}],
    }
    try:
        r = requests.post(
            API_URL,
            headers={
                "x-api-key": key,
                "anthropic-version": API_VERSION,
                "content-type": "application/json",
            },
            json=body,
            timeout=timeout,
        )
    except requests.RequestException as e:
        logger.warning("anthropic image-select request failed: %s", e)
        return candidates
    if r.status_code >= 400:
        logger.warning("anthropic image-select %s: %s", r.status_code, r.text[:300])
        return candidates
    # BUGFIX: a 2xx response with a non-JSON body used to raise ValueError
    # out of this function, breaking the "original list on any failure"
    # contract (requests.JSONDecodeError subclasses ValueError).
    try:
        data = r.json()
    except ValueError as e:
        logger.warning("anthropic image-select returned invalid JSON: %s", e)
        return candidates
    if not isinstance(data, dict):
        # Defensive: an unexpected top-level type (e.g. a JSON array) would
        # otherwise crash the .get() calls below.
        logger.warning("anthropic image-select: unexpected response shape")
        return candidates
    for block in data.get("content", []):
        if block.get("type") == "tool_use" and block.get("name") == TOOL_NAME:
            urls = (block.get("input") or {}).get("urls") or []
            # Constrain to the original candidate set so the model can't
            # invent URLs (it sometimes lightly rewrites them otherwise).
            allowed = set(candidates)
            kept = [u for u in urls if u in allowed]
            if not kept and candidates:
                # The model rejected everything — usually because the URLs
                # are opaque CDN hashes it couldn't identify positively.
                # Trust the Playwright pre-filter instead of returning [].
                logger.warning(
                    "image-select returned 0 of %d — falling back to unfiltered list",
                    len(candidates))
                return candidates
            return kept
    return candidates