enrichment: capture failure cause + admin retry button

Each enrichment failure now records {"_error": "...", "_step": "..."} into
enrichment_json, mirrors the message into the errors log (visible in
/logs/protokoll), and the list shows the cause as a tooltip on the
"Fehler beim Abrufen der Infos" text. Admins also get an "erneut versuchen"
link per failed row that re-queues just that flat (POST /actions/enrich-flat).

The pipeline raises a typed EnrichmentError per step (fetch / llm / crash)
so future failure modes don't get swallowed as a silent "failed".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
EiSiMo 2026-04-21 15:05:39 +02:00
parent e0ac869425
commit a8f698bf5e
3 changed files with 79 additions and 24 deletions

View file

@ -1045,6 +1045,21 @@ async def action_enrich_all(
return _wohnungen_partial_or_redirect(request, admin) return _wohnungen_partial_or_redirect(request, admin)
@app.post("/actions/enrich-flat")
async def action_enrich_flat(
    request: Request,
    flat_id: str = Form(...),
    csrf: str = Form(...),
    admin=Depends(require_admin),
):
    """Re-queue enrichment for a single flat.

    Form fields:
        flat_id: identifier of the flat to enrich again.
        csrf: CSRF token, checked against the calling admin's id.

    The flat's enrichment status is reset to "pending", the enrichment
    pipeline is kicked for it, and an "enrichment.retry" audit entry is
    written before the flats view is rendered again through
    `_wohnungen_partial_or_redirect`.
    """
    admin_id = admin["id"]
    require_csrf(admin_id, csrf)

    # Reset the status first so the row is marked as in progress while the
    # background enrichment runs.
    db.set_flat_enrichment(flat_id, "pending")
    enrichment.kick(flat_id)

    db.log_audit(
        admin["username"],
        "enrichment.retry",
        f"flat={flat_id}",
        user_id=admin_id,
        ip=client_ip(request),
    )
    return _wohnungen_partial_or_redirect(request, admin)
@app.post("/actions/users/delete") @app.post("/actions/users/delete")
async def action_users_delete( async def action_users_delete(
request: Request, request: Request,

View file

@ -7,7 +7,8 @@ For each new flat we:
4. Persist result on the flat row (enrichment_json + image_count + status) 4. Persist result on the flat row (enrichment_json + image_count + status)
Kicked as a detached asyncio task from /internal/flats so scraping stays fast. Kicked as a detached asyncio task from /internal/flats so scraping stays fast.
A small queue cap + per-call lock would be next steps if we ever need them. Every failure is caught, stashed in enrichment_json as {"_error": "...", ...}
and mirrored into the errors log so /logs/protokoll explains what went wrong.
""" """
from __future__ import annotations from __future__ import annotations
@ -17,7 +18,6 @@ import logging
import mimetypes import mimetypes
import os import os
from pathlib import Path from pathlib import Path
from typing import Optional
from urllib.parse import urlparse from urllib.parse import urlparse
import requests import requests
@ -37,6 +37,14 @@ MAX_IMAGE_BYTES = 3_000_000 # 3 MB per image
IMAGE_TIMEOUT = 15 IMAGE_TIMEOUT = 15
class EnrichmentError(Exception):
    """Raised by each pipeline step with a human-readable reason.

    The exception message is rendered as "<step>: <reason>"; both parts stay
    available separately as attributes.
    """

    def __init__(self, step: str, reason: str):
        """Store the failing step and its reason.

        Args:
            step: short name of the pipeline step that failed.
            reason: human-readable explanation of the failure.
        """
        # step: name of the pipeline step that raised.
        self.step = step
        # reason: explanation, without the step prefix.
        self.reason = reason
        super().__init__(f"{step}: {reason}")

    def __reduce__(self):
        """Rebuild from (step, reason) so pickle and copy work.

        The default reduction re-invokes the class with ``self.args``, which
        holds only the single formatted message and therefore does not match
        the two-argument constructor.
        """
        return (type(self), (self.step, self.reason))
def flat_slug(flat_id: str) -> str:
    """Return a short, filesystem-safe identifier for a flat.

    Flat IDs are URLs, so the ID is hashed (SHA-1 over its UTF-8 bytes) and
    the first 16 hex characters of the digest are used as the slug.
    """
    digest = hashlib.sha1(flat_id.encode("utf-8"))
    hex_digest = digest.hexdigest()
    return hex_digest[:16]
@ -48,7 +56,9 @@ def flat_image_dir(flat_id: str) -> Path:
return d return d
def _fetch_listing(url: str) -> Optional[dict]: def _fetch_listing(url: str) -> dict:
if not INTERNAL_API_KEY:
raise EnrichmentError("fetch", "INTERNAL_API_KEY not configured in web env")
try: try:
r = requests.post( r = requests.post(
APPLY_FETCH_URL, APPLY_FETCH_URL,
@ -57,12 +67,14 @@ def _fetch_listing(url: str) -> Optional[dict]:
timeout=90, timeout=90,
) )
except requests.RequestException as e: except requests.RequestException as e:
logger.warning("fetch-listing request failed for %s: %s", url, e) raise EnrichmentError("fetch", f"apply unreachable: {e}")
return None
if r.status_code >= 400: if r.status_code >= 400:
logger.warning("fetch-listing %s: %s", r.status_code, r.text[:300]) snippet = (r.text or "")[:200].replace("\n", " ")
return None raise EnrichmentError("fetch", f"apply returned HTTP {r.status_code}: {snippet}")
return r.json() try:
return r.json()
except ValueError:
raise EnrichmentError("fetch", "apply returned non-JSON response")
def _ext_from_response(resp: requests.Response, url: str) -> str: def _ext_from_response(resp: requests.Response, url: str) -> str:
@ -78,7 +90,6 @@ def _ext_from_response(resp: requests.Response, url: str) -> str:
def _download_images(flat_id: str, urls: list[str], referer: str) -> int: def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
d = flat_image_dir(flat_id) d = flat_image_dir(flat_id)
# Clear any previous attempts so re-enrichment doesn't pile up dupes.
for old in d.iterdir(): for old in d.iterdir():
try: old.unlink() try: old.unlink()
except OSError: pass except OSError: pass
@ -126,24 +137,47 @@ def enrich_flat_sync(flat_id: str) -> None:
return return
url = flat["link"] url = flat["link"]
logger.info("enrich start flat=%s url=%s", flat_id, url) logger.info("enrich start flat=%s url=%s", flat_id, url)
listing = _fetch_listing(url)
if not listing:
db.set_flat_enrichment(flat_id, "failed")
return
details = llm.extract_flat_details(listing.get("html") or "", try:
listing.get("final_url") or url) listing = _fetch_listing(url)
if details is None: html = listing.get("html") or ""
db.set_flat_enrichment(flat_id, "failed") final_url = listing.get("final_url") or url
return if not html.strip():
raise EnrichmentError("fetch", "apply returned empty HTML")
image_urls = listing.get("image_urls") or [] details = llm.extract_flat_details(html, final_url)
image_count = _download_images(flat_id, image_urls, referer=url) if details is None:
raise EnrichmentError("llm", "model returned no tool_use or call failed (see web logs)")
image_urls = listing.get("image_urls") or []
image_count = _download_images(flat_id, image_urls, referer=url)
except EnrichmentError as e:
_record_failure(flat_id, e.step, e.reason)
return
except Exception as e:
logger.exception("enrich crashed flat=%s", flat_id)
_record_failure(flat_id, "crash", f"{type(e).__name__}: {e}")
return
db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count) db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count)
logger.info("enrich done flat=%s images=%d", flat_id, image_count) logger.info("enrich done flat=%s images=%d", flat_id, image_count)
def _record_failure(flat_id: str, step: str, reason: str) -> None:
    """Persist an enrichment failure and mirror it into the errors log.

    Args:
        flat_id: identifier of the flat whose enrichment failed.
        step: name of the pipeline step that failed; stored as "_step" and
            used to build the error kind ("enrich_<step>").
        reason: human-readable cause; stored as "_error".

    The flat is marked "failed" with {"_error": reason, "_step": step} as its
    enrichment payload. Writing to the errors log is best-effort: a failure
    there is logged but never propagated.
    """
    logger.warning("enrich failed flat=%s step=%s: %s", flat_id, step, reason)
    db.set_flat_enrichment(
        flat_id, "failed",
        enrichment={"_error": reason, "_step": step},
    )
    try:
        db.log_error(
            source="enrichment", kind=f"enrich_{step}",
            summary=f"flat={flat_id}: {reason}",
        )
    except Exception:
        # The failure is already stored on the flat row, so a broken errors
        # log must not raise here. Leave a traceback in the process log
        # instead of dropping the problem silently.
        logger.exception(
            "could not mirror enrichment failure into errors log flat=%s",
            flat_id,
        )
# Strong references to in-flight enrichment tasks. The event loop only keeps
# weak references to tasks, so a task nobody holds on to may be
# garbage-collected before it finishes.
_KICK_TASKS: set[asyncio.Task] = set()


def kick(flat_id: str) -> None:
    """Fire-and-forget enrichment in a background thread.

    Schedules `enrich_flat_sync(flat_id)` through `asyncio.to_thread` on the
    running event loop and returns immediately. `asyncio.create_task` needs a
    running loop, so this must be called from code running inside one.

    Args:
        flat_id: identifier of the flat to enrich.
    """
    task = asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id))
    _KICK_TASKS.add(task)
    # Drop the reference once the task is done so the set does not grow.
    task.add_done_callback(_KICK_TASKS.discard)
@ -160,9 +194,6 @@ async def _backfill_runner() -> None:
# Strong references to in-flight backfill tasks. The event loop only keeps
# weak references to tasks, so an unreferenced task may be garbage-collected
# before it finishes.
_BACKFILL_TASKS: set[asyncio.Task] = set()


def kick_backfill() -> int:
    """Queue enrichment for every flat still pending/failed.

    The actual work happens in a detached task running `_backfill_runner`,
    so the admin UI doesn't block for minutes.

    Returns:
        How many flats `db.flats_needing_enrichment(limit=200)` reported at
        the time of the call.
    """
    pending = db.flats_needing_enrichment(limit=200)
    task = asyncio.create_task(_backfill_runner())
    _BACKFILL_TASKS.add(task)
    # Drop the reference once the runner is done so the set does not grow.
    task.add_done_callback(_BACKFILL_TASKS.discard)
    return len(pending)

View file

@ -142,7 +142,16 @@
Infos werden abgerufen… Infos werden abgerufen…
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}"></span> · <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}"></span>
{% elif f.enrichment_status == 'failed' %} {% elif f.enrichment_status == 'failed' %}
Fehler beim Abrufen der Infos {% set err = (item.enrichment or {}).get('_error') or 'unbekannt' %}
<span title="{{ err }}">Fehler beim Abrufen der Infos</span>
{% if is_admin %}
<form method="post" action="/actions/enrich-flat" class="inline"
hx-post="/actions/enrich-flat" hx-target="#wohnungen-body" hx-swap="outerHTML">
<input type="hidden" name="csrf" value="{{ csrf }}">
<input type="hidden" name="flat_id" value="{{ f.id }}">
<button type="submit" class="underline text-slate-600 hover:text-slate-900 ml-1">erneut versuchen</button>
</form>
{% endif %}
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}"></span> · <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}"></span>
{% else %} {% else %}
{% set e = item.enrichment or {} %} {% set e = item.enrichment or {} %}