From a8f698bf5e2ca302b331e86649b43d94f6f3c0c1 Mon Sep 17 00:00:00 2001 From: EiSiMo Date: Tue, 21 Apr 2026 15:05:39 +0200 Subject: [PATCH] enrichment: capture failure cause + admin retry button Each enrichment failure now records {"_error": "...", "_step": "..."} into enrichment_json, mirrors the message into the errors log (visible in /logs/protokoll), and the list shows the cause as a tooltip on the "Fehler beim Abrufen der Infos" text. Admins also get an "erneut versuchen" link per failed row that re-queues just that flat (POST /actions/enrich-flat). The pipeline raises a typed EnrichmentError per step (fetch / llm / crash) so future failure modes don't get swallowed as a silent "failed". Co-Authored-By: Claude Opus 4.7 (1M context) --- web/app.py | 15 ++++++ web/enrichment.py | 77 +++++++++++++++++++++--------- web/templates/_wohnungen_body.html | 11 ++++- 3 files changed, 79 insertions(+), 24 deletions(-) diff --git a/web/app.py b/web/app.py index 64f4f88..2999224 100644 --- a/web/app.py +++ b/web/app.py @@ -1045,6 +1045,21 @@ async def action_enrich_all( return _wohnungen_partial_or_redirect(request, admin) +@app.post("/actions/enrich-flat") +async def action_enrich_flat( + request: Request, + flat_id: str = Form(...), + csrf: str = Form(...), + admin=Depends(require_admin), +): + require_csrf(admin["id"], csrf) + db.set_flat_enrichment(flat_id, "pending") + enrichment.kick(flat_id) + db.log_audit(admin["username"], "enrichment.retry", + f"flat={flat_id}", user_id=admin["id"], ip=client_ip(request)) + return _wohnungen_partial_or_redirect(request, admin) + + @app.post("/actions/users/delete") async def action_users_delete( request: Request, diff --git a/web/enrichment.py b/web/enrichment.py index 50e8f33..56f7f30 100644 --- a/web/enrichment.py +++ b/web/enrichment.py @@ -7,7 +7,8 @@ For each new flat we: 4. Persist result on the flat row (enrichment_json + image_count + status) Kicked as a detached asyncio task from /internal/flats so scraping stays fast. 
-A small queue cap + per-call lock would be next steps if we ever need them. +Every failure is caught, stashed in enrichment_json as {"_error": "...", ...} +and mirrored into the errors log so /logs/protokoll explains what went wrong. """ from __future__ import annotations @@ -17,7 +18,6 @@ import logging import mimetypes import os from pathlib import Path -from typing import Optional from urllib.parse import urlparse import requests @@ -37,6 +37,14 @@ MAX_IMAGE_BYTES = 3_000_000 # 3 MB per image IMAGE_TIMEOUT = 15 +class EnrichmentError(Exception): + """Raised by each pipeline step with a human-readable reason.""" + def __init__(self, step: str, reason: str): + self.step = step + self.reason = reason + super().__init__(f"{step}: {reason}") + + def flat_slug(flat_id: str) -> str: """Filesystem-safe short identifier for a flat (IDs are URLs).""" return hashlib.sha1(flat_id.encode("utf-8")).hexdigest()[:16] @@ -48,7 +56,9 @@ def flat_image_dir(flat_id: str) -> Path: return d -def _fetch_listing(url: str) -> Optional[dict]: +def _fetch_listing(url: str) -> dict: + if not INTERNAL_API_KEY: + raise EnrichmentError("fetch", "INTERNAL_API_KEY not configured in web env") try: r = requests.post( APPLY_FETCH_URL, @@ -57,12 +67,14 @@ def _fetch_listing(url: str) -> Optional[dict]: timeout=90, ) except requests.RequestException as e: - logger.warning("fetch-listing request failed for %s: %s", url, e) - return None + raise EnrichmentError("fetch", f"apply unreachable: {e}") if r.status_code >= 400: - logger.warning("fetch-listing %s: %s", r.status_code, r.text[:300]) - return None - return r.json() + snippet = (r.text or "")[:200].replace("\n", " ") + raise EnrichmentError("fetch", f"apply returned HTTP {r.status_code}: {snippet}") + try: + return r.json() + except ValueError: + raise EnrichmentError("fetch", "apply returned non-JSON response") def _ext_from_response(resp: requests.Response, url: str) -> str: @@ -78,7 +90,6 @@ def _ext_from_response(resp: requests.Response, 
url: str) -> str: def _download_images(flat_id: str, urls: list[str], referer: str) -> int: d = flat_image_dir(flat_id) - # Clear any previous attempts so re-enrichment doesn't pile up dupes. for old in d.iterdir(): try: old.unlink() except OSError: pass @@ -126,24 +137,47 @@ def enrich_flat_sync(flat_id: str) -> None: return url = flat["link"] logger.info("enrich start flat=%s url=%s", flat_id, url) - listing = _fetch_listing(url) - if not listing: - db.set_flat_enrichment(flat_id, "failed") - return - details = llm.extract_flat_details(listing.get("html") or "", - listing.get("final_url") or url) - if details is None: - db.set_flat_enrichment(flat_id, "failed") - return + try: + listing = _fetch_listing(url) + html = listing.get("html") or "" + final_url = listing.get("final_url") or url + if not html.strip(): + raise EnrichmentError("fetch", "apply returned empty HTML") - image_urls = listing.get("image_urls") or [] - image_count = _download_images(flat_id, image_urls, referer=url) + details = llm.extract_flat_details(html, final_url) + if details is None: + raise EnrichmentError("llm", "model returned no tool_use or call failed (see web logs)") + + image_urls = listing.get("image_urls") or [] + image_count = _download_images(flat_id, image_urls, referer=url) + except EnrichmentError as e: + _record_failure(flat_id, e.step, e.reason) + return + except Exception as e: + logger.exception("enrich crashed flat=%s", flat_id) + _record_failure(flat_id, "crash", f"{type(e).__name__}: {e}") + return db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count) logger.info("enrich done flat=%s images=%d", flat_id, image_count) +def _record_failure(flat_id: str, step: str, reason: str) -> None: + logger.warning("enrich failed flat=%s step=%s: %s", flat_id, step, reason) + db.set_flat_enrichment( + flat_id, "failed", + enrichment={"_error": reason, "_step": step}, + ) + try: + db.log_error( + source="enrichment", kind=f"enrich_{step}", + 
summary=f"flat={flat_id}: {reason}", + ) + except Exception: + pass + + def kick(flat_id: str) -> None: """Fire-and-forget enrichment in a background thread.""" asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id)) @@ -160,9 +194,6 @@ async def _backfill_runner() -> None: def kick_backfill() -> int: - """Queue enrichment for every flat still pending/failed. Returns how many - flats are queued; the actual work happens in a detached task so the admin - UI doesn't block for minutes.""" pending = db.flats_needing_enrichment(limit=200) asyncio.create_task(_backfill_runner()) return len(pending) diff --git a/web/templates/_wohnungen_body.html b/web/templates/_wohnungen_body.html index 46c2f15..58eb219 100644 --- a/web/templates/_wohnungen_body.html +++ b/web/templates/_wohnungen_body.html @@ -142,7 +142,16 @@ Infos werden abgerufen… · {% elif f.enrichment_status == 'failed' %} - Fehler beim Abrufen der Infos + {% set err = (item.enrichment or {}).get('_error') or 'unbekannt' %} + Fehler beim Abrufen der Infos + {% if is_admin %} +
+ + + +
+ {% endif %} · {% else %} {% set e = item.enrichment or {} %}