enrichment: capture failure cause + admin retry button
Each enrichment failure now records {"_error": "...", "_step": "..."} into
enrichment_json, mirrors the message into the errors log (visible in
/logs/protokoll), and the list shows the cause as a tooltip on the
"Fehler beim Abrufen der Infos" text. Admins also get an "erneut versuchen"
link per failed row that re-queues just that flat (POST /actions/enrich-flat).
The pipeline raises a typed EnrichmentError per step (fetch / llm / crash)
so future failure modes don't get swallowed as a silent "failed".
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
e0ac869425
commit
a8f698bf5e
3 changed files with 79 additions and 24 deletions
15
web/app.py
15
web/app.py
|
|
@ -1045,6 +1045,21 @@ async def action_enrich_all(
|
||||||
return _wohnungen_partial_or_redirect(request, admin)
|
return _wohnungen_partial_or_redirect(request, admin)
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/actions/enrich-flat")
async def action_enrich_flat(
    request: Request,
    flat_id: str = Form(...),
    csrf: str = Form(...),
    admin=Depends(require_admin),
):
    """Admin action: re-queue enrichment for a single flat.

    Passes the admin id and the submitted CSRF value to ``require_csrf``,
    sets the flat's enrichment status back to "pending", kicks off a new
    enrichment run for it, and writes an "enrichment.retry" audit entry.

    Returns whatever ``_wohnungen_partial_or_redirect`` produces for this
    request and admin.
    """
    admin_id = admin["id"]
    require_csrf(admin_id, csrf)

    # Mark the row as pending before the background run is started.
    db.set_flat_enrichment(flat_id, "pending")
    enrichment.kick(flat_id)

    db.log_audit(
        admin["username"],
        "enrichment.retry",
        f"flat={flat_id}",
        user_id=admin_id,
        ip=client_ip(request),
    )
    return _wohnungen_partial_or_redirect(request, admin)
|
||||||
|
|
||||||
|
|
||||||
@app.post("/actions/users/delete")
|
@app.post("/actions/users/delete")
|
||||||
async def action_users_delete(
|
async def action_users_delete(
|
||||||
request: Request,
|
request: Request,
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,8 @@ For each new flat we:
|
||||||
4. Persist result on the flat row (enrichment_json + image_count + status)
|
4. Persist result on the flat row (enrichment_json + image_count + status)
|
||||||
|
|
||||||
Kicked as a detached asyncio task from /internal/flats so scraping stays fast.
|
Kicked as a detached asyncio task from /internal/flats so scraping stays fast.
|
||||||
A small queue cap + per-call lock would be next steps if we ever need them.
|
Every failure is caught, stashed in enrichment_json as {"_error": "...", ...}
|
||||||
|
and mirrored into the errors log so /logs/protokoll explains what went wrong.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
@ -17,7 +18,6 @@ import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
@ -37,6 +37,14 @@ MAX_IMAGE_BYTES = 3_000_000 # 3 MB per image
|
||||||
IMAGE_TIMEOUT = 15
|
IMAGE_TIMEOUT = 15
|
||||||
|
|
||||||
|
|
||||||
|
class EnrichmentError(Exception):
    """Raised by each pipeline step with a human-readable reason.

    The exception message is formatted as "<step>: <reason>".

    Args:
        step: Name of the pipeline step that failed.
        reason: Human-readable description of what went wrong.
    """

    def __init__(self, step: str, reason: str):
        # Kept as separate attributes so handlers can store them individually.
        self.step = step
        self.reason = reason
        super().__init__(f"{step}: {reason}")

    def __reduce__(self):
        # The default reduction replays self.args (the single formatted
        # message) into __init__, which requires two arguments, so copying
        # or unpickling would raise TypeError. Rebuild from the real inputs.
        return (type(self), (self.step, self.reason))
|
||||||
|
|
||||||
|
|
||||||
def flat_slug(flat_id: str) -> str:
|
def flat_slug(flat_id: str) -> str:
|
||||||
"""Filesystem-safe short identifier for a flat (IDs are URLs)."""
|
"""Filesystem-safe short identifier for a flat (IDs are URLs)."""
|
||||||
return hashlib.sha1(flat_id.encode("utf-8")).hexdigest()[:16]
|
return hashlib.sha1(flat_id.encode("utf-8")).hexdigest()[:16]
|
||||||
|
|
@ -48,7 +56,9 @@ def flat_image_dir(flat_id: str) -> Path:
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
def _fetch_listing(url: str) -> Optional[dict]:
|
def _fetch_listing(url: str) -> dict:
|
||||||
|
if not INTERNAL_API_KEY:
|
||||||
|
raise EnrichmentError("fetch", "INTERNAL_API_KEY not configured in web env")
|
||||||
try:
|
try:
|
||||||
r = requests.post(
|
r = requests.post(
|
||||||
APPLY_FETCH_URL,
|
APPLY_FETCH_URL,
|
||||||
|
|
@ -57,12 +67,14 @@ def _fetch_listing(url: str) -> Optional[dict]:
|
||||||
timeout=90,
|
timeout=90,
|
||||||
)
|
)
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
logger.warning("fetch-listing request failed for %s: %s", url, e)
|
raise EnrichmentError("fetch", f"apply unreachable: {e}")
|
||||||
return None
|
|
||||||
if r.status_code >= 400:
|
if r.status_code >= 400:
|
||||||
logger.warning("fetch-listing %s: %s", r.status_code, r.text[:300])
|
snippet = (r.text or "")[:200].replace("\n", " ")
|
||||||
return None
|
raise EnrichmentError("fetch", f"apply returned HTTP {r.status_code}: {snippet}")
|
||||||
|
try:
|
||||||
return r.json()
|
return r.json()
|
||||||
|
except ValueError:
|
||||||
|
raise EnrichmentError("fetch", "apply returned non-JSON response")
|
||||||
|
|
||||||
|
|
||||||
def _ext_from_response(resp: requests.Response, url: str) -> str:
|
def _ext_from_response(resp: requests.Response, url: str) -> str:
|
||||||
|
|
@ -78,7 +90,6 @@ def _ext_from_response(resp: requests.Response, url: str) -> str:
|
||||||
|
|
||||||
def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
|
def _download_images(flat_id: str, urls: list[str], referer: str) -> int:
|
||||||
d = flat_image_dir(flat_id)
|
d = flat_image_dir(flat_id)
|
||||||
# Clear any previous attempts so re-enrichment doesn't pile up dupes.
|
|
||||||
for old in d.iterdir():
|
for old in d.iterdir():
|
||||||
try: old.unlink()
|
try: old.unlink()
|
||||||
except OSError: pass
|
except OSError: pass
|
||||||
|
|
@ -126,24 +137,47 @@ def enrich_flat_sync(flat_id: str) -> None:
|
||||||
return
|
return
|
||||||
url = flat["link"]
|
url = flat["link"]
|
||||||
logger.info("enrich start flat=%s url=%s", flat_id, url)
|
logger.info("enrich start flat=%s url=%s", flat_id, url)
|
||||||
listing = _fetch_listing(url)
|
|
||||||
if not listing:
|
|
||||||
db.set_flat_enrichment(flat_id, "failed")
|
|
||||||
return
|
|
||||||
|
|
||||||
details = llm.extract_flat_details(listing.get("html") or "",
|
try:
|
||||||
listing.get("final_url") or url)
|
listing = _fetch_listing(url)
|
||||||
|
html = listing.get("html") or ""
|
||||||
|
final_url = listing.get("final_url") or url
|
||||||
|
if not html.strip():
|
||||||
|
raise EnrichmentError("fetch", "apply returned empty HTML")
|
||||||
|
|
||||||
|
details = llm.extract_flat_details(html, final_url)
|
||||||
if details is None:
|
if details is None:
|
||||||
db.set_flat_enrichment(flat_id, "failed")
|
raise EnrichmentError("llm", "model returned no tool_use or call failed (see web logs)")
|
||||||
return
|
|
||||||
|
|
||||||
image_urls = listing.get("image_urls") or []
|
image_urls = listing.get("image_urls") or []
|
||||||
image_count = _download_images(flat_id, image_urls, referer=url)
|
image_count = _download_images(flat_id, image_urls, referer=url)
|
||||||
|
except EnrichmentError as e:
|
||||||
|
_record_failure(flat_id, e.step, e.reason)
|
||||||
|
return
|
||||||
|
except Exception as e:
|
||||||
|
logger.exception("enrich crashed flat=%s", flat_id)
|
||||||
|
_record_failure(flat_id, "crash", f"{type(e).__name__}: {e}")
|
||||||
|
return
|
||||||
|
|
||||||
db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count)
|
db.set_flat_enrichment(flat_id, "ok", enrichment=details, image_count=image_count)
|
||||||
logger.info("enrich done flat=%s images=%d", flat_id, image_count)
|
logger.info("enrich done flat=%s images=%d", flat_id, image_count)
|
||||||
|
|
||||||
|
|
||||||
|
def _record_failure(flat_id: str, step: str, reason: str) -> None:
    """Persist and report a failed enrichment attempt for one flat.

    Logs a warning, sets the flat's enrichment status to "failed" with the
    cause stored as {"_error": reason, "_step": step}, and mirrors the
    message into the errors log through ``db.log_error``.

    Args:
        flat_id: Identifier of the flat whose enrichment failed.
        step: Pipeline step that failed; used in the stored payload and as
            the suffix of the error kind ("enrich_<step>").
        reason: Human-readable cause of the failure.
    """
    logger.warning("enrich failed flat=%s step=%s: %s", flat_id, step, reason)
    db.set_flat_enrichment(
        flat_id, "failed",
        enrichment={"_error": reason, "_step": step},
    )
    try:
        db.log_error(
            source="enrichment", kind=f"enrich_{step}",
            summary=f"flat={flat_id}: {reason}",
        )
    except Exception:
        # Mirroring is best-effort: the cause is already stored on the flat
        # row, so never raise from here. Leave a trace in the process log
        # instead of discarding the secondary error silently.
        logger.exception(
            "could not mirror enrichment failure to errors log flat=%s", flat_id
        )
|
||||||
|
|
||||||
|
|
||||||
def kick(flat_id: str) -> None:
|
def kick(flat_id: str) -> None:
|
||||||
"""Fire-and-forget enrichment in a background thread."""
|
"""Fire-and-forget enrichment in a background thread."""
|
||||||
asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id))
|
asyncio.create_task(asyncio.to_thread(enrich_flat_sync, flat_id))
|
||||||
|
|
@ -160,9 +194,6 @@ async def _backfill_runner() -> None:
|
||||||
|
|
||||||
|
|
||||||
def kick_backfill() -> int:
|
def kick_backfill() -> int:
|
||||||
"""Queue enrichment for every flat still pending/failed. Returns how many
|
|
||||||
flats are queued; the actual work happens in a detached task so the admin
|
|
||||||
UI doesn't block for minutes."""
|
|
||||||
pending = db.flats_needing_enrichment(limit=200)
|
pending = db.flats_needing_enrichment(limit=200)
|
||||||
asyncio.create_task(_backfill_runner())
|
asyncio.create_task(_backfill_runner())
|
||||||
return len(pending)
|
return len(pending)
|
||||||
|
|
|
||||||
|
|
@ -142,7 +142,16 @@
|
||||||
Infos werden abgerufen…
|
Infos werden abgerufen…
|
||||||
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
||||||
{% elif f.enrichment_status == 'failed' %}
|
{% elif f.enrichment_status == 'failed' %}
|
||||||
Fehler beim Abrufen der Infos
|
{% set err = (item.enrichment or {}).get('_error') or 'unbekannt' %}
|
||||||
|
<span title="{{ err }}">Fehler beim Abrufen der Infos</span>
|
||||||
|
{% if is_admin %}
|
||||||
|
<form method="post" action="/actions/enrich-flat" class="inline"
|
||||||
|
hx-post="/actions/enrich-flat" hx-target="#wohnungen-body" hx-swap="outerHTML">
|
||||||
|
<input type="hidden" name="csrf" value="{{ csrf }}">
|
||||||
|
<input type="hidden" name="flat_id" value="{{ f.id }}">
|
||||||
|
<button type="submit" class="underline text-slate-600 hover:text-slate-900 ml-1">erneut versuchen</button>
|
||||||
|
</form>
|
||||||
|
{% endif %}
|
||||||
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
· <span data-rel-utc="{{ f.discovered_at|iso_utc }}" title="{{ f.discovered_at|de_dt }}">…</span>
|
||||||
{% else %}
|
{% else %}
|
||||||
{% set e = item.enrichment or {} %}
|
{% set e = item.enrichment or {} %}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue