enrichment: Haiku flat details + image gallery on expand

apply service
- POST /internal/fetch-listing: headless Playwright fetch of a listing URL,
  returns {html, image_urls[], final_url}. Uses the same browser
  fingerprint/profile as the apply run so bot guards don't kick in
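A sketch of how the web service's enrichment pipeline might call this route. The apply-service hostname and the API-key header name are assumptions; only the route path, request body, and response fields come from this commit:

```python
import json
import urllib.request

APPLY_BASE = "http://apply:8000"  # assumed compose-internal hostname


def build_fetch_request(url: str, api_key: str) -> urllib.request.Request:
    """Build the POST /internal/fetch-listing request (header name is an assumption)."""
    return urllib.request.Request(
        f"{APPLY_BASE}/internal/fetch-listing",
        data=json.dumps({"url": url}).encode(),
        headers={"X-API-Key": api_key, "Content-Type": "application/json"},
    )


def fetch_listing(url: str, api_key: str) -> dict:
    # Response carries {final_url, html, image_urls[]} per the route's model.
    with urllib.request.urlopen(build_fetch_request(url, api_key), timeout=60) as resp:
        return json.load(resp)
```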

web service
- New enrichment pipeline (web/enrichment.py):
  /internal/flats → upsert → kick() enrichment in a background thread
    1. POST /internal/fetch-listing on the apply service
    2. llm.extract_flat_details(html, url) — Haiku tool-use call returns
       structured JSON (address, rooms, rent, description, pros/cons, etc.)
    3. Download each image directly to /data/flats/<slug>/NN.<ext>
    4. Persist enrichment_json + image_count + enrichment_status on the flat
- llm.py: minimal Anthropic /v1/messages wrapper, no SDK
- DB migration v5 adds enrichment_json/_status/_updated_at + image_count
- Admin "Altbestand anreichern" button (POST /actions/enrich-all) queues
  backfill for all pending/failed rows; runs in a detached task
- GET /partials/wohnung/<id> renders _wohnung_detail.html
- GET /flat-images/<slug>/<n> serves the downloaded image
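The Haiku extraction step (llm.py's SDK-free wrapper plus the tool-use call in step 2) could look roughly like this. The tool name, schema fields, and function signatures are illustrative; only the structured fields listed above and the no-SDK, raw /v1/messages constraint come from the commit:

```python
import json
import urllib.request

ANTHROPIC_API = "https://api.anthropic.com/v1/messages"

# Hypothetical tool schema; forcing a tool call is a standard way to get
# strictly structured JSON back from the Messages API.
FLAT_TOOL = {
    "name": "record_flat_details",
    "description": "Record structured details extracted from a flat listing.",
    "input_schema": {
        "type": "object",
        "properties": {
            "address": {"type": "string"},
            "rooms": {"type": "number"},
            "rent": {"type": "number"},
            "description": {"type": "string"},
            "pros": {"type": "array", "items": {"type": "string"}},
            "cons": {"type": "array", "items": {"type": "string"}},
        },
        "required": ["address"],
    },
}


def build_request(html: str, url: str, model: str) -> dict:
    """Build the /v1/messages payload; tool_choice pins the tool so the
    reply is guaranteed to contain a structured tool_use block."""
    return {
        "model": model,
        "max_tokens": 1024,
        "tools": [FLAT_TOOL],
        "tool_choice": {"type": "tool", "name": FLAT_TOOL["name"]},
        "messages": [
            {
                "role": "user",
                "content": f"Extract the flat details from this listing ({url}):\n\n{html}",
            }
        ],
    }


def extract_flat_details(html: str, url: str, api_key: str, model: str) -> dict:
    req = urllib.request.Request(
        ANTHROPIC_API,
        data=json.dumps(build_request(html, url, model)).encode(),
        headers={
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        },
    )
    with urllib.request.urlopen(req, timeout=60) as resp:
        body = json.load(resp)
    # The forced tool call arrives as a content block of type "tool_use";
    # its "input" is the validated JSON we persist as enrichment_json.
    for block in body["content"]:
        if block["type"] == "tool_use":
            return block["input"]
    raise ValueError("no tool_use block in response")
```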

UI
- Chevron on each list row toggles an inline detail pane (HTMX fetch on
  first open, hx-preserve keeps it open across the 3–30 s polls)
- CSS .flat-gallery normalises image tiles to a 4/3 aspect with object-fit:
  cover so different source sizes align cleanly
- "analysiert…" / "?" chips on the list reflect enrichment_status

Config
- ANTHROPIC_API_KEY + ANTHROPIC_MODEL wired into docker-compose's web
  service (default model: claude-haiku-4-5-20251001)
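Read on the Python side, the wiring could be as small as the following. The settings-module location and the fail-fast helper are assumptions; the env var names and the default model string come from the compose config above:

```python
import os

# Assumed settings.py-style wiring; the default model string matches the
# docker-compose default noted above.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("ANTHROPIC_MODEL", "claude-haiku-4-5-20251001")


def require_llm_config() -> None:
    """Fail fast in the enrichment path rather than on the first LLM call."""
    if not ANTHROPIC_API_KEY:
        raise RuntimeError("ANTHROPIC_API_KEY is not set; enrichment disabled")
```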

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
EiSiMo 2026-04-21 14:46:12 +02:00
parent 2609d3504a
commit eb66284172
11 changed files with 688 additions and 44 deletions


@@ -1,8 +1,9 @@
 import logging
 from contextlib import asynccontextmanager
-from urllib.parse import urlparse
+from urllib.parse import urljoin, urlparse
 from fastapi import Depends, FastAPI, Header, HTTPException, status
 from playwright.async_api import ViewportSize, async_playwright
 from pydantic import BaseModel, Field
 from rich.console import Console
 from rich.logging import RichHandler
@@ -13,7 +14,7 @@ from classes.application_result import ApplicationResult
 from classes.profile import Profile
 from language import _
 from providers._provider import ApplyContext
-from settings import INTERNAL_API_KEY
+from settings import BROWSER_HEIGHT, BROWSER_LOCALE, BROWSER_WIDTH, HEADLESS, INTERNAL_API_KEY


 def setup_logging():
@@ -125,3 +126,93 @@ async def apply(req: ApplyRequest):
         application_id=req.application_id,
         forensics=recorder.to_json(),
     )
+
+
+class FetchListingRequest(BaseModel):
+    url: str
+
+
+class FetchListingResponse(BaseModel):
+    final_url: str
+    html: str
+    image_urls: list[str]
+
+
+MAX_FETCH_HTML_BYTES = 400_000
+MAX_FETCH_IMAGES = 30
+
+
+@app.post(
+    "/internal/fetch-listing",
+    response_model=FetchListingResponse,
+    dependencies=[Depends(require_api_key)],
+)
+async def fetch_listing(req: FetchListingRequest):
+    """Headless Playwright fetch of a flat listing — returns page HTML +
+    absolute image URLs. Used by the web service's LLM enrichment pipeline
+    so we look like a real browser and don't get bounced by bot guards."""
+    url = req.url.strip()
+    if not url:
+        raise HTTPException(400, "url required")
+    logger.info("fetch-listing url=%s", url)
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(
+            headless=HEADLESS,
+            args=["--disable-blink-features=AutomationControlled"],
+        )
+        try:
+            context = await browser.new_context(
+                viewport=ViewportSize({"width": BROWSER_WIDTH, "height": BROWSER_HEIGHT}),
+                locale=BROWSER_LOCALE,
+            )
+            page = await context.new_page()
+            await page.goto(url, timeout=30_000)
+            try:
+                await page.wait_for_load_state("networkidle", timeout=10_000)
+            except Exception:
+                pass
+            final_url = page.url
+            html = await page.content()
+            # Collect image candidates: <img src> + <img data-src> + srcset first URL.
+            raw_imgs: list[str] = await page.evaluate(
+                """() => {
+                    const out = [];
+                    document.querySelectorAll('img').forEach((img) => {
+                        if (img.src) out.push(img.src);
+                        const ds = img.getAttribute('data-src');
+                        if (ds) out.push(ds);
+                        const ss = img.getAttribute('srcset');
+                        if (ss) {
+                            const first = ss.split(',')[0].trim().split(' ')[0];
+                            if (first) out.push(first);
+                        }
+                    });
+                    return out;
+                }"""
+            )
+        finally:
+            await browser.close()
+
+    # Absolutize, dedupe, drop tiny icons/data-uris.
+    seen: set[str] = set()
+    image_urls: list[str] = []
+    for u in raw_imgs:
+        if not u or u.startswith("data:"):
+            continue
+        absu = urljoin(final_url, u)
+        if absu in seen:
+            continue
+        seen.add(absu)
+        lower = absu.lower()
+        if any(x in lower for x in ("logo", "favicon", "sprite", "icon", ".svg")):
+            continue
+        image_urls.append(absu)
+        if len(image_urls) >= MAX_FETCH_IMAGES:
+            break
+
+    return FetchListingResponse(
+        final_url=final_url,
+        html=html[:MAX_FETCH_HTML_BYTES],
+        image_urls=image_urls,
+    )
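The absolutize/dedupe/filter pass at the end of the handler is pure Python and easy to unit-test in isolation; a standalone restatement (the helper name is mine, the logic mirrors the loop above):

```python
from urllib.parse import urljoin

MAX_FETCH_IMAGES = 30
JUNK_MARKERS = ("logo", "favicon", "sprite", "icon", ".svg")


def clean_image_urls(raw_imgs: list[str], final_url: str) -> list[str]:
    """Absolutize candidates against the final URL, dedupe, and drop
    data: URIs and obvious icon/logo assets, capped at MAX_FETCH_IMAGES."""
    seen: set[str] = set()
    image_urls: list[str] = []
    for u in raw_imgs:
        if not u or u.startswith("data:"):
            continue
        absu = urljoin(final_url, u)
        if absu in seen:
            continue
        seen.add(absu)
        if any(x in absu.lower() for x in JUNK_MARKERS):
            continue
        image_urls.append(absu)
        if len(image_urls) >= MAX_FETCH_IMAGES:
            break
    return image_urls
```

For example, `clean_image_urls(["/img/01.jpg", "/img/01.jpg", "https://cdn.example.org/logo.png"], "https://example.org/listing/42")` keeps only the absolutized `https://example.org/img/01.jpg`.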