enrichment: Haiku flat details + image gallery on expand
apply service
- POST /internal/fetch-listing: headless Playwright fetch of a listing URL,
returns {html, image_urls[], final_url}. Uses the same browser
fingerprint/profile as the apply run so bot guards don't kick in
web service
- New enrichment pipeline (web/enrichment.py):
/internal/flats → upsert → kick() enrichment in a background thread
1. POST /internal/fetch-listing on apply
2. llm.extract_flat_details(html, url) — Haiku tool-use call returns
structured JSON (address, rooms, rent, description, pros/cons, etc.)
3. Download each image directly to /data/flats/<slug>/NN.<ext>
4. Persist enrichment_json + image_count + enrichment_status on the flat
- llm.py: minimal Anthropic /v1/messages wrapper, no SDK
- DB migration v5 adds enrichment_json/_status/_updated_at + image_count
- Admin "Altbestand anreichern" button (POST /actions/enrich-all) queues
backfill for all pending/failed rows; runs in a detached task
- GET /partials/wohnung/<id> renders _wohnung_detail.html
- GET /flat-images/<slug>/<n> serves the downloaded image
UI
- Chevron on each list row toggles an inline detail pane (HTMX fetch on
first open, hx-preserve keeps it open across the 3–30 s polls)
- CSS .flat-gallery normalises image tiles to a 4/3 aspect with object-fit:
cover so different source sizes align cleanly
- "analysiert…" / "?" chips on the list reflect enrichment_status
Config
- ANTHROPIC_API_KEY + ANTHROPIC_MODEL wired into docker-compose's web
service (default model: claude-haiku-4-5-20251001)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2609d3504a
commit
eb66284172
11 changed files with 688 additions and 44 deletions
|
|
@ -1,8 +1,9 @@
|
|||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from fastapi import Depends, FastAPI, Header, HTTPException, status
|
||||
from playwright.async_api import ViewportSize, async_playwright
|
||||
from pydantic import BaseModel, Field
|
||||
from rich.console import Console
|
||||
from rich.logging import RichHandler
|
||||
|
|
@ -13,7 +14,7 @@ from classes.application_result import ApplicationResult
|
|||
from classes.profile import Profile
|
||||
from language import _
|
||||
from providers._provider import ApplyContext
|
||||
from settings import INTERNAL_API_KEY
|
||||
from settings import BROWSER_HEIGHT, BROWSER_LOCALE, BROWSER_WIDTH, HEADLESS, INTERNAL_API_KEY
|
||||
|
||||
|
||||
def setup_logging():
|
||||
|
|
@ -125,3 +126,93 @@ async def apply(req: ApplyRequest):
|
|||
application_id=req.application_id,
|
||||
forensics=recorder.to_json(),
|
||||
)
|
||||
|
||||
|
||||
class FetchListingRequest(BaseModel):
    # Request body for POST /internal/fetch-listing.
    # url: the flat-listing page to load in the headless browser.
    url: str
|
||||
|
||||
|
||||
class FetchListingResponse(BaseModel):
    # Response body for POST /internal/fetch-listing.
    final_url: str          # page.url after navigation (i.e. post-redirect)
    html: str               # page HTML, truncated to MAX_FETCH_HTML_BYTES
    image_urls: list[str]   # absolute, deduped image URLs, capped at MAX_FETCH_IMAGES
|
||||
|
||||
|
||||
# Cap on the HTML returned per listing (applied by slicing the page content);
# presumably sized to keep the downstream LLM prompt bounded — see the
# enrichment pipeline in the web service.
MAX_FETCH_HTML_BYTES = 400_000
# Cap on the number of image URLs returned per listing.
MAX_FETCH_IMAGES = 30
|
||||
|
||||
|
||||
@app.post(
    "/internal/fetch-listing",
    response_model=FetchListingResponse,
    dependencies=[Depends(require_api_key)],
)
async def fetch_listing(req: FetchListingRequest):
    """Headless Playwright fetch of a flat listing.

    Returns the page HTML (truncated to MAX_FETCH_HTML_BYTES) plus absolute
    image URLs (capped at MAX_FETCH_IMAGES). Used by the web service's LLM
    enrichment pipeline so we look like a real browser and don't get bounced
    by bot guards.

    Raises:
        HTTPException(400): empty URL, or a non-http(s) scheme.
    """
    url = req.url.strip()
    if not url:
        raise HTTPException(400, "url required")
    # Defense in depth: the endpoint is API-key protected, but still refuse
    # non-web schemes (file://, javascript:, ...) so the headless browser can
    # never be pointed at local resources.
    if urlparse(url).scheme not in ("http", "https"):
        raise HTTPException(400, "url must be http(s)")
    logger.info("fetch-listing url=%s", url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=HEADLESS,
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = await browser.new_context(
                viewport=ViewportSize({"width": BROWSER_WIDTH, "height": BROWSER_HEIGHT}),
                locale=BROWSER_LOCALE,
            )
            page = await context.new_page()
            await page.goto(url, timeout=30_000)
            try:
                # Best effort: some listing pages never reach network-idle
                # (trackers, long-polling) — proceed with whatever loaded.
                await page.wait_for_load_state("networkidle", timeout=10_000)
            except Exception:
                pass
            final_url = page.url
            html = await page.content()
            # Collect image candidates: <img src> + <img data-src> + srcset first URL.
            raw_imgs: list[str] = await page.evaluate(
                """() => {
                const out = [];
                document.querySelectorAll('img').forEach((img) => {
                    if (img.src) out.push(img.src);
                    const ds = img.getAttribute('data-src');
                    if (ds) out.push(ds);
                    const ss = img.getAttribute('srcset');
                    if (ss) {
                        const first = ss.split(',')[0].trim().split(' ')[0];
                        if (first) out.push(first);
                    }
                });
                return out;
                }"""
            )
        finally:
            await browser.close()

    return FetchListingResponse(
        final_url=final_url,
        html=html[:MAX_FETCH_HTML_BYTES],
        image_urls=_filter_image_urls(raw_imgs, final_url),
    )


def _filter_image_urls(raw_imgs: list[str], base_url: str) -> list[str]:
    """Absolutize candidate image URLs against *base_url*, dedupe while
    preserving order, and drop data: URIs and obvious page chrome
    (logo/favicon/sprite/icon/.svg). Result is capped at MAX_FETCH_IMAGES."""
    seen: set[str] = set()
    image_urls: list[str] = []
    for u in raw_imgs:
        if not u or u.startswith("data:"):
            continue
        absu = urljoin(base_url, u)
        if absu in seen:
            continue
        seen.add(absu)
        lower = absu.lower()
        if any(x in lower for x in ("logo", "favicon", "sprite", "icon", ".svg")):
            continue
        image_urls.append(absu)
        if len(image_urls) >= MAX_FETCH_IMAGES:
            break
    return image_urls
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue