apply service
- POST /internal/fetch-listing: headless Playwright fetch of a listing URL,
returns {html, image_urls[], final_url}. Uses the same browser
fingerprint/profile as the apply run so bot guards don't kick in
web service
- New enrichment pipeline (web/enrichment.py):
/internal/flats → upsert → kick() enrichment in a background thread
1. POST /internal/fetch-listing on apply
2. llm.extract_flat_details(html, url) — Haiku tool-use call returns
structured JSON (address, rooms, rent, description, pros/cons, etc.)
3. Download each image directly to /data/flats/<slug>/NN.<ext>
4. Persist enrichment_json + image_count + enrichment_status on the flat
- llm.py: minimal Anthropic /v1/messages wrapper, no SDK
- DB migration v5 adds enrichment_json/_status/_updated_at + image_count
- Admin "Altbestand anreichern" button (POST /actions/enrich-all) queues
backfill for all pending/failed rows; runs in a detached task
- GET /partials/wohnung/<id> renders _wohnung_detail.html
- GET /flat-images/<slug>/<n> serves the downloaded image
UI
- Chevron on each list row toggles an inline detail pane (HTMX fetch on
first open, hx-preserve keeps it open across the 3–30 s polls)
- CSS .flat-gallery normalises image tiles to a 4/3 aspect with object-fit:
cover so different source sizes align cleanly
- "analysiert…" / "?" chips on the list reflect enrichment_status
Config
- ANTHROPIC_API_KEY + ANTHROPIC_MODEL wired into docker-compose's web
service (default model: claude-haiku-4-5-20251001)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
218 lines
7.1 KiB
Python
218 lines
7.1 KiB
Python
import logging
|
|
from contextlib import asynccontextmanager
|
|
from urllib.parse import urljoin, urlparse
|
|
|
|
from fastapi import Depends, FastAPI, Header, HTTPException, status
|
|
from playwright.async_api import ViewportSize, async_playwright
|
|
from pydantic import BaseModel, Field
|
|
from rich.console import Console
|
|
from rich.logging import RichHandler
|
|
|
|
import providers
|
|
from actions import Recorder
|
|
from classes.application_result import ApplicationResult
|
|
from classes.profile import Profile
|
|
from language import _
|
|
from providers._provider import ApplyContext
|
|
from settings import BROWSER_HEIGHT, BROWSER_LOCALE, BROWSER_WIDTH, HEADLESS, INTERNAL_API_KEY
|
|
|
|
|
|
def setup_logging():
    """Route root logging through a Rich console handler and set logger levels.

    The root logger is configured at INFO. The service's own ``flat-apply``
    logger is raised to DEBUG, while the ``playwright`` logger is kept at INFO.
    """
    # Fixed-width console; Rich's own time and path columns are switched off.
    console = Console(width=140)
    rich_handler = RichHandler(
        markup=False,
        console=console,
        show_time=False,
        show_path=False,
    )
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-5s %(name)s: %(message)s",
        datefmt="%H:%M:%S",
        handlers=[rich_handler],
    )
    # Per-logger overrides, applied in the same order as before.
    for logger_name, level in (
        ("flat-apply", logging.DEBUG),
        ("playwright", logging.INFO),
    ):
        logging.getLogger(logger_name).setLevel(level)
|
|
|
|
|
|
# Module-wide logger for the apply service.
logger = logging.getLogger("flat-apply")
# Logging is configured once, as an import-time side effect of this module.
setup_logging()
|
|
|
|
|
|
class ProfileModel(BaseModel):
    """Applicant profile carried inside an apply request.

    Every field has a default, so a partial profile validates. The ``apply``
    endpoint dumps this model to a dict and builds a ``Profile`` from it.
    """

    # --- Personal details ---------------------------------------------------
    salutation: str = "Herr"
    firstname: str = ""
    lastname: str = ""
    email: str = ""
    telephone: str = ""

    # --- Postal address -----------------------------------------------------
    street: str = ""
    house_number: str = ""
    postcode: str = ""
    city: str = ""

    # --- WBS data -----------------------------------------------------------
    # NOTE(review): presumably "Wohnberechtigungsschein" details; the exact
    # meaning of each value is defined by the providers — confirm there.
    is_possessing_wbs: bool = False
    wbs_type: str = "0"
    wbs_valid_till: str = "1970-01-01"  # date kept as a plain string
    wbs_rooms: int = 0
    wbs_adults: int = 0
    wbs_children: int = 0
    is_prio_wbs: bool = False

    # --- Immomio account ----------------------------------------------------
    # NOTE(review): presumably login credentials for the Immomio portal —
    # verify against the provider that consumes them.
    immomio_email: str = ""
    immomio_password: str = ""
|
|
|
|
|
|
class ApplyRequest(BaseModel):
    """Request body of ``POST /apply``."""

    # Listing URL; the endpoint strips surrounding whitespace before use.
    url: str
    # Applicant data handed to the provider.
    profile: ProfileModel
    # Passed through to the provider's apply context; defaults to False.
    submit_forms: bool = False
    # Caller-side correlation id; it is logged and echoed back in the response.
    application_id: int | None = None
|
|
|
|
|
|
class ApplyResponse(BaseModel):
    """Response body of ``POST /apply``."""

    # Outcome flag taken from the application result.
    success: bool
    # String form of the application result.
    message: str
    # Domain of the provider that handled the request; "" when unsupported.
    provider: str
    # Correlation id copied from the request.
    application_id: int | None = None
    # JSON form of the recorder's step log for this run.
    forensics: dict
|
|
|
|
|
|
def require_api_key(x_internal_api_key: str | None = Header(default=None)) -> None:
    """FastAPI dependency that rejects requests without the internal API key.

    Args:
        x_internal_api_key: Header value bound by FastAPI's ``Header()``
            (the ``X-Internal-Api-Key`` request header), or ``None`` when the
            header is absent.

    Raises:
        HTTPException: 503 when ``INTERNAL_API_KEY`` is not configured;
            401 when the supplied key is missing or does not match.
    """
    import hmac  # local import keeps this block self-contained

    if not INTERNAL_API_KEY:
        raise HTTPException(status_code=503, detail="INTERNAL_API_KEY not configured")

    # Compare in constant time so response timing does not leak how much of a
    # guessed key matched. Bytes are used because compare_digest() rejects
    # non-ASCII str arguments. A missing header is treated as an empty key,
    # which can never equal the (non-empty) configured one.
    supplied = (x_internal_api_key or "").encode("utf-8")
    expected = INTERNAL_API_KEY.encode("utf-8")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="invalid api key")
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Lifespan hook: log the sorted provider keys once at startup.

    Nothing runs after the ``yield``, so there is no shutdown work.
    """
    provider_names = sorted(providers.PROVIDERS)
    logger.info("apply ready, providers: %s", provider_names)
    yield
|
|
|
|
|
|
# ASGI application object; both interactive documentation URLs are disabled.
app = FastAPI(
    lifespan=lifespan,
    title="lazyflat-apply",
    docs_url=None,
    redoc_url=None,
)
|
|
|
|
|
|
@app.get("/health")
def health():
    """Health probe: report ``ok`` plus the sorted provider keys."""
    known_providers = sorted(providers.PROVIDERS)
    return {"status": "ok", "providers": known_providers}
|
|
|
|
|
|
@app.post("/apply", response_model=ApplyResponse, dependencies=[Depends(require_api_key)])
async def apply(req: ApplyRequest):
    """Run the provider-specific application flow for a listing URL.

    The provider is selected by the URL's host (lower-cased, leading ``www.``
    removed). An unknown host yields an unsuccessful response with an empty
    ``provider``. Any exception raised by the provider is logged, recorded,
    and converted into an unsuccessful result instead of propagating.

    Args:
        req: URL, applicant profile, submit flag and optional correlation id.

    Returns:
        ApplyResponse carrying the outcome and the recorder's forensics JSON.
    """
    target_url = req.url.strip()
    host = urlparse(target_url).netloc.lower().removeprefix("www.")
    app_id = req.application_id
    logger.info(
        "apply request application_id=%s domain=%s submit=%s",
        app_id,
        host,
        req.submit_forms,
    )

    rec = Recorder(target_url)
    rec.step(
        "request.received",
        detail=f"application_id={app_id} domain={host} submit={req.submit_forms}",
    )

    # Guard clause: no provider is registered for this host.
    if host not in providers.PROVIDERS:
        rec.step("unsupported_provider", "warn", host)
        rejection = ApplicationResult(False, message=_("unsupported_association"))
        return ApplyResponse(
            success=False,
            message=str(rejection),
            provider="",
            application_id=app_id,
            forensics=rec.to_json(),
        )

    handler = providers.PROVIDERS[host]
    applicant = Profile.from_dict(req.profile.model_dump())
    ctx = ApplyContext(profile=applicant, submit_forms=req.submit_forms, recorder=rec)

    try:
        outcome = await handler.apply_for_flat(target_url, ctx)
        logger.info("apply outcome application_id=%s: %r", app_id, outcome)
    except Exception as exc:
        # Endpoint boundary: a provider crash becomes a failed result so the
        # caller still receives the forensics collected so far.
        logger.exception("apply crashed application_id=%s", app_id)
        rec.step("exception", "error", f"{type(exc).__name__}: {exc}")
        outcome = ApplicationResult(False, f"Script Error:\n{exc}")

    return ApplyResponse(
        success=outcome.success,
        message=str(outcome),
        provider=handler.domain,
        application_id=app_id,
        forensics=rec.to_json(),
    )
|
|
|
|
|
|
class FetchListingRequest(BaseModel):
    """Request body of ``POST /internal/fetch-listing``."""

    # Listing URL to load; the endpoint strips whitespace and rejects "".
    url: str
|
|
|
|
|
|
class FetchListingResponse(BaseModel):
    """Response body of ``POST /internal/fetch-listing``."""

    # URL the page reported after navigation.
    final_url: str
    # Page markup, truncated by the endpoint.
    html: str
    # Absolute, de-duplicated image URLs found on the page.
    image_urls: list[str]
|
|
|
|
|
|
# Upper bound on the HTML returned by /internal/fetch-listing.
# NOTE: the endpoint applies it as a str slice, so it counts characters.
MAX_FETCH_HTML_BYTES = 400_000
# Upper bound on the number of image URLs returned per listing.
MAX_FETCH_IMAGES = 30
|
|
|
|
|
|
# Substrings (matched case-insensitively) that mark a URL as page chrome
# rather than a listing photo.
_IMAGE_SKIP_MARKERS = ("logo", "favicon", "sprite", "icon", ".svg")


def _collect_image_urls(raw_urls: list[str], base_url: str, limit: int) -> list[str]:
    """Absolutize, de-duplicate and filter raw image references from a page.

    Args:
        raw_urls: Values gathered from ``<img>`` ``src``/``data-src``/``srcset``.
        base_url: URL that relative references are resolved against.
        limit: Stop once this many URLs have been kept.

    Returns:
        Absolute http(s) image URLs in first-seen order.
    """
    seen: set[str] = set()
    kept: list[str] = []
    for raw in raw_urls:
        if not raw or raw.startswith("data:"):
            continue
        try:
            absolute = urljoin(base_url, raw)
            scheme = urlparse(absolute).scheme
        except ValueError:
            # urllib raises on malformed references (e.g. a broken IPv6 host);
            # one bad <img> must not fail the whole fetch.
            continue
        if absolute in seen:
            continue
        seen.add(absolute)
        # blob:, about: and similar cannot be downloaded by the caller and
        # would only use up slots under the image cap.
        if scheme not in ("http", "https"):
            continue
        lowered = absolute.lower()
        if any(marker in lowered for marker in _IMAGE_SKIP_MARKERS):
            continue
        kept.append(absolute)
        if len(kept) >= limit:
            break
    return kept


@app.post(
    "/internal/fetch-listing",
    response_model=FetchListingResponse,
    dependencies=[Depends(require_api_key)],
)
async def fetch_listing(req: FetchListingRequest):
    """Load a flat listing in Playwright-driven Chromium and return its content.

    Returns the page HTML (truncated to ``MAX_FETCH_HTML_BYTES`` characters),
    the URL reported by the page after navigation, and up to
    ``MAX_FETCH_IMAGES`` absolute image URLs. Used by the web service's LLM
    enrichment pipeline so the request looks like a real browser.

    Raises:
        HTTPException: 400 when the URL is empty, unparsable, or not http(s).
    """
    url = req.url.strip()
    if not url:
        raise HTTPException(400, "url required")
    # Only web URLs may reach the browser: Chromium would otherwise load
    # file:// and similar schemes and return their content to the caller.
    try:
        scheme = urlparse(url).scheme
    except ValueError:
        raise HTTPException(400, "invalid url") from None
    if scheme not in ("http", "https"):
        raise HTTPException(400, "url must be http(s)")
    logger.info("fetch-listing url=%s", url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=HEADLESS,
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = await browser.new_context(
                viewport=ViewportSize({"width": BROWSER_WIDTH, "height": BROWSER_HEIGHT}),
                locale=BROWSER_LOCALE,
            )
            page = await context.new_page()
            await page.goto(url, timeout=30_000)
            try:
                await page.wait_for_load_state("networkidle", timeout=10_000)
            except Exception as exc:
                # Best effort: a page that never goes idle is still usable, so
                # continue with what has loaded, but leave a trace in the log.
                logger.debug("fetch-listing networkidle wait failed url=%s: %s", url, exc)
            final_url = page.url
            html = await page.content()
            # Collect image candidates: <img src> + <img data-src> + srcset first URL.
            raw_imgs: list[str] = await page.evaluate(
                """() => {
                    const out = [];
                    document.querySelectorAll('img').forEach((img) => {
                        if (img.src) out.push(img.src);
                        const ds = img.getAttribute('data-src');
                        if (ds) out.push(ds);
                        const ss = img.getAttribute('srcset');
                        if (ss) {
                            const first = ss.split(',')[0].trim().split(' ')[0];
                            if (first) out.push(first);
                        }
                    });
                    return out;
                }"""
            )
        finally:
            # Closing the browser also disposes of its context and page.
            await browser.close()

    return FetchListingResponse(
        final_url=final_url,
        html=html[:MAX_FETCH_HTML_BYTES],
        image_urls=_collect_image_urls(raw_imgs, final_url, MAX_FETCH_IMAGES),
    )
|