# lazyflat/apply/main.py

import logging
from contextlib import asynccontextmanager
from urllib.parse import urljoin, urlparse

from fastapi import Depends, FastAPI, Header, HTTPException, status
from playwright.async_api import ViewportSize, async_playwright
from pydantic import BaseModel, Field
from rich.console import Console
from rich.logging import RichHandler

import providers
from actions import Recorder
from classes.application_result import ApplicationResult
from classes.profile import Profile
from language import _
from providers._provider import ApplyContext
from settings import BROWSER_HEIGHT, BROWSER_LOCALE, BROWSER_WIDTH, HEADLESS, INTERNAL_API_KEY


def setup_logging():
    """Configure process-wide logging with a Rich console handler.

    The root logger is configured through ``logging.basicConfig`` at INFO
    level (which is a no-op if the root logger already has handlers). The
    "flat-apply" logger is then set to DEBUG and the "playwright" logger to
    INFO.
    """
    rich_handler = RichHandler(
        markup=False,
        console=Console(width=140),
        show_time=False,
        show_path=False,
    )
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-5s %(name)s: %(message)s",
        datefmt="%H:%M:%S",
        handlers=[rich_handler],
    )

    # Per-logger overrides on top of the root configuration.
    overrides = (
        ("flat-apply", logging.DEBUG),
        ("playwright", logging.INFO),
    )
    for logger_name, level in overrides:
        logging.getLogger(logger_name).setLevel(level)


# Module-wide logger; "flat-apply" is the logger that setup_logging() sets to DEBUG.
logger = logging.getLogger("flat-apply")
# Logging is configured as a side effect of importing this module.
setup_logging()


# Applicant profile as received over HTTP. The /apply handler dumps it to a
# dict and passes that to Profile.from_dict(). Every field has a default, so
# an empty JSON object validates.
class ProfileModel(BaseModel):
    # Personal and contact details.
    salutation: str = "Herr"
    firstname: str = ""
    lastname: str = ""
    email: str = ""
    telephone: str = ""
    # Postal address.
    street: str = ""
    house_number: str = ""
    postcode: str = ""
    city: str = ""
    # WBS-related fields. NOTE(review): "WBS" presumably refers to the German
    # Wohnberechtigungsschein (housing entitlement certificate) — confirm.
    is_possessing_wbs: bool = False
    wbs_type: str = "0"
    # Date kept as a string; the default looks like a "not set" placeholder — TODO confirm.
    wbs_valid_till: str = "1970-01-01"
    wbs_rooms: int = 0
    wbs_adults: int = 0
    wbs_children: int = 0
    is_prio_wbs: bool = False
    # NOTE(review): presumably login credentials for the Immomio portal —
    # confirm against the provider implementations.
    immomio_email: str = ""
    immomio_password: str = ""


# Request body of POST /apply.
class ApplyRequest(BaseModel):
    url: str  # listing URL; its host part selects the provider
    profile: ProfileModel
    submit_forms: bool = False  # passed through unchanged to ApplyContext.submit_forms
    application_id: int | None = None  # echoed back in logs and in the response


# Response body of POST /apply.
class ApplyResponse(BaseModel):
    success: bool
    message: str  # str() of the ApplicationResult produced for the request
    provider: str  # provider.domain, or "" when no provider matched the URL
    application_id: int | None = None  # copied from the request
    forensics: dict  # Recorder.to_json() output collected during the request


def require_api_key(x_internal_api_key: str | None = Header(default=None)) -> None:
    """FastAPI dependency that gates an endpoint behind the internal API key.

    Args:
        x_internal_api_key: Value of the ``X-Internal-Api-Key`` request
            header, or ``None`` when the header is absent.

    Raises:
        HTTPException: 503 when ``INTERNAL_API_KEY`` is empty/unset, 401 when
            the header is missing or does not match.
    """
    # Function-scope import keeps this block self-contained.
    import secrets

    if not INTERNAL_API_KEY:
        raise HTTPException(status_code=503, detail="INTERNAL_API_KEY not configured")

    # Compare in constant time so response latency does not leak how much of
    # the key was guessed correctly. Both sides are encoded to bytes because
    # compare_digest() rejects non-ASCII str arguments with a TypeError.
    # Assumes INTERNAL_API_KEY is a str — TODO confirm in settings.
    presented = (x_internal_api_key or "").encode("utf-8")
    expected = INTERNAL_API_KEY.encode("utf-8")
    if not secrets.compare_digest(presented, expected):
        raise HTTPException(status_code=401, detail="invalid api key")


@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Application lifespan hook: log the registered providers, then yield.

    There is no teardown work after the ``yield``.
    """
    provider_names = sorted(providers.PROVIDERS)
    logger.info("apply ready, providers: %s", provider_names)
    yield


# The ASGI application. docs_url=None and redoc_url=None switch off the
# interactive Swagger UI and ReDoc pages.
app = FastAPI(lifespan=lifespan, title="lazyflat-apply", docs_url=None, redoc_url=None)


@app.get("/health")
def health():
    # Health probe: a fixed "ok" status plus the sorted provider keys. Note
    # that this route does not require the internal API key.
    available = sorted(providers.PROVIDERS)
    payload = {"status": "ok", "providers": available}
    return payload


def _provider_domain(url: str) -> str:
    """Return the key used to look up *url* in ``providers.PROVIDERS``.

    The key is the lower-cased host name with a leading ``www.`` removed.
    ``hostname`` is used rather than ``netloc`` so that an explicit port or
    ``user:pass@`` prefix does not prevent a match. A URL that cannot be
    parsed, or that has no host, yields ``""``.
    """
    try:
        host = urlparse(url).hostname  # already lower-cased by urllib
    except ValueError:
        # urlparse raises for e.g. unbalanced IPv6 brackets.
        return ""
    return (host or "").removeprefix("www.")


@app.post("/apply", response_model=ApplyResponse, dependencies=[Depends(require_api_key)])
async def apply(req: ApplyRequest):
    """Run the provider matching the listing URL and report the outcome.

    The provider is selected by the URL's host. An unknown (or unparsable)
    host produces an unsuccessful response with an empty ``provider``. Any
    exception raised by the provider is logged, recorded, and converted into
    an unsuccessful result instead of propagating as an HTTP 500.

    Returns:
        ApplyResponse carrying the result, the echoed ``application_id`` and
        the recorder's JSON output.
    """
    url = req.url.strip()
    domain = _provider_domain(url)
    logger.info("apply request application_id=%s domain=%s submit=%s",
                req.application_id, domain, req.submit_forms)

    recorder = Recorder(url)
    recorder.step("request.received", detail=f"application_id={req.application_id} domain={domain} submit={req.submit_forms}")

    if domain not in providers.PROVIDERS:
        recorder.step("unsupported_provider", "warn", domain)
        result = ApplicationResult(False, message=_("unsupported_association"))
        return ApplyResponse(
            success=False, message=str(result), provider="",
            application_id=req.application_id, forensics=recorder.to_json(),
        )

    provider = providers.PROVIDERS[domain]
    profile = Profile.from_dict(req.profile.model_dump())
    ctx = ApplyContext(profile=profile, submit_forms=req.submit_forms, recorder=recorder)

    try:
        result = await provider.apply_for_flat(url, ctx)
        logger.info("apply outcome application_id=%s: %r", req.application_id, result)
    except Exception as e:
        # Top-level boundary: a provider crash becomes a failed result so the
        # caller still receives the recorded steps.
        logger.exception("apply crashed application_id=%s", req.application_id)
        recorder.step("exception", "error", f"{type(e).__name__}: {e}")
        result = ApplicationResult(False, f"Script Error:\n{e}")

    return ApplyResponse(
        success=result.success,
        message=str(result),
        provider=provider.domain,
        application_id=req.application_id,
        forensics=recorder.to_json(),
    )


# Request body of POST /internal/fetch-listing.
class FetchListingRequest(BaseModel):
    url: str  # page to load in the browser; surrounding whitespace is stripped by the handler


# Response body of POST /internal/fetch-listing.
class FetchListingResponse(BaseModel):
    final_url: str  # page.url after navigation
    html: str  # page.content(), truncated by the handler
    image_urls: list[str]  # absolute, de-duplicated image URLs


# Cap on the HTML returned by fetch_listing. Despite the name, it is applied as
# a str slice length there, i.e. it counts characters rather than encoded bytes.
MAX_FETCH_HTML_BYTES = 400_000
# Maximum number of image URLs returned by fetch_listing.
MAX_FETCH_IMAGES = 30


def _select_image_urls(raw_urls: list[str], base_url: str) -> list[str]:
    """Absolutize, de-duplicate and filter image URL candidates.

    Empty values and ``data:`` URIs are dropped, every remaining candidate is
    resolved against *base_url*, duplicates are removed (first occurrence
    wins), and URLs whose text contains a logo/icon-like marker are skipped.
    Collection stops once ``MAX_FETCH_IMAGES`` URLs have been gathered.
    """
    skip_markers = ("logo", "favicon", "sprite", "icon", ".svg")
    seen: set[str] = set()
    selected: list[str] = []
    for candidate in raw_urls:
        if not candidate or candidate.startswith("data:"):
            continue
        try:
            absolute = urljoin(base_url, candidate)
        except ValueError:
            # One malformed attribute value (e.g. unbalanced IPv6 brackets)
            # must not fail the whole fetch; just skip it.
            continue
        if absolute in seen:
            continue
        seen.add(absolute)
        lowered = absolute.lower()
        if any(marker in lowered for marker in skip_markers):
            continue
        selected.append(absolute)
        if len(selected) >= MAX_FETCH_IMAGES:
            break
    return selected


@app.post(
    "/internal/fetch-listing",
    response_model=FetchListingResponse,
    dependencies=[Depends(require_api_key)],
)
async def fetch_listing(req: FetchListingRequest):
    """Headless Playwright fetch of a flat listing — returns page HTML +
    absolute image URLs. Used by the web service's LLM enrichment pipeline
    so we look like a real browser and don't get bounced by bot guards.

    Only absolute ``http``/``https`` URLs are accepted; anything else is
    rejected with HTTP 400 before a browser is launched.

    Raises:
        HTTPException: 400 when the URL is empty, unparsable, or not http(s).
    """
    url = req.url.strip()
    if not url:
        raise HTTPException(400, "url required")
    # The URL is caller-supplied: refuse schemes such as file:// so the
    # browser cannot be pointed at local resources.
    try:
        scheme = urlparse(url).scheme
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        raise HTTPException(400, "url must be an absolute http(s) URL")
    logger.info("fetch-listing url=%s", url)

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=HEADLESS,
            args=["--disable-blink-features=AutomationControlled"],
        )
        try:
            context = await browser.new_context(
                viewport=ViewportSize({"width": BROWSER_WIDTH, "height": BROWSER_HEIGHT}),
                locale=BROWSER_LOCALE,
            )
            page = await context.new_page()
            await page.goto(url, timeout=30_000)
            try:
                await page.wait_for_load_state("networkidle", timeout=10_000)
            except Exception as exc:
                # Best effort: a page that never goes idle is still usable,
                # so continue with whatever has loaded — but leave a trace.
                logger.debug("fetch-listing networkidle wait failed url=%s: %s", url, exc)
            final_url = page.url
            html = await page.content()
            # Collect image candidates: <img src> + <img data-src> + srcset first URL.
            raw_imgs: list[str] = await page.evaluate(
                """() => {
                    const out = [];
                    document.querySelectorAll('img').forEach((img) => {
                        if (img.src) out.push(img.src);
                        const ds = img.getAttribute('data-src');
                        if (ds) out.push(ds);
                        const ss = img.getAttribute('srcset');
                        if (ss) {
                            const first = ss.split(',')[0].trim().split(' ')[0];
                            if (first) out.push(first);
                        }
                    });
                    return out;
                }"""
            )
        finally:
            # Always release the browser, including when navigation fails.
            await browser.close()

    image_urls = _select_image_urls(raw_imgs, final_url)

    return FetchListingResponse(
        final_url=final_url,
        # Slicing a str caps the number of characters, not encoded bytes.
        html=html[:MAX_FETCH_HTML_BYTES],
        image_urls=image_urls,
    )