lazyflat/alert/scraper.py
EiSiMo 3bb04210c4 secrets tab, drop commute filter, favicon, robust error reports
1. Admin → Geheimnisse sub-tab lets you edit ANTHROPIC_API_KEY and
   BERLIN_WOHNEN_USERNAME/PASSWORD at runtime. Migration v7 adds a
   secrets(key, value, updated_at) table; startup seeds missing keys
   from the environment (idempotent). web reads secrets DB-first with
   env fallback via llm._api_key(); alert fetches them from web's
   /internal/secrets endpoint on each scan and passes them into
   Scraper(). Rotating credentials no longer requires a redeploy; see
   the sketch after this item.
   Masked display: 6 leading + 4 trailing chars, "…" in the middle.
   Blank form fields leave the stored value untouched.
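
   A minimal sketch of the DB-first read, the idempotent seeding, and
   the masking rule. seed_secret/get_secret/mask_secret are hypothetical
   names (the real reader lives in llm._api_key()), and the behaviour
   for values too short to mask is an assumption:

       import os
       import sqlite3

       def seed_secret(conn: sqlite3.Connection, key: str) -> None:
           # Idempotent: inserts only when the key is not stored yet.
           conn.execute(
               "INSERT OR IGNORE INTO secrets(key, value, updated_at) "
               "VALUES (?, ?, datetime('now'))",
               (key, os.environ.get(key, "")),
           )

       def get_secret(conn: sqlite3.Connection, key: str) -> str:
           # DB first, environment as fallback.
           row = conn.execute(
               "SELECT value FROM secrets WHERE key = ?", (key,)
           ).fetchone()
           if row and row[0]:
               return row[0]
           return os.environ.get(key, "")

       def mask_secret(value: str) -> str:
           # 6 leading + 4 trailing chars, "…" in the middle.
           if len(value) <= 10:
               return "…"
           return value[:6] + "…" + value[-4:]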

2. Drop the max_morning_commute filter from UI + server + FILTER_KEYS +
   filter summary (the underlying Maps.calculate_score code stays in
   case the filter is re-enabled later).

3. /static/didi.webp wired as favicon via <link rel="icon"> in base.html.

4. apply.open_page wraps page.goto in try/except so a failed load still
   produces a "goto.failed" step + screenshot instead of returning an
   empty forensics blob; the networkidle wait and the post-submission
   sleep are likewise best-effort now. The error ZIP export already
   writes a screenshot + HTML per step plus final_html, so with this
   change every apply run leaves a reconstructable trail even when the
   listing is already offline. A sketch of the wrapping follows below.
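
   A sketch of the wrapped navigation, assuming Playwright's sync API;
   record_step is a hypothetical stand-in for the forensics hook that
   feeds the error ZIP:

       from playwright.sync_api import Error as PlaywrightError, Page

       def open_page(page: Page, url: str, record_step) -> bool:
           try:
               page.goto(url, timeout=30_000)
           except PlaywrightError as exc:
               # A dead listing still yields a labelled step with
               # evidence instead of an empty forensics blob.
               record_step("goto.failed", error=str(exc),
                           screenshot=page.screenshot(),
                           html=page.content())
               return False
           try:
               page.wait_for_load_state("networkidle", timeout=10_000)
           except PlaywrightError:
               pass  # best-effort: a chatty page must not abort the run
           return True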

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-21 17:56:57 +02:00

import requests
import re
import logging

from bs4 import BeautifulSoup

from settings import BERLIN_WOHNEN_USERNAME, BERLIN_WOHNEN_PASSWORD  # env fallback

logger = logging.getLogger("flat-alert")


class Scraper:
    URL_LOGIN = 'https://www.inberlinwohnen.de/login'
    URL_FINDER = 'https://www.inberlinwohnen.de/mein-bereich/wohnungsfinder'
    BASE_URL = 'https://www.inberlinwohnen.de'
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'de,en;q=0.9,en-US;q=0.8',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
    }

    def __init__(self, username: str = "", password: str = ""):
        # Prefer credentials passed in (fetched from the secrets table);
        # fall back to the env-derived settings values.
        self.username = username or BERLIN_WOHNEN_USERNAME
        self.password = password or BERLIN_WOHNEN_PASSWORD
        self.session = requests.Session()
        self.session.headers.update(self.HEADERS)

    def login(self):
        if not self.username or not self.password:
            logger.critical("BERLIN_WOHNEN credentials missing — nothing to log in with")
            return False
        logger.info("fetching inberlinwohnen.de login page")
        resp_login_page = self.session.get(self.URL_LOGIN, timeout=30)
        # The CSRF token is embedded in a <meta name="csrf-token"> tag.
        token_search = re.search(r'name="csrf-token" content="([^"]+)"', resp_login_page.text)
        if not token_search:
            logger.critical("no CSRF token found on login page.")
            return False
        csrf_token = token_search.group(1)
        payload_login = {
            '_token': csrf_token,
            'email': self.username,
            'password': self.password,
            'remember': 'on',
        }
        headers_login = self.HEADERS.copy()
        headers_login['Referer'] = self.URL_LOGIN
        logger.info("attempting login")
        resp_post = self.session.post(self.URL_LOGIN, data=payload_login, headers=headers_login, timeout=30)
        # A failed login redirects back to the login URL.
        if not resp_post.ok or "login" in resp_post.url:
            logger.critical("login failed")
            logger.info(f"status code: {resp_post.status_code}")
            logger.info(f"url: {resp_post.url}")
            return False
        logger.info("login successful")
        return True

    def get_flats(self):
        logger.info("fetching flat list")
        self.session.headers.update({'Referer': 'https://www.inberlinwohnen.de/mein-bereich'})
        resp_finder = self.session.get(self.URL_FINDER, timeout=30)
        soup = BeautifulSoup(resp_finder.text, 'html.parser')
        # Each listing is rendered as <div id="apartment-<id>">.
        apartment_divs = soup.find_all('div', id=re.compile(r'^apartment-\d+'))
        logger.info(f"found {len(apartment_divs)} apartments on page.")
        flats_data = []
        for div in apartment_divs:
            flat_id = div['id'].replace('apartment-', '')
            data = {'id': flat_id}
            # The detail link is the anchor labelled "Alle Details".
            link_elem = None
            for link in div.find_all('a'):
                if "alle details" in link.get_text(strip=True).lower():
                    link_elem = link.get('href')
                    break
            if link_elem:
                data['link'] = link_elem if link_elem.startswith('http') else self.BASE_URL + link_elem
            else:
                data['link'] = self.BASE_URL
            # Key/value attributes (rent, size, rooms, ...) live in a <dl>.
            details_list = div.find('dl')
            if details_list:
                for dt in details_list.find_all('dt'):
                    key = dt.get_text(strip=True).rstrip(':')
                    dd = dt.find_next_sibling('dd')
                    if dd:
                        data[key] = dd.get_text(strip=True)
            flats_data.append(data)
        return flats_data
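
A sketch of the alert-side wiring described in point 1 of the commit
message. The endpoint path comes from that message; WEB_BASE and the
JSON field names are assumptions:

    import requests

    WEB_BASE = "http://web:8000"  # hypothetical address of the web service

    secrets = requests.get(f"{WEB_BASE}/internal/secrets", timeout=10).json()
    scraper = Scraper(
        username=secrets.get("BERLIN_WOHNEN_USERNAME", ""),
        password=secrets.get("BERLIN_WOHNEN_PASSWORD", ""),
    )
    if scraper.login():
        flats = scraper.get_flats()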