roughly working again, now dev docker exists
This commit is contained in:
parent a77a0c0393
commit 155ab39368
26 changed files with 1976 additions and 235 deletions
@@ -6,10 +6,49 @@ from handlers.degewo_handler import DegewoHandler
from handlers.gesobau_handler import GesobauHandler
from handlers.stadtundland_handler import StadtUndLandHandler
from handlers.wbm_handler import WBMHandler
import json
from pathlib import Path
import pandas as pd
from typing import Optional
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import logging
import matplotlib
import matplotlib.font_manager as fm
import html
import re
import hashlib
import asyncio
from playwright.async_api import async_playwright
import os

STATE_FILE = Path("data/state.json")
APPLICATIONS_FILE = Path("data/applications.json")
TIMING_FILE = Path("data/timing.csv")
LISTINGS_FILE = Path("data/listings.json")
DATA_DIR = Path("data")

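# The block below redirects matplotlib's config/cache directory to a
# project-local, writable path. This matters inside the dev Docker container,
# where the home directory may not be writable; fm.findSystemFonts then warms
# the font cache from any .ttf files in data/fonts (a Noto Sans TTF is
# assumed to be provided there for emoji glyphs in plots).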
# --- Matplotlib Font Setup (for emoji support in plots) ---
font_cache_dir = Path("data/fonts")
font_cache_dir.mkdir(parents=True, exist_ok=True)
matplotlib.get_configdir = lambda: str(font_cache_dir)
fm.findSystemFonts(fontpaths=str(font_cache_dir), fontext='ttf')
matplotlib.rcParams['font.family'] = 'Noto Sans'

# Use the root logger for consistency with main.py
logger = logging.getLogger()

class ApplicationHandler:
    """
    Main handler for apartment monitoring, application automation, and notification logic.
    Handles browser automation, listing extraction, application delegation, and Telegram notifications.
    """

    def __init__(self, browser_context, state_manager, applications_file: Path = None):
        self.context = browser_context
        self.state_manager = state_manager
        self.applications_file = applications_file or APPLICATIONS_FILE
        self.handlers = {
            "howoge": HowogeHandler(browser_context),
            "gewobag": GewobagHandler(browser_context),
@@ -19,6 +58,142 @@ class ApplicationHandler:
            "wbm": WBMHandler(browser_context),
        }

    def set_telegram_bot(self, telegram_bot):
        """Attach a TelegramBot instance for notifications."""
        self.telegram_bot = telegram_bot

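    # The messages built below use Telegram HTML markup; this assumes the
    # attached TelegramBot sends with parse_mode="HTML". Listing fields are
    # interpolated unescaped, so passing them through html.escape() first
    # would be safer.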
    def notify_new_listings(self, new_listings: list[dict], application_results: Optional[dict] = None):
        """
        Send a Telegram notification for each new listing.
        Includes the application result if autopilot was enabled.
        """
        if not new_listings:
            return

        for listing in new_listings:
            link = listing.get('link', 'https://www.inberlinwohnen.de/wohnungsfinder/')
            # Detect company for the header
            company = self._detect_company(link)
            company_label = company.capitalize() if company != "unknown" else "Wohnung"
            message = (
                f"🏠 <b>[{company_label}] Neue Wohnung!</b>\n\n"
                f"🚪 <b>{listing['rooms']}</b>\n"
                f"📐 {listing['size']}\n"
                f"💰 {listing['price']}\n"
                f"📍 {listing['address']}\n\n"
                f"👉 <a href=\"{link}\">Alle Details</a>"
            )

            # Add autopilot/apply status if an application was attempted
            if application_results and listing["id"] in application_results:
                result = application_results[listing["id"]]
                if result["success"]:
                    message += f"\n\n🤖 <b>Auto-applied!</b> ({result['company']})"
                else:
                    message += f"\n\n⚠️ <b>Auto-apply failed</b> ({result['company']})"
                if result["message"]:
                    message += f"\n<i>{result['message']}</i>"

            # Send via TelegramBot if available
            if hasattr(self, 'telegram_bot') and self.telegram_bot:
                logger.info(f"Notifying Telegram: {listing['address']} ({listing['rooms']}, {listing['size']}, {listing['price']})")
                self.telegram_bot._send_message(message)
            else:
                logger.info(f"[TELEGRAM] Would send message for: {listing['address']} ({listing['rooms']}, {listing['size']}, {listing['price']})")

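    # Note: apply_to_listings is idempotent across restarts: has_applied()
    # consults the persisted application history before each attempt, so
    # restarting the monitor will not re-apply to listings already handled.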
    async def apply_to_listings(self, listings: list[dict]) -> dict:
        """
        Apply to multiple listings (autopilot mode).
        Returns a dict of application results keyed by listing ID.
        """
        results = {}
        for listing in listings:
            if self.has_applied(listing["id"]):
                logger.info(f"Already applied to {listing['id']} ({listing['address']}), skipping.")
                continue
            result = await self.apply(listing)
            results[listing["id"]] = result
            self.save_application(result)
            status = "✅" if result["success"] else "❌"
            logger.info(f"Application {status} for {listing['address']}: {result['message']}")
            await asyncio.sleep(2)  # brief pause between applications to avoid hammering the portals
        return results

    def log_listing_times(self, new_listings: list[dict]):
        """
        Log new listing appearance times to CSV for later analysis and pattern mining.
        Appends to data/listing_times.csv, creating the header if needed.
        """
        if not new_listings:
            return

        import csv
        # NB: distinct from the module-level TIMING_FILE (data/timing.csv),
        # which is what _generate_weekly_plot reads.
        timing_file = Path("data/listing_times.csv")
        file_exists = timing_file.exists()

        with open(timing_file, "a", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            if not file_exists:
                writer.writerow(["timestamp", "weekday", "hour", "minute", "rooms", "size", "price", "address", "listing_id"])

            now = datetime.now()
            for listing in new_listings:
                writer.writerow([
                    now.isoformat(),
                    now.strftime("%A"),  # weekday name
                    now.hour,
                    now.minute,
                    listing["rooms"],
                    listing["size"],
                    listing["price"],
                    listing["address"],
                    listing["id"]
                ])

        logger.info(f"Logged {len(new_listings)} new listing times to CSV.")

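    # NOTE: init_browser manages its own Playwright lifecycle (self.playwright,
    # self.browser) and then wraps the new context in a fresh ApplicationHandler,
    # which suggests it was lifted from the monitor object rather than written
    # for this class.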
    async def init_browser(self):
        """Initialize the Playwright browser (minimal, like the test script)."""
        if not hasattr(self, 'browser') or self.browser is None:
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(headless=True)
            self.context = await self.browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
            )
            logger.info("Browser initialized (minimal context)")
            self.application_handler = ApplicationHandler(self.context, self.state_manager)

    async def apply(self, listing: dict) -> dict:
        company = self._detect_company(listing.get("link", ""))
        handler = self.handlers.get(company)
@@ -41,11 +216,463 @@ class ApplicationHandler:

        return result

    def _detect_company(self, link: str) -> str:
        """Robust company detection logic, matching monitor.py as closely as possible.

        E.g. _detect_company("https://www.howoge.de/wohnung/123") -> "howoge".
        """
        link = (link or "").lower()
        # Strip the URL scheme and www. prefix for easier matching
        link = re.sub(r"^https?://(www\.)?", "", link)
        # Domain-based matching first, including subdomains
        if re.search(r"howoge\.de", link):
            return "howoge"
        if re.search(r"gewobag\.de", link):
            return "gewobag"
        if re.search(r"degewo\.de", link):
            return "degewo"
        if re.search(r"gesobau\.de", link):
            return "gesobau"
        if re.search(r"stadt-und-land\.de|stadtundland\.de", link):
            return "stadtundland"
        if re.search(r"wbm\.de", link):
            return "wbm"
        # Fall back to matching the company name anywhere in the path or query (legacy/edge cases)
        if re.search(r"howoge", link):
            return "howoge"
        if re.search(r"gewobag", link):
            return "gewobag"
        if re.search(r"degewo", link):
            return "degewo"
        if re.search(r"gesobau", link):
            return "gesobau"
        if re.search(r"stadt-und-land|stadtundland", link):
            return "stadtundland"
        if re.search(r"wbm", link):
            return "wbm"
        return "unknown"

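    # State/persistence helpers. load_state defaults autopilot to off, so a
    # fresh deployment never auto-applies until it is explicitly enabled.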
    def load_state(self) -> dict:
        """Load persistent state"""
        if STATE_FILE.exists():
            with open(STATE_FILE, "r") as f:
                return json.load(f)
        return {"autopilot": False}

    def save_state(self, state: dict):
        """Save persistent state"""
        with open(STATE_FILE, "w") as f:
            json.dump(state, f, indent=2)

    def set_autopilot(self, enabled: bool):
        """Enable or disable autopilot mode"""
        self.state_manager.set_autopilot(enabled)

    def is_autopilot_enabled(self) -> bool:
        """Check if autopilot mode is enabled"""
        return self.state_manager.is_autopilot_enabled()

    def load_applications(self) -> dict:
        """Load application history."""
        if self.applications_file.exists():
            try:
                with open(self.applications_file, "r", encoding="utf-8") as f:
                    return json.load(f)
            except json.JSONDecodeError:
                logger.error("Failed to decode applications file. Returning empty history.")
        return {}

    def save_application(self, result: dict):
        """Save an application result."""
        applications = self.load_applications()
        applications[result["listing_id"]] = result
        with open(self.applications_file, "w", encoding="utf-8") as f:
            json.dump(applications, f, indent=2, ensure_ascii=False)

    def has_applied(self, listing_id: str) -> bool:
        """Check if we've already applied to this listing."""
        return listing_id in self.load_applications()

    def load_previous_listings(self) -> dict:
        """Load previously saved listings"""
        if LISTINGS_FILE.exists():
            with open(LISTINGS_FILE, "r") as f:
                return json.load(f)
        return {}

    def save_listings(self, listings: list[dict]):
        """Save current listings"""
        listings_dict = {l["id"]: l for l in listings}
        with open(LISTINGS_FILE, "w") as f:
            json.dump(listings_dict, f, indent=2, ensure_ascii=False)

    def find_new_listings(self, current: list[dict], previous: dict) -> list[dict]:
        """Find listings that are new since the last check"""
        return [listing for listing in current if listing["id"] not in previous]

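    # Plot helpers: each renders a PNG under data/ and returns its path,
    # presumably for the Telegram bot to send as a photo.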
    def _generate_weekly_plot(self) -> str:
        """Generate a heatmap of listings by day of week and hour"""
        if not TIMING_FILE.exists():
            logger.warning("No timing file found for weekly plot")
            return ""

        try:
            df = pd.read_csv(TIMING_FILE, parse_dates=["timestamp"])
            df["day_of_week"] = df["timestamp"].dt.dayofweek
            df["hour"] = df["timestamp"].dt.hour

            heatmap_data = df.groupby(["day_of_week", "hour"]).size().unstack(fill_value=0)
            # Pad to the full 7x24 grid so the tick labels below line up even
            # when some days or hours have no listings yet
            heatmap_data = heatmap_data.reindex(index=range(7), columns=range(24), fill_value=0)

            fig, ax = plt.subplots(figsize=(10, 6))
            cax = ax.matshow(heatmap_data, cmap="YlGnBu", aspect="auto")
            fig.colorbar(cax)

            ax.set_xticks(range(24))
            ax.set_yticks(range(7))
            ax.set_xticklabels([f"{h}:00" for h in range(24)], rotation=90)
            ax.set_yticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])

            ax.set_title("Listings Heatmap (Day of Week vs Hour)")

            plot_path = DATA_DIR / "weekly_plot.png"
            plt.savefig(plot_path)
            plt.close(fig)

            logger.info(f"Weekly plot saved to {plot_path}")
            return str(plot_path)
        except Exception as e:
            logger.error(f"Failed to generate weekly plot: {e}")
            return ""

    def _generate_error_rate_plot(self):
        """Read applications.json and produce a plot image + summary text.

        Returns (plot_path, summary_text), or (None, "") if there is insufficient data.
        """
        if not self.applications_file.exists():
            logger.warning("No applications.json found for error-rate plot")
            return None, ""

        try:
            with open(self.applications_file, 'r', encoding='utf-8') as f:
                apps = json.load(f)

            if not apps:
                logger.warning("No application data available for error-rate plot")
                return None, ""

            # Convert to DataFrame
            rows = []
            for _id, rec in apps.items():
                rows.append({
                    "id": _id,
                    "ts": pd.to_datetime(rec.get("timestamp")),
                    "success": rec.get("success", False),
                    "company": rec.get("company", "unknown")
                })

            df = pd.DataFrame(rows)
            df = df.dropna(subset=['ts'])
            if df.empty:
                logger.warning("No valid data for error-rate plot")
                return None, ""

            df['date'] = df['ts'].dt.floor('D')
            grouped = df.groupby('date').agg(total=('id', 'count'), successes=('success', 'sum'))
            grouped['failures'] = grouped['total'] - grouped['successes']
            grouped['error_rate'] = grouped['failures'] / grouped['total']

            # Ensure the index is sorted by date for plotting
            grouped = grouped.sort_index()

            # Prepare plot
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.plot(grouped.index, grouped['error_rate'], marker='o', color='red', label='Error Rate')
            ax.set_title('Autopilot Error Rate Over Time')
            ax.set_xlabel('Date')
            ax.set_ylabel('Error Rate')
            ax.legend()
            ax.grid(True)

            # Save the plot to the same directory as the applications file
            plot_path = self.applications_file.parent / 'error_rate.png'
            plt.savefig(plot_path)
            plt.close(fig)

            # Summary
            total_attempts = int(grouped['total'].sum())
            total_success = int(grouped['successes'].sum())
            total_fail = int(grouped['failures'].sum())
            overall_error = (total_fail / total_attempts) if total_attempts > 0 else 0.0
            summary = (
                f"<b>Total attempts:</b> {total_attempts}\n"
                f"<b>Successes:</b> {total_success}\n"
                f"<b>Failures:</b> {total_fail}\n"
                f"<b>Overall error rate:</b> {overall_error:.1%}"
            )

            return plot_path, summary
        except Exception as e:
            logger.exception(f"Failed to generate error rate plot: {e}")
            return None, ""

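    # Credentials are read off the state manager, which is expected to be
    # populated from the INBERLIN_EMAIL / INBERLIN_PASSWORD environment
    # variables mentioned in the warning below.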
    async def login(self, page):
        """Log in to inberlinwohnen.de (minimal, like the test script)."""
        if not self.state_manager.email or not self.state_manager.password:
            logger.warning("No credentials provided. Ensure INBERLIN_EMAIL and INBERLIN_PASSWORD are set in the environment.")
            return False

        try:
            logger.info("Navigating to login page...")
            login_response = await page.goto("https://www.inberlinwohnen.de/login", wait_until="networkidle")
            logger.info(f"Login page status: {login_response.status if login_response else 'No response'}")
            await asyncio.sleep(2)

            # Dismiss the cookie/privacy modal before login
            logger.info("Attempting to dismiss cookie/privacy modal before login...")
            await self.dismiss_cookie_modal(page)
            logger.info("Cookie/privacy modal dismissed.")

            # Fill the login form (if present)
            logger.info("Filling in login credentials...")
            await page.fill('input[name="email"], input[type="email"]', self.state_manager.email)
            await page.fill('input[name="password"], input[type="password"]', self.state_manager.password)
            logger.info("Login credentials filled.")

            # Click the submit button
            logger.info("Submitting login form...")
            await page.click('button[type="submit"], input[type="submit"]', timeout=30000)
            logger.info("Clicked submit, waiting for navigation...")
            try:
                await page.wait_for_load_state("networkidle", timeout=30000)
                logger.info(f"After login, page url: {page.url}")
                logger.info(f"After login, page content length: {len(await page.content())}")
            except Exception as e:
                logger.error(f"Timeout or error after login submit: {e}")
            await asyncio.sleep(2)

            # Check whether login was successful
            logger.info("Checking if login was successful...")
            if "mein-bereich" in page.url or await page.query_selector('text="Abmelden"'):
                logger.info("Login successful.")
                return True
            else:
                logger.error(f"Login failed - ended up at {page.url}")
                return False
        except Exception as e:
            logger.error(f"Login error: {e}")
            logger.debug("Exception occurred during login", exc_info=True)
            return False

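    # fetch_listings drives the whole scrape: optional login, a walk through
    # the Livewire pagination, then regex extraction over the accumulated HTML
    # (the listing data lives in wire:snapshot JSON blobs rather than in the
    # visible DOM).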
    async def fetch_listings(self) -> list[dict]:
        """Fetch listings from the Wohnungsfinder"""
        listings = []

        try:
            page = await self.context.new_page()

            # Attempt login if not already logged in
            if not self.state_manager.logged_in:
                login_success = await self.login(page)
                if login_success:
                    self.state_manager.logged_in = True
                else:
                    logger.warning("Login failed. Proceeding with public listings.")

            # Select the correct URL after the login check
            if self.state_manager.logged_in:
                url = "https://www.inberlinwohnen.de/mein-bereich/wohnungsfinder"
            else:
                url = "https://www.inberlinwohnen.de/wohnungsfinder/"

            logger.info(f"Fetching listings from {url}")

            # Navigate to the page with a longer wait condition for slow connections
            logger.info("Navigating to listings page with extended timeout...")
            await page.goto(url, wait_until="networkidle", timeout=20000)

            # Check whether the page redirected to a download
            if "download" in page.url or page.url.endswith(".pdf"):
                logger.error("Page redirected to a download. Aborting.")
                return []

            # Handle the cookie modal if not logged in
            if not self.state_manager.logged_in:
                await self.dismiss_cookie_modal(page)

            # Wait a short time for the page to render, but do not block on any selector
            await asyncio.sleep(2)

            # Collect all listings content by clicking through pagination
            all_content = ""
            page_num = 1
            max_pages = 10  # safety limit

            while page_num <= max_pages:
                # Get current page content
                current_content = await page.content()
                all_content += current_content

                # Check for a "next page" button (Livewire pagination)
                next_btn = await page.query_selector('[wire\\:click*="nextPage"]')
                if next_btn and await next_btn.is_visible():
                    await next_btn.click()
                    await asyncio.sleep(2)  # wait for Livewire to update
                    page_num += 1
                else:
                    break

            logger.info(f"Collected content from {page_num} page(s)")
            content = all_content

            # Debug: save the HTML to a file for inspection
            debug_path = DATA_DIR / "debug_page.html"
            with open(debug_path, "w", encoding="utf-8") as f:
                f.write(content)
            logger.info(f"Saved debug HTML to {debug_path}")

            # Debug: check how many listings the page claims to show
            count_match = re.search(r'(\d+)\s*Wohnungen? für Sie gefunden', content)
            if count_match:
                logger.info(f"Page shows {count_match.group(1)} listings available")

            # Also check for "Zeige X bis Y von Z Angeboten"
            show_match = re.search(r'Zeige \d+ bis \d+ von (\d+) Angeboten', content)
            if show_match:
                logger.info(f"Page shows {show_match.group(1)} total offers")

            # Decode HTML entities and JSON-escaped slashes for extraction
            content_decoded = html.unescape(content)
            content_decoded = content_decoded.replace('\\/', '/')

            # Build a flatId -> deeplink mapping from wire:snapshot JSON data (monitor.py logic)
            # Format in HTML: "deeplink":"https://...","flatId":12345
            deeplink_pattern = r'"deeplink":"(https://[^"]+)","flatId":(\d+)'
            deeplink_matches = re.findall(deeplink_pattern, content_decoded)
            # Use string keys for flatId to match the button extraction
            id_to_link = {str(flat_id): link for link, flat_id in deeplink_matches}
            logger.info(f"Found {len(id_to_link)} deeplink mappings")

            # --- Extraction logic copied from monitor.py for robustness ---
            # Extract listings from button elements with aria-label.
            # Format: @click="open !== 12345 ..." aria-label="Wohnungsangebot - 2,0 Zimmer, 53,01 m², 494,38 € Kaltmiete | Adresse"
            button_pattern = r'@click="open !== (\d+)[^\"]*"[^>]*aria-label="Wohnungsangebot - ([^"]+)'
            button_matches = re.findall(button_pattern, content_decoded)
            logger.info(f"Found {len(button_matches)} listing buttons (monitor.py pattern)")

            for flat_id, listing_text in button_matches:
                # Parse listing text: "2,0 Zimmer, 53,01 m², 494,38 € Kaltmiete | Rhinstraße 4, 10315 Lichtenberg"
                parts_match = re.match(r'(\d,\d)\s*Zimmer,\s*([\d,.]+)\s*m²,\s*([\d.,]+)\s*€\s*(?:Kaltmiete)?\s*\|\s*(.+)', listing_text)
                if not parts_match:
                    continue

                rooms, size, price, address = parts_match.groups()
                rooms = rooms.strip()
                address = address.strip()

                if len(address) < 5:
                    continue

                # Get the deeplink for this flat (monitor.py logic: flat_id as string)
                detail_link = id_to_link.get(str(flat_id), url)

                listing_id = hashlib.md5(f"{rooms}{size}{price}{address}".encode()).hexdigest()[:12]

                listings.append({
                    "id": listing_id,
                    "rooms": f"{rooms} Zimmer",
                    "size": f"{size} m²",
                    "price": f"{price} €",
                    "address": address,
                    "link": detail_link,
                    "fetched_at": datetime.now().isoformat()
                })

            # Deduplicate by id
            seen_ids = set()
            unique_listings = []
            for listing in listings:
                if listing["id"] not in seen_ids:
                    seen_ids.add(listing["id"])
                    unique_listings.append(listing)
            listings = unique_listings

            if not listings:
                logger.warning("No listings found after parsing. Dumping HTML snippet for debugging:")
                logger.warning(content[:1000])

            await page.close()
            logger.info(f"Fetched {len(listings)} unique listings")
            return listings

        except Exception as e:
            logger.error(f"Error fetching listings: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return []

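    # Cookie-consent handling is best-effort: try a list of known accept
    # selectors first, then fall back to clicking the first visible button
    # inside the Alpine.js privacy-modal overlay (x-show="showPrivacyModal").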
    async def dismiss_cookie_modal(self, page):
        """Dismiss the privacy/cookie consent modal if present"""
        try:
            # Wait a bit for the modal to appear
            await asyncio.sleep(2)

            # Try to find and click the accept button in the privacy modal.
            # Look for common accept-button patterns in German.
            accept_selectors = [
                'button:has-text("Akzeptieren")',
                'button:has-text("Alle akzeptieren")',
                'button:has-text("Accept")',
                'button:has-text("Zustimmen")',
                '[x-show="showPrivacyModal"] button',
                '.privacy-modal button',
                'button.accept-cookies',
                # More specific to inberlinwohnen
                'div[x-show="showPrivacyModal"] button:first-of-type',
            ]

            for selector in accept_selectors:
                try:
                    button = await page.query_selector(selector)
                    if button and await button.is_visible():
                        await button.click()
                        logger.info(f"Clicked cookie accept button: {selector}")
                        await asyncio.sleep(1)
                        return True
                except Exception:
                    continue

            # Try clicking any visible button in the modal overlay
            modal = await page.query_selector('div[x-show="showPrivacyModal"]')
            if modal:
                buttons = await modal.query_selector_all('button')
                for btn in buttons:
                    if await btn.is_visible():
                        text = await btn.inner_text()
                        logger.info(f"Found modal button: {text}")
                        # Click the first visible button (usually accept)
                        await btn.click()
                        await asyncio.sleep(1)
                        return True

            logger.info("No cookie modal found or already dismissed")
            return False
        except Exception as e:
            logger.debug(f"Cookie modal handling: {e}")
            return False