import asyncio
import logging
import hashlib
import re
import csv
from datetime import datetime
from pathlib import Path
import json
import os
from typing import Optional

from playwright.async_api import async_playwright, Browser, BrowserContext, Playwright

logger = logging.getLogger(__name__)

WGCOMPANY_LISTINGS_FILE = Path("data/wgcompany_listings.json")
WGCOMPANY_TIMING_FILE = Path("data/wgcompany_times.csv")
CONTACTS_FILE = Path("data/contacts.csv")

# Environment variables for search filters
WGCOMPANY_MIN_SIZE = os.environ.get("WGCOMPANY_MIN_SIZE", "")
WGCOMPANY_MAX_PRICE = os.environ.get("WGCOMPANY_MAX_PRICE", "")
WGCOMPANY_AGE = os.environ.get("WGCOMPANY_AGE", "")
WGCOMPANY_SMOKER = os.environ.get("WGCOMPANY_SMOKER", "")
WGCOMPANY_BEZIRK = os.environ.get("WGCOMPANY_BEZIRK", "0")


class WGCompanyNotifier:
    """Polls wgcompany.de for new WG listings and reports them (e.g. via Telegram)."""

    def __init__(self, telegram_bot=None, refresh_minutes=10):
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.playwright: Optional[Playwright] = None
        self.telegram_bot = telegram_bot
        self.refresh_minutes = refresh_minutes

    async def init_browser(self):
        """Start Playwright and a headless Chromium browser on first use."""
        if self.browser is None:
            self.playwright = await async_playwright().start()
            self.browser = await self.playwright.chromium.launch(headless=True)
            self.context = await self.browser.new_context()
            logger.debug("[WG] Browser ready")

    async def fetch_listings(self):
        """Load the search page, apply the configured filters, and parse the result rows."""
        await self.init_browser()
        listings = []
        try:
            assert self.context is not None, "Browser context not initialized"
            page = await self.context.new_page()

            search_url = "http://www.wgcompany.de/cgi-bin/seite?st=1&mi=10&li=100"
            logger.info(f"[WGCOMPANY] Loading search page: {search_url}")
            await page.goto(search_url, wait_until="networkidle")
            await asyncio.sleep(2)

            # Fill in the optional search filters configured via environment variables
            if WGCOMPANY_MIN_SIZE:
                min_size_field = await page.query_selector('input[name="c"]')
                if min_size_field:
                    await min_size_field.fill(WGCOMPANY_MIN_SIZE)

            if WGCOMPANY_MAX_PRICE:
                max_price_field = await page.query_selector('input[name="a"]')
                if max_price_field:
                    await max_price_field.fill(WGCOMPANY_MAX_PRICE)

            if WGCOMPANY_AGE:
                age_field = await page.query_selector('input[name="l"]')
                if age_field:
                    await age_field.fill(WGCOMPANY_AGE)

            if WGCOMPANY_SMOKER:
                smoker_select = await page.query_selector('select[name="o"]')
                if smoker_select:
                    await smoker_select.select_option(WGCOMPANY_SMOKER)

            if WGCOMPANY_BEZIRK and WGCOMPANY_BEZIRK != "0":
                bezirk_select = await page.query_selector('select[name="e"]')
                if bezirk_select:
                    await bezirk_select.select_option(WGCOMPANY_BEZIRK)

            # Submit the search form and wait for the results page
            submit_btn = await page.query_selector('input[type="submit"][value*="finde"], input[type="submit"]')
            if submit_btn:
                await submit_btn.click()
                await page.wait_for_load_state("networkidle")
                await asyncio.sleep(2)

            # Keep a copy of the results page for debugging selector issues
            content = await page.content()
            with open("data/wgcompany_debug.html", "w", encoding="utf-8") as f:
                f.write(content)

            listing_links = await page.query_selector_all('a[href*="wg.pl"][href*="wgzeigen"]')
            logger.info(f"[WGCOMPANY] Found {len(listing_links)} listing links")

            for link_elem in listing_links:
                try:
                    href = await link_elem.get_attribute("href")
                    if not href:
                        continue

                    # Price and size are parsed from the surrounding table row
                    parent = await link_elem.evaluate_handle("el => el.closest('tr') || el.parentElement")
                    row_text = await parent.evaluate("el => el.innerText") if parent else ""

                    price_match = re.search(r'(\d+)\s*€', row_text)
                    price = price_match.group(1) + " €" if price_match else "?"

                    size_match = re.search(r'(\d+)\s*m²', row_text)
                    size = size_match.group(1) + " m²" if size_match else "?"
                    # Guess the Bezirk (district) from the row text, defaulting to "Berlin"
                    bezirk_patterns = [
                        "Kreuzberg", "Neukölln", "Friedrichshain", "Prenzlauer Berg", "Mitte",
                        "Wedding", "Charlottenburg", "Schöneberg", "Tempelhof", "Steglitz",
                        "Wilmersdorf", "Pankow", "Lichtenberg", "Treptow", "Köpenick",
                        "Reinickendorf", "Spandau", "Zehlendorf", "Moabit"
                    ]
                    location = "Berlin"
                    for bez in bezirk_patterns:
                        if bez.lower() in row_text.lower():
                            location = bez
                            break

                    if not href.startswith("http"):
                        href = f"http://www.wgcompany.de{href}" if href.startswith("/") else f"http://www.wgcompany.de/cgi-bin/{href}"

                    # Stable ID derived from link, price and size so repeat fetches deduplicate
                    listing_id = hashlib.md5(f"{href}{price}{size}".encode()).hexdigest()[:12]
                    listings.append({
                        "id": listing_id,
                        "rooms": "1 Zimmer (WG)",
                        "size": size,
                        "price": price,
                        "address": location,
                        "link": href,
                        "source": "wgcompany",
                        "fetched_at": datetime.now().isoformat()
                    })
                except Exception as e:
                    logger.debug(f"[WGCOMPANY] Error parsing listing: {e}")
                    continue

            # Deduplicate
            seen_ids = set()
            unique_listings = []
            for listing in listings:
                if listing["id"] not in seen_ids:
                    seen_ids.add(listing["id"])
                    unique_listings.append(listing)

            await page.close()

            if len(unique_listings) == 0:
                logger.warning("[WGCOMPANY] Fetched 0 listings - possible page load failure")
            else:
                logger.info(f"[WGCOMPANY] Fetched {len(unique_listings)} unique listings")
            return unique_listings

        except Exception as e:
            logger.error(f"[WGCOMPANY] Error fetching listings: {e}", exc_info=True)
            return []

    def load_previous_listings(self):
        """Return previously saved listings keyed by ID, or an empty dict on first run."""
        if WGCOMPANY_LISTINGS_FILE.exists():
            with open(WGCOMPANY_LISTINGS_FILE, 'r', encoding='utf-8') as f:
                data = json.load(f)
            logger.info(f"[WGCOMPANY] Loaded {len(data)} previous listings from file")
            return data
        logger.info("[WGCOMPANY] No previous listings file found, starting fresh")
        return {}

    def save_listings(self, listings: list[dict]) -> None:
        """Persist the current listings to disk, keyed by listing ID."""
        listings_dict = {l['id']: l for l in listings}
        logger.info(f"[WGCOMPANY] Saving {len(listings_dict)} listings to file")
        with open(WGCOMPANY_LISTINGS_FILE, 'w', encoding='utf-8') as f:
            json.dump(listings_dict, f, indent=2, ensure_ascii=False)

    def find_new_listings(self, current: list[dict], previous: dict) -> list[dict]:
        """Return listings whose IDs were not present in the previous run."""
        new = []
        for listing in current:
            if listing['id'] not in previous:
                new.append(listing)
        if new:
            logger.info(f"[WG] {len(new)} new listing{'s' if len(new) > 1 else ''} detected")
        return new

    async def fetch_listing_details(self, listing_url: str) -> dict:
        """Fetch detailed information from a listing page including email."""
        details = {
            "email": "",
            "contact_person": "",
            "address": "",
            "description": "",
            "wg_name": ""
        }
        try:
            assert self.context is not None, "Browser context not initialized"
            page = await self.context.new_page()
            await page.goto(listing_url, wait_until="networkidle")
            await asyncio.sleep(1)
            content = await page.content()

            # Extract email (look for patterns like email: xxx@yyy.zz or Email: xxx)
            # Priority: Look for email in table cell context (WG-specific email), exclude footer email
            email_patterns = [
                r'email\s*:\s*\s*