wohnbot/handlers/wgcompany_notifier.py

218 lines
10 KiB
Python
Raw Normal View History

import asyncio
import logging
import hashlib
import re
from datetime import datetime
from pathlib import Path
import json
import os
from playwright.async_api import async_playwright
logger = logging.getLogger(__name__)
WGCOMPANY_LISTINGS_FILE = Path("data/wgcompany_listings.json")
WGCOMPANY_TIMING_FILE = Path("data/wgcompany_times.csv")
# Environment variables for search filters
WGCOMPANY_MIN_SIZE = os.environ.get("WGCOMPANY_MIN_SIZE", "")
WGCOMPANY_MAX_PRICE = os.environ.get("WGCOMPANY_MAX_PRICE", "")
WGCOMPANY_AGE = os.environ.get("WGCOMPANY_AGE", "")
WGCOMPANY_SMOKER = os.environ.get("WGCOMPANY_SMOKER", "")
WGCOMPANY_BEZIRK = os.environ.get("WGCOMPANY_BEZIRK", "0")
class WGCompanyNotifier:
def __init__(self, telegram_bot=None, refresh_minutes=10):
self.browser = None
self.context = None
self.telegram_bot = telegram_bot
self.refresh_minutes = refresh_minutes
async def init_browser(self):
if self.browser is None:
self.playwright = await async_playwright().start()
self.browser = await self.playwright.chromium.launch(headless=True)
self.context = await self.browser.new_context(
user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
)
logger.info("[WGCOMPANY] Browser initialized")
async def fetch_listings(self):
2025-12-31 16:06:42 +01:00
await self.init_browser()
listings = []
try:
page = await self.context.new_page()
search_url = "http://www.wgcompany.de/cgi-bin/seite?st=1&mi=10&li=100"
logger.info(f"[WGCOMPANY] Loading search page: {search_url}")
await page.goto(search_url, wait_until="networkidle")
await asyncio.sleep(2)
if WGCOMPANY_MIN_SIZE:
min_size_field = await page.query_selector('input[name="c"]')
if min_size_field:
await min_size_field.fill(WGCOMPANY_MIN_SIZE)
if WGCOMPANY_MAX_PRICE:
max_price_field = await page.query_selector('input[name="a"]')
if max_price_field:
await max_price_field.fill(WGCOMPANY_MAX_PRICE)
if WGCOMPANY_AGE:
age_field = await page.query_selector('input[name="l"]')
if age_field:
await age_field.fill(WGCOMPANY_AGE)
if WGCOMPANY_SMOKER:
smoker_select = await page.query_selector('select[name="o"]')
if smoker_select:
await smoker_select.select_option(WGCOMPANY_SMOKER)
if WGCOMPANY_BEZIRK and WGCOMPANY_BEZIRK != "0":
bezirk_select = await page.query_selector('select[name="e"]')
if bezirk_select:
await bezirk_select.select_option(WGCOMPANY_BEZIRK)
submit_btn = await page.query_selector('input[type="submit"][value*="finde"], input[type="submit"]')
if submit_btn:
await submit_btn.click()
await page.wait_for_load_state("networkidle")
await asyncio.sleep(2)
content = await page.content()
with open("data/wgcompany_debug.html", "w", encoding="utf-8") as f:
f.write(content)
listing_links = await page.query_selector_all('a[href*="wg.pl"][href*="wgzeigen"]')
logger.info(f"[WGCOMPANY] Found {len(listing_links)} listing links")
for link_elem in listing_links:
try:
href = await link_elem.get_attribute("href")
if not href:
continue
parent = await link_elem.evaluate_handle("el => el.closest('tr') || el.parentElement")
row_text = await parent.evaluate("el => el.innerText") if parent else ""
price_match = re.search(r'(\d+)\s*€', row_text)
price = price_match.group(1) + "" if price_match else "?"
size_match = re.search(r'(\d+)\s*m²', row_text)
size = size_match.group(1) + "" if size_match else "?"
bezirk_patterns = [
"Kreuzberg", "Neukölln", "Friedrichshain", "Prenzlauer Berg",
"Mitte", "Wedding", "Charlottenburg", "Schöneberg", "Tempelhof",
"Steglitz", "Wilmersdorf", "Pankow", "Lichtenberg", "Treptow",
"Köpenick", "Reinickendorf", "Spandau", "Zehlendorf", "Moabit"
]
location = "Berlin"
for bez in bezirk_patterns:
if bez.lower() in row_text.lower():
location = bez
break
if not href.startswith("http"):
href = f"http://www.wgcompany.de{href}" if href.startswith("/") else f"http://www.wgcompany.de/cgi-bin/{href}"
listing_id = hashlib.md5(f"{href}{price}{size}".encode()).hexdigest()[:12]
listings.append({
"id": listing_id,
"rooms": "1 Zimmer (WG)",
"size": size,
"price": price,
"address": location,
"link": href,
"source": "wgcompany",
"fetched_at": datetime.now().isoformat()
})
except Exception as e:
logger.debug(f"[WGCOMPANY] Error parsing listing: {e}")
continue
# Deduplicate
seen_ids = set()
unique_listings = []
for listing in listings:
if listing["id"] not in seen_ids:
seen_ids.add(listing["id"])
unique_listings.append(listing)
await page.close()
logger.info(f"[WGCOMPANY] Fetched {len(unique_listings)} unique listings")
return unique_listings
except Exception as e:
logger.error(f"[WGCOMPANY] Error fetching listings: {e}")
return []
def load_previous_listings(self):
if WGCOMPANY_LISTINGS_FILE.exists():
with open(WGCOMPANY_LISTINGS_FILE, "r") as f:
data = json.load(f)
logger.info(f"[WGCOMPANY] Loaded {len(data)} previous listings from file. IDs: {list(data.keys())[:10]}{'...' if len(data) > 10 else ''}")
return data
logger.info("[WGCOMPANY] No previous listings file found.")
return {}
def save_listings(self, listings):
listings_dict = {l["id"]: l for l in listings}
logger.info(f"[WGCOMPANY] Saving {len(listings_dict)} listings to file. IDs: {list(listings_dict.keys())[:10]}{'...' if len(listings_dict) > 10 else ''}")
with open(WGCOMPANY_LISTINGS_FILE, "w") as f:
json.dump(listings_dict, f, indent=2, ensure_ascii=False)
def find_new_listings(self, current, previous):
current_ids = [l["id"] for l in current]
previous_ids = list(previous.keys())
logger.info(f"[WGCOMPANY] Current listing IDs: {current_ids[:10]}{'...' if len(current_ids) > 10 else ''}")
logger.info(f"[WGCOMPANY] Previous listing IDs: {previous_ids[:10]}{'...' if len(previous_ids) > 10 else ''}")
new_listings = [l for l in current if l["id"] not in previous]
logger.info(f"[WGCOMPANY] Detected {len(new_listings)} new listings (not in previous)")
return new_listings
def log_listing_times(self, new_listings):
if not new_listings:
return
import csv
file_exists = WGCOMPANY_TIMING_FILE.exists()
with open(WGCOMPANY_TIMING_FILE, "a", newline="", encoding="utf-8") as f:
writer = csv.writer(f)
if not file_exists:
writer.writerow(["timestamp", "weekday", "hour", "minute", "rooms", "size", "price", "address", "listing_id"])
now = datetime.now()
for listing in new_listings:
writer.writerow([
now.isoformat(),
now.strftime("%A"),
now.hour,
now.minute,
listing["rooms"],
listing["size"],
listing["price"],
listing["address"],
listing["id"]
])
logger.info(f"[WGCOMPANY] Logged {len(new_listings)} listing times to CSV")
async def notify_new_listings(self, new_listings):
if not new_listings or not self.telegram_bot:
logger.info("[WGCOMPANY] No new listings to notify or Telegram bot not set.")
return
logger.info(f"[WGCOMPANY] Notifying {len(new_listings)} new listing(s) via Telegram")
for idx, listing in enumerate(new_listings, 1):
try:
logger.info(f"[WGCOMPANY] Sending listing {idx}/{len(new_listings)}: {listing['link']} | {listing['rooms']} | {listing['size']} | {listing['price']} | {listing['address']}")
message = f"<b>[WGCOMPANY]</b> <a href=\"{listing['link']}\">{listing['link']}</a>\n"
message += f"🚪 <b>{listing['rooms']}</b>\n"
message += f"📐 {listing['size']}\n"
message += f"💰 {listing['price']}\n"
message += f"📍 {listing['address']}"
await self.telegram_bot._send_message(message)
await asyncio.sleep(0.5)
except Exception as e:
logger.error(f"[WGCOMPANY] Error sending Telegram message for listing {idx}/{len(new_listings)}: {e}")
import traceback
logger.error(traceback.format_exc())
async def run(self):
await self.init_browser()
while True:
listings = await self.fetch_listings()
previous = self.load_previous_listings()
new_listings = self.find_new_listings(listings, previous)
if new_listings:
logger.info(f"[WGCOMPANY] Found {len(new_listings)} new listing(s)")
self.log_listing_times(new_listings)
await self.notify_new_listings(new_listings)
else:
logger.info("[WGCOMPANY] No new listings")
self.save_listings(listings)
await asyncio.sleep(self.refresh_minutes * 60)