From 92912e8487288c19d4cb12af11cc8e12e16cbed7 Mon Sep 17 00:00:00 2001
From: Aron
Date: Thu, 8 Jan 2026 21:04:43 +0100
Subject: [PATCH] upd gewobag

---
 .env.example                       |   5 +
 handlers/gewobag_handler.py        |  73 ++++++++++
 handlers/wgcompany_notifier.py     |  17 ++-
 helper_functions/merge_all_data.py | 210 +++++++++++++++++++++++++++++
 4 files changed, 300 insertions(+), 5 deletions(-)
 create mode 100644 helper_functions/merge_all_data.py

diff --git a/.env.example b/.env.example
index 696dc2b..bd1e076 100644
--- a/.env.example
+++ b/.env.example
@@ -24,6 +24,9 @@ FORM_PERSONS=2
 FORM_ADULTS=1
 FORM_CHILDREN=0
 FORM_INCOME=2500
+FORM_WBS_VALID_UNTIL=26.11.2026
+FORM_WBS_TYPE=WBS 100
+FORM_WBS_ROOMS=1
 
 # WGcompany.de Search Filters (optional)
 WGCOMPANY_ENABLED=true
@@ -32,3 +35,5 @@ WGCOMPANY_MAX_SIZE=
 WGCOMPANY_MIN_PRICE=
 WGCOMPANY_MAX_PRICE=
 WGCOMPANY_BEZIRK=0
+WGCOMPANY_AGE=
+WGCOMPANY_SMOKER=
diff --git a/handlers/gewobag_handler.py b/handlers/gewobag_handler.py
index b48bfe6..882d24f 100644
--- a/handlers/gewobag_handler.py
+++ b/handlers/gewobag_handler.py
@@ -236,10 +236,83 @@ class GewobagHandler(BaseHandler):
             if wbs_ja:
                 await wbs_ja.click()
                 logger.info("[GEWOBAG] Selected WBS: Ja")
+                await asyncio.sleep(1)  # Wait for conditional WBS fields to appear
+
+                # Save debug HTML after WBS fields are visible
+                try:
+                    html_content = await iframe_page.content()
+                    debug_html_path = DATA_DIR / f"gewobag_wbs_fields_{listing['id']}.html"
+                    with open(debug_html_path, "w", encoding="utf-8") as f:
+                        f.write(html_content)
+                    logger.info(f"[GEWOBAG] Saved WBS fields debug HTML to {debug_html_path}")
+                except Exception as e:
+                    logger.warning(f"[GEWOBAG] Could not save WBS debug HTML: {e}")
+
                 form_filled = True
         except Exception as e:
             logger.warning(f"[GEWOBAG] Could not select WBS: {e}")
 
+        # WBS Gültigkeit (validity date) - appears after selecting Ja
+        try:
+            wbs_valid_until = os.environ.get("FORM_WBS_VALID_UNTIL", "26.11.2026")
+            wbs_date_input = await iframe_page.query_selector('#formly_6_input_\\$\\$_wbs_valid_until_\\$\\$_0')
+            if wbs_date_input:
+                await wbs_date_input.fill(wbs_valid_until)
+                logger.info(f"[GEWOBAG] Filled WBS Gültigkeit: {wbs_valid_until}")
+                form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not fill WBS Gültigkeit: {e}")
+
+        # WBS Art/Bezeichnung (type) dropdown - appears after selecting Ja
+        try:
+            wbs_type = os.environ.get("FORM_WBS_TYPE", "WBS 100")
+            wbs_type_input = await iframe_page.query_selector('#formly_6_select_gewobag_art_bezeichnung_des_wbs_1')
+            if wbs_type_input:
+                await wbs_type_input.click()
+                await iframe_page.wait_for_timeout(300)
+                wbs_type_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_type}")')
+                if wbs_type_option:
+                    await wbs_type_option.click()
+                    logger.info(f"[GEWOBAG] Selected WBS Type: {wbs_type}")
+                    form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not select WBS Type: {e}")
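+
+        # NOTE: the formly_N prefixes in these selectors are positional ids
+        # generated by the Angular Formly form and may shift if Gewobag reorders
+        # fields. A substring lookup on the stable field-name part would be a
+        # possible fallback (sketch only, not exercised here):
+        #   wbs_date_input = await iframe_page.query_selector('[id*="wbs_valid_until"]')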
+
+        # WBS Anzahl Räume (number of rooms) dropdown - appears after selecting Ja
+        try:
+            wbs_rooms = os.environ.get("FORM_WBS_ROOMS", "1")
+            wbs_rooms_input = await iframe_page.query_selector('#formly_7_select_\\$\\$_wbs_max_number_rooms_\\$\\$_0')
+            if wbs_rooms_input:
+                await wbs_rooms_input.click()
+                await iframe_page.wait_for_timeout(300)
+                wbs_rooms_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_rooms}")')
+                if wbs_rooms_option:
+                    await wbs_rooms_option.click()
+                    logger.info(f"[GEWOBAG] Selected WBS Rooms: {wbs_rooms}")
+                    form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not select WBS Rooms: {e}")
+
+        # WBS file upload - Upload the WBS PDF and PNG from data/uploads
+        try:
+            wbs_files = [
+                Path("data/uploads/WBS_Antrag_Bestaetigung.pdf"),
+                Path("data/uploads/WBS_Rechner.png")
+            ]
+            existing_files = [str(f) for f in wbs_files if f.exists()]
+
+            if existing_files:
+                file_input = await iframe_page.query_selector('input[type="file"]')
+                if file_input:
+                    await file_input.set_input_files(existing_files)
+                    await asyncio.sleep(1)  # Wait for upload to process
+                    logger.info(f"[GEWOBAG] Uploaded {len(existing_files)} WBS file(s): {', '.join([Path(f).name for f in existing_files])}")
+                    form_filled = True
+            else:
+                logger.warning("[GEWOBAG] No WBS files found in data/uploads")
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not upload WBS files: {e}")
+
         # Privacy checkbox (Main Datenschutzbestimmungen) - REQUIRED
         try:
             privacy_checkbox = await iframe_page.query_selector('#formly_20_checkbox_gewobag_datenschutzhinweis_bestaetigt_0')
diff --git a/handlers/wgcompany_notifier.py b/handlers/wgcompany_notifier.py
index 032620f..2cdafcc 100644
--- a/handlers/wgcompany_notifier.py
+++ b/handlers/wgcompany_notifier.py
@@ -177,14 +177,21 @@
         content = await page.content()
 
         # Extract email (look for patterns like email: xxx@yyy.zz or Email: xxx)
+        # Priority: Look for email in table cell context (WG-specific email), exclude footer email
         email_patterns = [
-            r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
-            r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
+            r'email\s*:\s*</td>\s*<td[^>]*>.*?mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Table cell email
+            r'<a href="mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})">',  # Any mailto link
+            r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Plain email: pattern
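+            # The table-cell pattern assumes detail pages render contact rows
+            # roughly as (illustrative markup, address hypothetical):
+            #   <td>email:</td><td><a href="mailto:wg@example.org">...</a></td>
+            # with the bare mailto pattern as a broad fallback.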
         ]
         for pattern in email_patterns:
-            email_match = re.search(pattern, content)
-            if email_match:
-                details["email"] = email_match.group(1)
+            email_matches = re.finditer(pattern, content, re.IGNORECASE | re.DOTALL)
+            for match in email_matches:
+                email = match.group(1)
+                # Exclude the footer/contact email
+                if email != "wgcompany@wgcompany.de":
+                    details["email"] = email
+                    break
+            if "email" in details:
                 break
 
         # Extract WG name from URL
diff --git a/helper_functions/merge_all_data.py b/helper_functions/merge_all_data.py
new file mode 100644
index 0000000..93a8ebb
--- /dev/null
+++ b/helper_functions/merge_all_data.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Merge all data from prod and dev environments.
+Handles applications.json, listings.json, wgcompany_listings.json, and CSV files.
+For failed applications with duplicates, keeps the earlier timestamp.
+"""
+import json
+import pandas as pd
+from pathlib import Path
+from datetime import datetime
+
+def parse_timestamp(ts_str):
+    """Parse ISO format timestamp string to datetime object."""
+    if ts_str:
+        try:
+            return datetime.fromisoformat(ts_str)
+        except Exception:
+            return None
+    return None
+
+def merge_applications(local_path, merge_path, output_path=None):
+    """
+    Merge two applications.json files, deduplicate by listing_id.
+    Special handling: For failed applications with duplicates, keep the earlier timestamp.
+    For successful applications, keep the entry with more complete data.
+    """
+    output_path = output_path or local_path
+    with open(local_path, encoding='utf-8') as f:
+        local = json.load(f)
+    with open(merge_path, encoding='utf-8') as f:
+        remote = json.load(f)
+
+    merged = {}
+    all_keys = set(local.keys()) | set(remote.keys())
+
+    for key in all_keys:
+        l_entry = local.get(key)
+        r_entry = remote.get(key)
+
+        if l_entry and r_entry:
+            # Both have this application
+            l_success = l_entry.get('success', False)
+            r_success = r_entry.get('success', False)
+            l_ts = parse_timestamp(l_entry.get('timestamp'))
+            r_ts = parse_timestamp(r_entry.get('timestamp'))
+
+            # If both failed, keep the one with earlier timestamp (to avoid timestamp corruption bug)
+            if not l_success and not r_success:
+                if l_ts and r_ts:
+                    merged[key] = l_entry if l_ts < r_ts else r_entry
+                else:
+                    merged[key] = l_entry  # fallback if timestamp missing
+            # If one succeeded and one failed, keep the successful one
+            elif l_success and not r_success:
+                merged[key] = l_entry
+            elif r_success and not l_success:
+                merged[key] = r_entry
+            # If both succeeded, prefer entry with more fields, or latest timestamp
+            else:
+                if len(l_entry) > len(r_entry):
+                    merged[key] = l_entry
+                elif len(r_entry) > len(l_entry):
+                    merged[key] = r_entry
+                else:
+                    # Same length, prefer latest timestamp
+                    if l_ts and r_ts:
+                        merged[key] = l_entry if l_ts > r_ts else r_entry
+                    else:
+                        merged[key] = l_entry
+        else:
+            # Only one has this application
+            merged[key] = l_entry or r_entry
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(merged, f, ensure_ascii=False, indent=2)
+
+    print(f"✓ Merged applications: {len(merged)} unique entries → {output_path}")
+    return merged
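+
+# Illustration of the failed-duplicate rule with hypothetical entries:
+#   local  = {"123": {"success": False, "timestamp": "2026-01-08T10:00:00"}}
+#   remote = {"123": {"success": False, "timestamp": "2026-01-08T12:00:00"}}
+# merge_applications() keeps the 10:00 entry, since for failed applications the
+# later timestamp may stem from the timestamp corruption bug noted above.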
+
+def merge_dict_json(local_path, merge_path, output_path=None, timestamp_field='fetched_at'):
+    """
+    Merge two dict-based JSON files (keyed by id), deduplicate by key.
+    If duplicate, keep entry with latest timestamp_field.
+    """
+    output_path = output_path or local_path
+    with open(local_path, encoding='utf-8') as f:
+        local = json.load(f)
+    with open(merge_path, encoding='utf-8') as f:
+        remote = json.load(f)
+
+    merged = {}
+    all_keys = set(local.keys()) | set(remote.keys())
+
+    for key in all_keys:
+        l_entry = local.get(key)
+        r_entry = remote.get(key)
+
+        if l_entry and r_entry:
+            l_ts = l_entry.get(timestamp_field)
+            r_ts = r_entry.get(timestamp_field)
+            if l_ts and r_ts:
+                merged[key] = l_entry if l_ts > r_ts else r_entry
+            else:
+                merged[key] = l_entry
+        else:
+            merged[key] = l_entry or r_entry
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(merged, f, ensure_ascii=False, indent=2)
+
+    print(f"✓ Merged {Path(local_path).name}: {len(merged)} unique entries → {output_path}")
+    return merged
+
+def merge_csv_times(local_path, merge_path, output_path=None):
+    """
+    Merge two CSV files with time-series data, deduplicate by all columns.
+    Keeps unique rows based on all column values.
+    """
+    output_path = output_path or local_path
+
+    local_df = pd.read_csv(local_path)
+    remote_df = pd.read_csv(merge_path)
+
+    # Combine and drop duplicates
+    merged_df = pd.concat([local_df, remote_df], ignore_index=True)
+    merged_df = merged_df.drop_duplicates()
+
+    # Sort by timestamp if present
+    if 'timestamp' in merged_df.columns:
+        merged_df = merged_df.sort_values('timestamp')
+
+    merged_df.to_csv(output_path, index=False)
+
+    print(f"✓ Merged {Path(local_path).name}: {len(merged_df)} rows → {output_path}")
+    return merged_df
+
+def merge_all_data(local_base_dir="data", merge_base_dir="data/to_merge", output_base_dir=None):
+    """
+    Main function to merge all data from prod and dev environments.
+
+    Args:
+        local_base_dir: Base directory for local (dev) data
+        merge_base_dir: Base directory for data to merge (prod)
+        output_base_dir: Output directory (defaults to local_base_dir)
+
+    Returns:
+        dict: Summary of merge results
+    """
+    output_base_dir = output_base_dir or local_base_dir
+    local_base = Path(local_base_dir)
+    merge_base = Path(merge_base_dir)
+    output_base = Path(output_base_dir)
+
+    print("=" * 60)
+    print("MERGING PROD AND DEV DATA")
+    print("=" * 60)
+
+    results = {}
+
+    # 1. Merge applications.json (special handling for failed duplicates)
+    if (local_base / "applications.json").exists() and (merge_base / "applications.json").exists():
+        results['applications'] = merge_applications(
+            str(local_base / "applications.json"),
+            str(merge_base / "applications.json"),
+            str(output_base / "applications.json")
+        )
+
+    # 2. Merge listings.json
+    if (local_base / "listings.json").exists() and (merge_base / "listings.json").exists():
+        results['listings'] = merge_dict_json(
+            str(local_base / "listings.json"),
+            str(merge_base / "listings.json"),
+            str(output_base / "listings.json"),
+            timestamp_field='fetched_at'
+        )
+
+    # 3. Merge wgcompany_listings.json
+    if (local_base / "wgcompany_listings.json").exists() and (merge_base / "wgcompany_listings.json").exists():
+        results['wgcompany_listings'] = merge_dict_json(
+            str(local_base / "wgcompany_listings.json"),
+            str(merge_base / "wgcompany_listings.json"),
+            str(output_base / "wgcompany_listings.json"),
+            timestamp_field='fetched_at'
+        )
+
+    # 4. Merge listing_times.csv
+    if (local_base / "listing_times.csv").exists() and (merge_base / "listing_times.csv").exists():
+        results['listing_times'] = merge_csv_times(
+            str(local_base / "listing_times.csv"),
+            str(merge_base / "listing_times.csv"),
+            str(output_base / "listing_times.csv")
+        )
+
+    # 5. Merge wgcompany_times.csv
+    if (local_base / "wgcompany_times.csv").exists() and (merge_base / "wgcompany_times.csv").exists():
+        results['wgcompany_times'] = merge_csv_times(
+            str(local_base / "wgcompany_times.csv"),
+            str(merge_base / "wgcompany_times.csv"),
+            str(output_base / "wgcompany_times.csv")
+        )
+
+    print("=" * 60)
+    print("MERGE COMPLETE")
+    print("=" * 60)
+
+    return results
+
+if __name__ == "__main__":
+    # Usage: Place prod data in data/to_merge/ directory, then run this script
+    merge_all_data()
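+    # For a non-default layout, the directories can be overridden explicitly
+    # (paths below are illustrative):
+    #   merge_all_data(local_base_dir="data",
+    #                  merge_base_dir="data/to_merge",
+    #                  output_base_dir="data/merged")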