From 92912e8487288c19d4cb12af11cc8e12e16cbed7 Mon Sep 17 00:00:00 2001
From: Aron
Date: Thu, 8 Jan 2026 21:04:43 +0100
Subject: [PATCH] upd gewobag

---
 .env.example                       |   5 +
 handlers/gewobag_handler.py        |  73 ++++++++++
 handlers/wgcompany_notifier.py     |  17 ++-
 helper_functions/merge_all_data.py | 210 +++++++++++++++++++++++++++++
 4 files changed, 300 insertions(+), 5 deletions(-)
 create mode 100644 helper_functions/merge_all_data.py

diff --git a/.env.example b/.env.example
index 696dc2b..bd1e076 100644
--- a/.env.example
+++ b/.env.example
@@ -24,6 +24,9 @@ FORM_PERSONS=2
 FORM_ADULTS=1
 FORM_CHILDREN=0
 FORM_INCOME=2500
+FORM_WBS_VALID_UNTIL=26.11.2026
+FORM_WBS_TYPE=WBS 100
+FORM_WBS_ROOMS=1
 
 # WGcompany.de Search Filters (optional)
 WGCOMPANY_ENABLED=true
@@ -32,3 +35,5 @@ WGCOMPANY_MAX_SIZE=
 WGCOMPANY_MIN_PRICE=
 WGCOMPANY_MAX_PRICE=
 WGCOMPANY_BEZIRK=0
+WGCOMPANY_AGE=
+WGCOMPANY_SMOKER=
diff --git a/handlers/gewobag_handler.py b/handlers/gewobag_handler.py
index b48bfe6..882d24f 100644
--- a/handlers/gewobag_handler.py
+++ b/handlers/gewobag_handler.py
@@ -236,10 +236,83 @@ class GewobagHandler(BaseHandler):
             if wbs_ja:
                 await wbs_ja.click()
                 logger.info("[GEWOBAG] Selected WBS: Ja")
+                await asyncio.sleep(1)  # Wait for conditional WBS fields to appear
+
+                # Save debug HTML after WBS fields are visible
+                try:
+                    html_content = await iframe_page.content()
+                    debug_html_path = DATA_DIR / f"gewobag_wbs_fields_{listing['id']}.html"
+                    with open(debug_html_path, "w", encoding="utf-8") as f:
+                        f.write(html_content)
+                    logger.info(f"[GEWOBAG] Saved WBS fields debug HTML to {debug_html_path}")
+                except Exception as e:
+                    logger.warning(f"[GEWOBAG] Could not save WBS debug HTML: {e}")
+
                 form_filled = True
         except Exception as e:
             logger.warning(f"[GEWOBAG] Could not select WBS: {e}")
 
+        # WBS Gültigkeit (validity date) - appears after selecting Ja
+        try:
+            wbs_valid_until = os.environ.get("FORM_WBS_VALID_UNTIL", "26.11.2026")
+            wbs_date_input = await iframe_page.query_selector('#formly_6_input_\\$\\$_wbs_valid_until_\\$\\$_0')
+            if wbs_date_input:
+                await wbs_date_input.fill(wbs_valid_until)
+                logger.info(f"[GEWOBAG] Filled WBS Gültigkeit: {wbs_valid_until}")
+                form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not fill WBS Gültigkeit: {e}")
+
+        # WBS Art/Bezeichnung (type) dropdown - appears after selecting Ja
+        try:
+            wbs_type = os.environ.get("FORM_WBS_TYPE", "WBS 100")
+            wbs_type_input = await iframe_page.query_selector('#formly_6_select_gewobag_art_bezeichnung_des_wbs_1')
+            if wbs_type_input:
+                await wbs_type_input.click()
+                await iframe_page.wait_for_timeout(300)
+                wbs_type_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_type}")')
+                if wbs_type_option:
+                    await wbs_type_option.click()
+                    logger.info(f"[GEWOBAG] Selected WBS Type: {wbs_type}")
+                    form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not select WBS Type: {e}")
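+
+        # NOTE: the formly_N prefixes in these selectors are positional ids
+        # generated by the Angular Formly form and may shift if Gewobag reorders
+        # fields. A substring lookup on the stable field-name part would be a
+        # possible fallback (sketch only, not exercised here):
+        #   wbs_date_input = await iframe_page.query_selector('[id*="wbs_valid_until"]')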
+
+        # WBS Anzahl Räume (number of rooms) dropdown - appears after selecting Ja
+        try:
+            wbs_rooms = os.environ.get("FORM_WBS_ROOMS", "1")
+            wbs_rooms_input = await iframe_page.query_selector('#formly_7_select_\\$\\$_wbs_max_number_rooms_\\$\\$_0')
+            if wbs_rooms_input:
+                await wbs_rooms_input.click()
+                await iframe_page.wait_for_timeout(300)
+                wbs_rooms_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_rooms}")')
+                if wbs_rooms_option:
+                    await wbs_rooms_option.click()
+                    logger.info(f"[GEWOBAG] Selected WBS Rooms: {wbs_rooms}")
+                    form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not select WBS Rooms: {e}")
+
+        # WBS file upload - Upload the WBS PDF and PNG from data/uploads
+        try:
+            wbs_files = [
+                Path("data/uploads/WBS_Antrag_Bestaetigung.pdf"),
+                Path("data/uploads/WBS_Rechner.png")
+            ]
+            existing_files = [str(f) for f in wbs_files if f.exists()]
+
+            if existing_files:
+                file_input = await iframe_page.query_selector('input[type="file"]')
+                if file_input:
+                    await file_input.set_input_files(existing_files)
+                    await asyncio.sleep(1)  # Wait for upload to process
+                    logger.info(f"[GEWOBAG] Uploaded {len(existing_files)} WBS file(s): {', '.join([Path(f).name for f in existing_files])}")
+                    form_filled = True
+            else:
+                logger.warning("[GEWOBAG] No WBS files found in data/uploads")
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not upload WBS files: {e}")
+
         # Privacy checkbox (Main Datenschutzbestimmungen) - REQUIRED
         try:
             privacy_checkbox = await iframe_page.query_selector('#formly_20_checkbox_gewobag_datenschutzhinweis_bestaetigt_0')
diff --git a/handlers/wgcompany_notifier.py b/handlers/wgcompany_notifier.py
index 032620f..2cdafcc 100644
--- a/handlers/wgcompany_notifier.py
+++ b/handlers/wgcompany_notifier.py
@@ -177,14 +177,21 @@
         content = await page.content()
 
         # Extract email (look for patterns like email: xxx@yyy.zz or Email: xxx)
+        # Priority: Look for email in table cell context (WG-specific email), exclude footer email
         email_patterns = [
-            r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
-            r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
+            r'email\s*:\s*</td>\s*<td[^>]*>.*?mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Table cell email
+            r'<a href="mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})">',  # Any mailto link
+            r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Plain email: pattern
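+            # The table-cell pattern assumes detail pages render contact rows
+            # roughly as (illustrative markup, address hypothetical):
+            #   <td>email:</td><td><a href="mailto:wg@example.org">...</a></td>
+            # with the bare mailto pattern as a broad fallback.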
         ]
         for pattern in email_patterns:
-            email_match = re.search(pattern, content)
-            if email_match:
-                details["email"] = email_match.group(1)
+            email_matches = re.finditer(pattern, content, re.IGNORECASE | re.DOTALL)
+            for match in email_matches:
+                email = match.group(1)
+                # Exclude the footer/contact email
+                if email != "wgcompany@wgcompany.de":
+                    details["email"] = email
+                    break
+            if "email" in details:
                 break
 
         # Extract WG name from URL
diff --git a/helper_functions/merge_all_data.py b/helper_functions/merge_all_data.py
new file mode 100644
index 0000000..93a8ebb
--- /dev/null
+++ b/helper_functions/merge_all_data.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Merge all data from prod and dev environments.
+Handles applications.json, listings.json, wgcompany_listings.json, and CSV files.
+For failed applications with duplicates, keeps the earlier timestamp.
+"""
+import json
+import pandas as pd
+from pathlib import Path
+from datetime import datetime
+
+def parse_timestamp(ts_str):
+    """Parse ISO format timestamp string to datetime object."""
+    if ts_str:
+        try:
+            return datetime.fromisoformat(ts_str)
+        except Exception:
+            return None
+    return None
+
+def merge_applications(local_path, merge_path, output_path=None):
+    """
+    Merge two applications.json files, deduplicate by listing_id.
+    Special handling: For failed applications with duplicates, keep the earlier timestamp.
+    For successful applications, keep the entry with more complete data.
+    """
+    output_path = output_path or local_path
+    with open(local_path, encoding='utf-8') as f:
+        local = json.load(f)
+    with open(merge_path, encoding='utf-8') as f:
+        remote = json.load(f)
+
+    merged = {}
+    all_keys = set(local.keys()) | set(remote.keys())
+
+    for key in all_keys:
+        l_entry = local.get(key)
+        r_entry = remote.get(key)
+
+        if l_entry and r_entry:
+            # Both have this application
+            l_success = l_entry.get('success', False)
+            r_success = r_entry.get('success', False)
+            l_ts = parse_timestamp(l_entry.get('timestamp'))
+            r_ts = parse_timestamp(r_entry.get('timestamp'))
+
+            # If both failed, keep the one with earlier timestamp (to avoid timestamp corruption bug)
+            if not l_success and not r_success:
+                if l_ts and r_ts:
+                    merged[key] = l_entry if l_ts < r_ts else r_entry
+                else:
+                    merged[key] = l_entry  # fallback if timestamp missing
+            # If one succeeded and one failed, keep the successful one
+            elif l_success and not r_success:
+                merged[key] = l_entry
+            elif r_success and not l_success:
+                merged[key] = r_entry
+            # If both succeeded, prefer entry with more fields, or latest timestamp
+            else:
+                if len(l_entry) > len(r_entry):
+                    merged[key] = l_entry
+                elif len(r_entry) > len(l_entry):
+                    merged[key] = r_entry
+                else:
+                    # Same length, prefer latest timestamp
+                    if l_ts and r_ts:
+                        merged[key] = l_entry if l_ts > r_ts else r_entry
+                    else:
+                        merged[key] = l_entry
+        else:
+            # Only one has this application
+            merged[key] = l_entry or r_entry
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(merged, f, ensure_ascii=False, indent=2)
+
+    print(f"✓ Merged applications: {len(merged)} unique entries → {output_path}")
+    return merged
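+
+# Illustration of the failed-duplicate rule with hypothetical entries:
+#   local  = {"123": {"success": False, "timestamp": "2026-01-08T10:00:00"}}
+#   remote = {"123": {"success": False, "timestamp": "2026-01-08T12:00:00"}}
+# merge_applications() keeps the 10:00 entry, since for failed applications the
+# later timestamp may stem from the timestamp corruption bug noted above.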
+
+def merge_dict_json(local_path, merge_path, output_path=None, timestamp_field='fetched_at'):
+    """
+    Merge two dict-based JSON files (keyed by id), deduplicate by key.
+    If duplicate, keep entry with latest timestamp_field.
+    """
+    output_path = output_path or local_path
+    with open(local_path, encoding='utf-8') as f:
+        local = json.load(f)
+    with open(merge_path, encoding='utf-8') as f:
+        remote = json.load(f)
+
+    merged = {}
+    all_keys = set(local.keys()) | set(remote.keys())
+
+    for key in all_keys:
+        l_entry = local.get(key)
+        r_entry = remote.get(key)
+
+        if l_entry and r_entry:
+            l_ts = l_entry.get(timestamp_field)
+            r_ts = r_entry.get(timestamp_field)
+            if l_ts and r_ts:
+                merged[key] = l_entry if l_ts > r_ts else r_entry
+            else:
+                merged[key] = l_entry
+        else:
+            merged[key] = l_entry or r_entry
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(merged, f, ensure_ascii=False, indent=2)
+
+    print(f"✓ Merged {Path(local_path).name}: {len(merged)} unique entries → {output_path}")
+    return merged
+
+def merge_csv_times(local_path, merge_path, output_path=None):
+    """
+    Merge two CSV files with time-series data, deduplicate by all columns.
+    Keeps unique rows based on all column values.
+    """
+    output_path = output_path or local_path
+
+    local_df = pd.read_csv(local_path)
+    remote_df = pd.read_csv(merge_path)
+
+    # Combine and drop duplicates
+    merged_df = pd.concat([local_df, remote_df], ignore_index=True)
+    merged_df = merged_df.drop_duplicates()
+
+    # Sort by timestamp if present
+    if 'timestamp' in merged_df.columns:
+        merged_df = merged_df.sort_values('timestamp')
+
+    merged_df.to_csv(output_path, index=False)
+
+    print(f"✓ Merged {Path(local_path).name}: {len(merged_df)} rows → {output_path}")
+    return merged_df
+
+def merge_all_data(local_base_dir="data", merge_base_dir="data/to_merge", output_base_dir=None):
+    """
+    Main function to merge all data from prod and dev environments.
+
+    Args:
+        local_base_dir: Base directory for local (dev) data
+        merge_base_dir: Base directory for data to merge (prod)
+        output_base_dir: Output directory (defaults to local_base_dir)
+
+    Returns:
+        dict: Summary of merge results
+    """
+    output_base_dir = output_base_dir or local_base_dir
+    local_base = Path(local_base_dir)
+    merge_base = Path(merge_base_dir)
+    output_base = Path(output_base_dir)
+
+    print("=" * 60)
+    print("MERGING PROD AND DEV DATA")
+    print("=" * 60)
+
+    results = {}
+
+    # 1. Merge applications.json (special handling for failed duplicates)
+    if (local_base / "applications.json").exists() and (merge_base / "applications.json").exists():
+        results['applications'] = merge_applications(
+            str(local_base / "applications.json"),
+            str(merge_base / "applications.json"),
+            str(output_base / "applications.json")
+        )
+
+    # 2. Merge listings.json
+    if (local_base / "listings.json").exists() and (merge_base / "listings.json").exists():
+        results['listings'] = merge_dict_json(
+            str(local_base / "listings.json"),
+            str(merge_base / "listings.json"),
+            str(output_base / "listings.json"),
+            timestamp_field='fetched_at'
+        )
+
+    # 3. Merge wgcompany_listings.json
+    if (local_base / "wgcompany_listings.json").exists() and (merge_base / "wgcompany_listings.json").exists():
+        results['wgcompany_listings'] = merge_dict_json(
+            str(local_base / "wgcompany_listings.json"),
+            str(merge_base / "wgcompany_listings.json"),
+            str(output_base / "wgcompany_listings.json"),
+            timestamp_field='fetched_at'
+        )
+
+    # 4. Merge listing_times.csv
+    if (local_base / "listing_times.csv").exists() and (merge_base / "listing_times.csv").exists():
+        results['listing_times'] = merge_csv_times(
+            str(local_base / "listing_times.csv"),
+            str(merge_base / "listing_times.csv"),
+            str(output_base / "listing_times.csv")
+        )
+
+    # 5. Merge wgcompany_times.csv
+    if (local_base / "wgcompany_times.csv").exists() and (merge_base / "wgcompany_times.csv").exists():
+        results['wgcompany_times'] = merge_csv_times(
+            str(local_base / "wgcompany_times.csv"),
+            str(merge_base / "wgcompany_times.csv"),
+            str(output_base / "wgcompany_times.csv")
+        )
+
+    print("=" * 60)
+    print("MERGE COMPLETE")
+    print("=" * 60)
+
+    return results
+
+if __name__ == "__main__":
+    # Usage: Place prod data in data/to_merge/ directory, then run this script
+    merge_all_data()
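+    # For a non-default layout, the directories can be overridden explicitly
+    # (paths below are illustrative):
+    #   merge_all_data(local_base_dir="data",
+    #                  merge_base_dir="data/to_merge",
+    #                  output_base_dir="data/merged")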