diff --git a/.env.example b/.env.example
index 696dc2b..bd1e076 100644
--- a/.env.example
+++ b/.env.example
@@ -24,6 +24,9 @@ FORM_PERSONS=2
FORM_ADULTS=1
FORM_CHILDREN=0
FORM_INCOME=2500
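+# WBS (Wohnberechtigungsschein) details, filled into the Gewobag application form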
+FORM_WBS_VALID_UNTIL=26.11.2026
+FORM_WBS_TYPE=WBS 100
+FORM_WBS_ROOMS=1
# WGcompany.de Search Filters (optional)
WGCOMPANY_ENABLED=true
@@ -32,3 +35,5 @@ WGCOMPANY_MAX_SIZE=
WGCOMPANY_MIN_PRICE=
WGCOMPANY_MAX_PRICE=
WGCOMPANY_BEZIRK=0
+WGCOMPANY_AGE=
+WGCOMPANY_SMOKER=
diff --git a/handlers/gewobag_handler.py b/handlers/gewobag_handler.py
index b48bfe6..882d24f 100644
--- a/handlers/gewobag_handler.py
+++ b/handlers/gewobag_handler.py
@@ -236,10 +236,83 @@ class GewobagHandler(BaseHandler):
             if wbs_ja:
                 await wbs_ja.click()
                 logger.info("[GEWOBAG] Selected WBS: Ja")
+                await asyncio.sleep(1)  # Wait for conditional WBS fields to appear
+
+                # Save debug HTML after WBS fields are visible
+                try:
+                    html_content = await iframe_page.content()
+                    debug_html_path = DATA_DIR / f"gewobag_wbs_fields_{listing['id']}.html"
+                    with open(debug_html_path, "w", encoding="utf-8") as f:
+                        f.write(html_content)
+                    logger.info(f"[GEWOBAG] Saved WBS fields debug HTML to {debug_html_path}")
+                except Exception as e:
+                    logger.warning(f"[GEWOBAG] Could not save WBS debug HTML: {e}")
+
                 form_filled = True
         except Exception as e:
             logger.warning(f"[GEWOBAG] Could not select WBS: {e}")
 
+        # WBS Gültigkeit (validity date) - appears after selecting Ja
+        try:
+            wbs_valid_until = os.environ.get("FORM_WBS_VALID_UNTIL", "26.11.2026")
+            wbs_date_input = await iframe_page.query_selector('#formly_6_input_\\$\\$_wbs_valid_until_\\$\\$_0')
+            if wbs_date_input:
+                await wbs_date_input.fill(wbs_valid_until)
+                logger.info(f"[GEWOBAG] Filled WBS Gültigkeit: {wbs_valid_until}")
+                form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not fill WBS Gültigkeit: {e}")
+
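+        # The WBS type/rooms dropdowns appear to be ng-select widgets rather than native
+        # <select> elements, hence the click + '.ng-option' pattern instead of select_option().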
+        # WBS Art/Bezeichnung (type) dropdown - appears after selecting Ja
+        try:
+            wbs_type = os.environ.get("FORM_WBS_TYPE", "WBS 100")
+            wbs_type_input = await iframe_page.query_selector('#formly_6_select_gewobag_art_bezeichnung_des_wbs_1')
+            if wbs_type_input:
+                await wbs_type_input.click()
+                await iframe_page.wait_for_timeout(300)
+                wbs_type_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_type}")')
+                if wbs_type_option:
+                    await wbs_type_option.click()
+                    logger.info(f"[GEWOBAG] Selected WBS Type: {wbs_type}")
+                    form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not select WBS Type: {e}")
+
+        # WBS Anzahl Räume (number of rooms) dropdown - appears after selecting Ja
+        try:
+            wbs_rooms = os.environ.get("FORM_WBS_ROOMS", "1")
+            wbs_rooms_input = await iframe_page.query_selector('#formly_7_select_\\$\\$_wbs_max_number_rooms_\\$\\$_0')
+            if wbs_rooms_input:
+                await wbs_rooms_input.click()
+                await iframe_page.wait_for_timeout(300)
+                wbs_rooms_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_rooms}")')
+                if wbs_rooms_option:
+                    await wbs_rooms_option.click()
+                    logger.info(f"[GEWOBAG] Selected WBS Rooms: {wbs_rooms}")
+                    form_filled = True
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not select WBS Rooms: {e}")
+
+        # WBS file upload - Upload the WBS PDF and PNG from data/uploads
+        try:
+            wbs_files = [
+                Path("data/uploads/WBS_Antrag_Bestaetigung.pdf"),
+                Path("data/uploads/WBS_Rechner.png")
+            ]
+            existing_files = [str(f) for f in wbs_files if f.exists()]
+
+            if existing_files:
+                file_input = await iframe_page.query_selector('input[type="file"]')
+                if file_input:
+                    await file_input.set_input_files(existing_files)
+                    await asyncio.sleep(1)  # Wait for upload to process
+                    logger.info(f"[GEWOBAG] Uploaded {len(existing_files)} WBS file(s): {', '.join([Path(f).name for f in existing_files])}")
+                    form_filled = True
+            else:
+                logger.warning("[GEWOBAG] No WBS files found in data/uploads")
+        except Exception as e:
+            logger.warning(f"[GEWOBAG] Could not upload WBS files: {e}")
+
         # Privacy checkbox (Main Datenschutzbestimmungen) - REQUIRED
         try:
             privacy_checkbox = await iframe_page.query_selector('#formly_20_checkbox_gewobag_datenschutzhinweis_bestaetigt_0')
diff --git a/handlers/wgcompany_notifier.py b/handlers/wgcompany_notifier.py
index 032620f..2cdafcc 100644
--- a/handlers/wgcompany_notifier.py
+++ b/handlers/wgcompany_notifier.py
@@ -177,14 +177,21 @@ class WGCompanyNotifier:
         content = await page.content()
 
         # Extract email (look for patterns like email: xxx@yyy.zz or Email: xxx)
+        # Priority: Look for email in table cell context (WG-specific email), exclude footer email
         email_patterns = [
-            r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
-            r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
+            r'email\s*:\s*</td>\s*<td[^>]*>.*?mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Table cell email
+            r'mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Any mailto link
+            r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Plain email: pattern
         ]
         for pattern in email_patterns:
-            email_match = re.search(pattern, content)
-            if email_match:
-                details["email"] = email_match.group(1)
+            email_matches = re.finditer(pattern, content, re.IGNORECASE | re.DOTALL)
+            for match in email_matches:
+                email = match.group(1)
+                # Exclude the footer/contact email
+                if email != "wgcompany@wgcompany.de":
+                    details["email"] = email
+                    break
+            if "email" in details:
                 break
 
         # Extract WG name from URL
diff --git a/helper_functions/merge_all_data.py b/helper_functions/merge_all_data.py
new file mode 100644
index 0000000..93a8ebb
--- /dev/null
+++ b/helper_functions/merge_all_data.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Merge all data from prod and dev environments.
+Handles applications.json, listings.json, wgcompany_listings.json, and CSV files.
+For failed applications with duplicates, keeps the earlier timestamp.
+"""
+import json
+import pandas as pd
+from pathlib import Path
+from datetime import datetime
+
+def parse_timestamp(ts_str):
+ """Parse ISO format timestamp string to datetime object."""
+ if ts_str:
+ try:
+ return datetime.fromisoformat(ts_str)
+ except Exception:
+ return None
+ return None
+
+def merge_applications(local_path, merge_path, output_path=None):
+ """
+ Merge two applications.json files, deduplicate by listing_id.
+ Special handling: For failed applications with duplicates, keep the earlier timestamp.
+ For successful applications, keep the entry with more complete data.
+ """
+ output_path = output_path or local_path
+ with open(local_path, encoding='utf-8') as f:
+ local = json.load(f)
+ with open(merge_path, encoding='utf-8') as f:
+ remote = json.load(f)
+
+ merged = {}
+ all_keys = set(local.keys()) | set(remote.keys())
+
+ for key in all_keys:
+ l_entry = local.get(key)
+ r_entry = remote.get(key)
+
+ if l_entry and r_entry:
+ # Both have this application
+ l_success = l_entry.get('success', False)
+ r_success = r_entry.get('success', False)
+ l_ts = parse_timestamp(l_entry.get('timestamp'))
+ r_ts = parse_timestamp(r_entry.get('timestamp'))
+
+ # If both failed, keep the one with earlier timestamp (to avoid timestamp corruption bug)
+ if not l_success and not r_success:
+ if l_ts and r_ts:
+ merged[key] = l_entry if l_ts < r_ts else r_entry
+ else:
+ merged[key] = l_entry # fallback if timestamp missing
+ # If one succeeded and one failed, keep the successful one
+ elif l_success and not r_success:
+ merged[key] = l_entry
+ elif r_success and not l_success:
+ merged[key] = r_entry
+ # If both succeeded, prefer entry with more fields, or latest timestamp
+ else:
+ if len(l_entry) > len(r_entry):
+ merged[key] = l_entry
+ elif len(r_entry) > len(l_entry):
+ merged[key] = r_entry
+ else:
+ # Same length, prefer latest timestamp
+ if l_ts and r_ts:
+ merged[key] = l_entry if l_ts > r_ts else r_entry
+ else:
+ merged[key] = l_entry
+ else:
+ # Only one has this application
+ merged[key] = l_entry or r_entry
+
+ with open(output_path, 'w', encoding='utf-8') as f:
+ json.dump(merged, f, ensure_ascii=False, indent=2)
+
+ print(f"✓ Merged applications: {len(merged)} unique entries → {output_path}")
+ return merged
+
+def merge_dict_json(local_path, merge_path, output_path=None, timestamp_field='fetched_at'):
+ """
+ Merge two dict-based JSON files (keyed by id), deduplicate by key.
+ If duplicate, keep entry with latest timestamp_field.
+ """
+ output_path = output_path or local_path
+ with open(local_path, encoding='utf-8') as f:
+ local = json.load(f)
+ with open(merge_path, encoding='utf-8') as f:
+ remote = json.load(f)
+
+ merged = {}
+ all_keys = set(local.keys()) | set(remote.keys())
+
+ for key in all_keys:
+ l_entry = local.get(key)
+ r_entry = remote.get(key)
+
+ if l_entry and r_entry:
+ l_ts = l_entry.get(timestamp_field)
+ r_ts = r_entry.get(timestamp_field)
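+            # Timestamps are ISO-8601 strings, so the plain string comparison below
+            # orders them chronologically (assuming a consistent format across environments)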
+            if l_ts and r_ts:
+                merged[key] = l_entry if l_ts > r_ts else r_entry
+            else:
+                merged[key] = l_entry
+        else:
+            merged[key] = l_entry or r_entry
+
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(merged, f, ensure_ascii=False, indent=2)
+
+    print(f"✓ Merged {Path(local_path).name}: {len(merged)} unique entries → {output_path}")
+    return merged
+
+def merge_csv_times(local_path, merge_path, output_path=None):
+ """
+ Merge two CSV files with time-series data, deduplicate by all columns.
+ Keeps unique rows based on all column values.
+ """
+ output_path = output_path or local_path
+
+ local_df = pd.read_csv(local_path)
+ remote_df = pd.read_csv(merge_path)
+
+ # Combine and drop duplicates
+ merged_df = pd.concat([local_df, remote_df], ignore_index=True)
+ merged_df = merged_df.drop_duplicates()
+
+ # Sort by timestamp if present
+ if 'timestamp' in merged_df.columns:
+ merged_df = merged_df.sort_values('timestamp')
+
+ merged_df.to_csv(output_path, index=False)
+
+ print(f"✓ Merged {Path(local_path).name}: {len(merged_df)} rows → {output_path}")
+ return merged_df
+
+def merge_all_data(local_base_dir="data", merge_base_dir="data/to_merge", output_base_dir=None):
+ """
+ Main function to merge all data from prod and dev environments.
+
+ Args:
+ local_base_dir: Base directory for local (dev) data
+ merge_base_dir: Base directory for data to merge (prod)
+ output_base_dir: Output directory (defaults to local_base_dir)
+
+ Returns:
+ dict: Summary of merge results
+ """
+ output_base_dir = output_base_dir or local_base_dir
+ local_base = Path(local_base_dir)
+ merge_base = Path(merge_base_dir)
+ output_base = Path(output_base_dir)
+
+ print("=" * 60)
+ print("MERGING PROD AND DEV DATA")
+ print("=" * 60)
+
+ results = {}
+
+ # 1. Merge applications.json (special handling for failed duplicates)
+ if (local_base / "applications.json").exists() and (merge_base / "applications.json").exists():
+ results['applications'] = merge_applications(
+ str(local_base / "applications.json"),
+ str(merge_base / "applications.json"),
+ str(output_base / "applications.json")
+ )
+
+ # 2. Merge listings.json
+ if (local_base / "listings.json").exists() and (merge_base / "listings.json").exists():
+ results['listings'] = merge_dict_json(
+ str(local_base / "listings.json"),
+ str(merge_base / "listings.json"),
+ str(output_base / "listings.json"),
+ timestamp_field='fetched_at'
+ )
+
+ # 3. Merge wgcompany_listings.json
+ if (local_base / "wgcompany_listings.json").exists() and (merge_base / "wgcompany_listings.json").exists():
+ results['wgcompany_listings'] = merge_dict_json(
+ str(local_base / "wgcompany_listings.json"),
+ str(merge_base / "wgcompany_listings.json"),
+ str(output_base / "wgcompany_listings.json"),
+ timestamp_field='fetched_at'
+ )
+
+ # 4. Merge listing_times.csv
+ if (local_base / "listing_times.csv").exists() and (merge_base / "listing_times.csv").exists():
+ results['listing_times'] = merge_csv_times(
+ str(local_base / "listing_times.csv"),
+ str(merge_base / "listing_times.csv"),
+ str(output_base / "listing_times.csv")
+ )
+
+ # 5. Merge wgcompany_times.csv
+ if (local_base / "wgcompany_times.csv").exists() and (merge_base / "wgcompany_times.csv").exists():
+ results['wgcompany_times'] = merge_csv_times(
+ str(local_base / "wgcompany_times.csv"),
+ str(merge_base / "wgcompany_times.csv"),
+ str(output_base / "wgcompany_times.csv")
+ )
+
+ print("=" * 60)
+ print("MERGE COMPLETE")
+ print("=" * 60)
+
+ return results
+
+if __name__ == "__main__":
+    # Usage: Place prod data in data/to_merge/ directory, then run this script
+    merge_all_data()
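+    # Example (based on the defaults above): copy the production data/ directory to
+    # data/to_merge/, then run `python helper_functions/merge_all_data.py`; the merged
+    # files overwrite the local copies under data/.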