upd gewobag

commit 92912e8487 (parent afb87d7d3c)
4 changed files with 300 additions and 5 deletions
@@ -24,6 +24,9 @@ FORM_PERSONS=2
 FORM_ADULTS=1
 FORM_CHILDREN=0
 FORM_INCOME=2500
+FORM_WBS_VALID_UNTIL=26.11.2026
+FORM_WBS_TYPE=WBS 100
+FORM_WBS_ROOMS=1
 
 # WGcompany.de Search Filters (optional)
 WGCOMPANY_ENABLED=true
@@ -32,3 +35,5 @@ WGCOMPANY_MAX_SIZE=
 WGCOMPANY_MIN_PRICE=
 WGCOMPANY_MAX_PRICE=
 WGCOMPANY_BEZIRK=0
+WGCOMPANY_AGE=
+WGCOMPANY_SMOKER=
@@ -236,10 +236,83 @@ class GewobagHandler(BaseHandler):
                if wbs_ja:
                    await wbs_ja.click()
                    logger.info("[GEWOBAG] Selected WBS: Ja")
                    await asyncio.sleep(1)  # Wait for conditional WBS fields to appear

                    # Save debug HTML after WBS fields are visible
                    try:
                        html_content = await iframe_page.content()
                        debug_html_path = DATA_DIR / f"gewobag_wbs_fields_{listing['id']}.html"
                        with open(debug_html_path, "w", encoding="utf-8") as f:
                            f.write(html_content)
                        logger.info(f"[GEWOBAG] Saved WBS fields debug HTML to {debug_html_path}")
                    except Exception as e:
                        logger.warning(f"[GEWOBAG] Could not save WBS debug HTML: {e}")

                    form_filled = True
            except Exception as e:
                logger.warning(f"[GEWOBAG] Could not select WBS: {e}")

            # WBS Gültigkeit (validity date) - appears after selecting Ja
            try:
                wbs_valid_until = os.environ.get("FORM_WBS_VALID_UNTIL", "26.11.2026")
                wbs_date_input = await iframe_page.query_selector('#formly_6_input_\\$\\$_wbs_valid_until_\\$\\$_0')
                if wbs_date_input:
                    await wbs_date_input.fill(wbs_valid_until)
                    logger.info(f"[GEWOBAG] Filled WBS Gültigkeit: {wbs_valid_until}")
                    form_filled = True
            except Exception as e:
                logger.warning(f"[GEWOBAG] Could not fill WBS Gültigkeit: {e}")

            # WBS Art/Bezeichnung (type) dropdown - appears after selecting Ja
            try:
                wbs_type = os.environ.get("FORM_WBS_TYPE", "WBS 100")
                wbs_type_input = await iframe_page.query_selector('#formly_6_select_gewobag_art_bezeichnung_des_wbs_1')
                if wbs_type_input:
                    await wbs_type_input.click()
                    await iframe_page.wait_for_timeout(300)
                    wbs_type_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_type}")')
                    if wbs_type_option:
                        await wbs_type_option.click()
                        logger.info(f"[GEWOBAG] Selected WBS Type: {wbs_type}")
                        form_filled = True
            except Exception as e:
                logger.warning(f"[GEWOBAG] Could not select WBS Type: {e}")

            # WBS Anzahl Räume (number of rooms) dropdown - appears after selecting Ja
            try:
                wbs_rooms = os.environ.get("FORM_WBS_ROOMS", "1")
                wbs_rooms_input = await iframe_page.query_selector('#formly_7_select_\\$\\$_wbs_max_number_rooms_\\$\\$_0')
                if wbs_rooms_input:
                    await wbs_rooms_input.click()
                    await iframe_page.wait_for_timeout(300)
                    wbs_rooms_option = await iframe_page.query_selector(f'.ng-option:has-text("{wbs_rooms}")')
                    if wbs_rooms_option:
                        await wbs_rooms_option.click()
                        logger.info(f"[GEWOBAG] Selected WBS Rooms: {wbs_rooms}")
                        form_filled = True
            except Exception as e:
                logger.warning(f"[GEWOBAG] Could not select WBS Rooms: {e}")

            # WBS file upload - Upload the WBS PDF and PNG from data/uploads
            try:
                wbs_files = [
                    Path("data/uploads/WBS_Antrag_Bestaetigung.pdf"),
                    Path("data/uploads/WBS_Rechner.png")
                ]
                existing_files = [str(f) for f in wbs_files if f.exists()]

                if existing_files:
                    file_input = await iframe_page.query_selector('input[type="file"]')
                    if file_input:
                        await file_input.set_input_files(existing_files)
                        await asyncio.sleep(1)  # Wait for upload to process
                        logger.info(f"[GEWOBAG] Uploaded {len(existing_files)} WBS file(s): {', '.join([Path(f).name for f in existing_files])}")
                        form_filled = True
                else:
                    logger.warning("[GEWOBAG] No WBS files found in data/uploads")
            except Exception as e:
                logger.warning(f"[GEWOBAG] Could not upload WBS files: {e}")

            # Privacy checkbox (Main Datenschutzbestimmungen) - REQUIRED
            try:
                privacy_checkbox = await iframe_page.query_selector('#formly_20_checkbox_gewobag_datenschutzhinweis_bestaetigt_0')
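Reviewer note: the WBS Type and WBS Rooms dropdowns above repeat the same Playwright sequence (open the ng-select control, wait briefly, click the matching .ng-option). A minimal sketch of how that could be factored into one helper; the name select_ng_option and the harness around it are illustrative, not part of this commit:

# Illustrative helper (not part of the commit): factors out the repeated
# "open ng-select, pick option by visible text" sequence used by the
# WBS Type / WBS Rooms blocks above. Assumes Playwright's async API.
from playwright.async_api import Page


async def select_ng_option(page: Page, control_selector: str,
                           option_text: str, settle_ms: int = 300) -> bool:
    """Open an Angular ng-select control and click the option with the given text."""
    control = await page.query_selector(control_selector)
    if not control:
        return False
    await control.click()
    await page.wait_for_timeout(settle_ms)  # let the dropdown options render
    option = await page.query_selector(f'.ng-option:has-text("{option_text}")')
    if not option:
        return False
    await option.click()
    return True

Each dropdown block would then reduce to roughly "if await select_ng_option(iframe_page, '#formly_6_select_gewobag_art_bezeichnung_des_wbs_1', wbs_type): form_filled = True", assuming the selectors and timing behave the same way as in the hand-written blocks.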
@@ -177,14 +177,21 @@ class WGCompanyNotifier:
             content = await page.content()
 
             # Extract email (look for patterns like email: xxx@yyy.zz or Email: xxx)
+            # Priority: Look for email in table cell context (WG-specific email), exclude footer email
             email_patterns = [
-                r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
-                r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
+                r'email\s*:\s*</font></b></td>\s*<td[^>]*>.*?mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Table cell email
+                r'<a href="mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})">',  # Any mailto link
+                r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # Plain email: pattern
             ]
             for pattern in email_patterns:
-                email_match = re.search(pattern, content)
-                if email_match:
-                    details["email"] = email_match.group(1)
+                email_matches = re.finditer(pattern, content, re.IGNORECASE | re.DOTALL)
+                for match in email_matches:
+                    email = match.group(1)
+                    # Exclude the footer/contact email
+                    if email != "wgcompany@wgcompany.de":
+                        details["email"] = email
+                        break
+                if "email" in details:
                     break
 
             # Extract WG name from URL
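Reviewer note: a self-contained sketch of the new extraction logic run against a made-up HTML fragment. The sample markup and the extract_email wrapper are illustrative only; the regex patterns and the footer-exclusion rule are the ones added above.

# Illustrative, standalone version of the finditer + footer-exclusion logic above.
# The sample HTML is made up; the patterns mirror the ones in the commit.
import re

EMAIL_PATTERNS = [
    r'email\s*:\s*</font></b></td>\s*<td[^>]*>.*?mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # table cell email
    r'<a href="mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})">',  # any mailto link
    r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',  # plain "email:" pattern
]


def extract_email(content):
    """Return the first non-footer email found, trying the patterns in priority order."""
    for pattern in EMAIL_PATTERNS:
        for match in re.finditer(pattern, content, re.IGNORECASE | re.DOTALL):
            email = match.group(1)
            if email != "wgcompany@wgcompany.de":  # skip the site footer address
                return email
    return None


sample = (
    '<a href="mailto:wgcompany@wgcompany.de">Kontakt</a>'
    '<b><font>email:</font></b></td><td><a href="mailto:wg-mitte@example.org">schreiben</a>'
)
print(extract_email(sample))  # -> wg-mitte@example.org

Unlike the old re.search version, the first pattern only fires in the table-cell context and the footer address is filtered out, so the WG-specific address wins even when the footer mailto appears first in the page.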
helper_functions/merge_all_data.py (new file, 210 lines)
@@ -0,0 +1,210 @@
#!/usr/bin/env python3
"""
Merge all data from prod and dev environments.
Handles applications.json, listings.json, wgcompany_listings.json, and CSV files.
For failed applications with duplicates, keeps the earlier timestamp.
"""
import json
import pandas as pd
from pathlib import Path
from datetime import datetime


def parse_timestamp(ts_str):
    """Parse ISO format timestamp string to datetime object."""
    if ts_str:
        try:
            return datetime.fromisoformat(ts_str)
        except Exception:
            return None
    return None


def merge_applications(local_path, merge_path, output_path=None):
    """
    Merge two applications.json files, deduplicate by listing_id.
    Special handling: For failed applications with duplicates, keep the earlier timestamp.
    For successful applications, keep the entry with more complete data.
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    merged = {}
    all_keys = set(local.keys()) | set(remote.keys())

    for key in all_keys:
        l_entry = local.get(key)
        r_entry = remote.get(key)

        if l_entry and r_entry:
            # Both have this application
            l_success = l_entry.get('success', False)
            r_success = r_entry.get('success', False)
            l_ts = parse_timestamp(l_entry.get('timestamp'))
            r_ts = parse_timestamp(r_entry.get('timestamp'))

            # If both failed, keep the one with the earlier timestamp (to avoid timestamp corruption bug)
            if not l_success and not r_success:
                if l_ts and r_ts:
                    merged[key] = l_entry if l_ts < r_ts else r_entry
                else:
                    merged[key] = l_entry  # fallback if timestamp missing
            # If one succeeded and one failed, keep the successful one
            elif l_success and not r_success:
                merged[key] = l_entry
            elif r_success and not l_success:
                merged[key] = r_entry
            # If both succeeded, prefer the entry with more fields, or the latest timestamp
            else:
                if len(l_entry) > len(r_entry):
                    merged[key] = l_entry
                elif len(r_entry) > len(l_entry):
                    merged[key] = r_entry
                else:
                    # Same length, prefer latest timestamp
                    if l_ts and r_ts:
                        merged[key] = l_entry if l_ts > r_ts else r_entry
                    else:
                        merged[key] = l_entry
        else:
            # Only one has this application
            merged[key] = l_entry or r_entry

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    print(f"✓ Merged applications: {len(merged)} unique entries → {output_path}")
    return merged


def merge_dict_json(local_path, merge_path, output_path=None, timestamp_field='fetched_at'):
    """
    Merge two dict-based JSON files (keyed by id), deduplicate by key.
    If duplicate, keep the entry with the latest timestamp_field.
    """
    output_path = output_path or local_path
    with open(local_path, encoding='utf-8') as f:
        local = json.load(f)
    with open(merge_path, encoding='utf-8') as f:
        remote = json.load(f)

    merged = {}
    all_keys = set(local.keys()) | set(remote.keys())

    for key in all_keys:
        l_entry = local.get(key)
        r_entry = remote.get(key)

        if l_entry and r_entry:
            l_ts = l_entry.get(timestamp_field)
            r_ts = r_entry.get(timestamp_field)
            if l_ts and r_ts:
                merged[key] = l_entry if l_ts > r_ts else r_entry
            else:
                merged[key] = l_entry
        else:
            merged[key] = l_entry or r_entry

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    print(f"✓ Merged {Path(local_path).name}: {len(merged)} unique entries → {output_path}")
    return merged


def merge_csv_times(local_path, merge_path, output_path=None):
    """
    Merge two CSV files with time-series data, deduplicate by all columns.
    Keeps unique rows based on all column values.
    """
    output_path = output_path or local_path

    local_df = pd.read_csv(local_path)
    remote_df = pd.read_csv(merge_path)

    # Combine and drop duplicates
    merged_df = pd.concat([local_df, remote_df], ignore_index=True)
    merged_df = merged_df.drop_duplicates()

    # Sort by timestamp if present
    if 'timestamp' in merged_df.columns:
        merged_df = merged_df.sort_values('timestamp')

    merged_df.to_csv(output_path, index=False)

    print(f"✓ Merged {Path(local_path).name}: {len(merged_df)} rows → {output_path}")
    return merged_df


def merge_all_data(local_base_dir="data", merge_base_dir="data/to_merge", output_base_dir=None):
    """
    Main function to merge all data from prod and dev environments.

    Args:
        local_base_dir: Base directory for local (dev) data
        merge_base_dir: Base directory for data to merge (prod)
        output_base_dir: Output directory (defaults to local_base_dir)

    Returns:
        dict: Summary of merge results
    """
    output_base_dir = output_base_dir or local_base_dir
    local_base = Path(local_base_dir)
    merge_base = Path(merge_base_dir)
    output_base = Path(output_base_dir)

    print("=" * 60)
    print("MERGING PROD AND DEV DATA")
    print("=" * 60)

    results = {}

    # 1. Merge applications.json (special handling for failed duplicates)
    if (local_base / "applications.json").exists() and (merge_base / "applications.json").exists():
        results['applications'] = merge_applications(
            str(local_base / "applications.json"),
            str(merge_base / "applications.json"),
            str(output_base / "applications.json")
        )

    # 2. Merge listings.json
    if (local_base / "listings.json").exists() and (merge_base / "listings.json").exists():
        results['listings'] = merge_dict_json(
            str(local_base / "listings.json"),
            str(merge_base / "listings.json"),
            str(output_base / "listings.json"),
            timestamp_field='fetched_at'
        )

    # 3. Merge wgcompany_listings.json
    if (local_base / "wgcompany_listings.json").exists() and (merge_base / "wgcompany_listings.json").exists():
        results['wgcompany_listings'] = merge_dict_json(
            str(local_base / "wgcompany_listings.json"),
            str(merge_base / "wgcompany_listings.json"),
            str(output_base / "wgcompany_listings.json"),
            timestamp_field='fetched_at'
        )

    # 4. Merge listing_times.csv
    if (local_base / "listing_times.csv").exists() and (merge_base / "listing_times.csv").exists():
        results['listing_times'] = merge_csv_times(
            str(local_base / "listing_times.csv"),
            str(merge_base / "listing_times.csv"),
            str(output_base / "listing_times.csv")
        )

    # 5. Merge wgcompany_times.csv
    if (local_base / "wgcompany_times.csv").exists() and (merge_base / "wgcompany_times.csv").exists():
        results['wgcompany_times'] = merge_csv_times(
            str(local_base / "wgcompany_times.csv"),
            str(merge_base / "wgcompany_times.csv"),
            str(output_base / "wgcompany_times.csv")
        )

    print("=" * 60)
    print("MERGE COMPLETE")
    print("=" * 60)

    return results


if __name__ == "__main__":
    # Usage: Place prod data in data/to_merge/ directory, then run this script
    merge_all_data()
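
Reviewer note: a small, throwaway check of the dedup rules in merge_applications above. The file names, entries, and the import path are made up for illustration (the import assumes the repo root is on sys.path); the expected results follow directly from the rules in the function.

# Illustrative only: exercises merge_applications on temporary files to show
# that the earlier timestamp wins for failed duplicates and that a successful
# entry beats a failed one. Entries and paths are made up.
import json
from pathlib import Path
from tempfile import TemporaryDirectory

from helper_functions.merge_all_data import merge_applications  # assumes repo root on sys.path

local = {
    "lst-1": {"success": False, "timestamp": "2025-06-01T10:00:00"},
    "lst-2": {"success": True, "timestamp": "2025-06-02T09:00:00"},
}
remote = {
    "lst-1": {"success": False, "timestamp": "2025-06-03T10:00:00"},  # later failed duplicate
    "lst-2": {"success": False, "timestamp": "2025-06-01T09:00:00"},
}

with TemporaryDirectory() as tmp:
    a, b, out = (Path(tmp) / n for n in ("local.json", "remote.json", "merged.json"))
    a.write_text(json.dumps(local), encoding="utf-8")
    b.write_text(json.dumps(remote), encoding="utf-8")
    merged = merge_applications(str(a), str(b), str(out))

assert merged["lst-1"]["timestamp"] == "2025-06-01T10:00:00"  # earlier failed attempt kept
assert merged["lst-2"]["success"] is True                     # successful entry kept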