upd gewobag

This commit is contained in:
Aron Petau 2026-01-08 21:04:43 +01:00
parent afb87d7d3c
commit 92912e8487
4 changed files with 300 additions and 5 deletions

View file

@ -177,14 +177,21 @@ class WGCompanyNotifier:
content = await page.content()
# Extract email (look for patterns like email: xxx@yyy.zz or Email: xxx)
# Priority: Look for email in table cell context (WG-specific email), exclude footer email
email_patterns = [
r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
r'email\s*:\s*</font></b></td>\s*<td[^>]*>.*?mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', # Table cell email
r'<a href="mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})">', # Any mailto link
r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', # Plain email: pattern
]
for pattern in email_patterns:
email_match = re.search(pattern, content)
if email_match:
details["email"] = email_match.group(1)
email_matches = re.finditer(pattern, content, re.IGNORECASE | re.DOTALL)
for match in email_matches:
email = match.group(1)
# Exclude the footer/contact email
if email != "wgcompany@wgcompany.de":
details["email"] = email
break
if "email" in details:
break
# Extract WG name from URL