upd gewobag
This commit is contained in:
parent
afb87d7d3c
commit
92912e8487
4 changed files with 300 additions and 5 deletions
|
|
@ -177,14 +177,21 @@ class WGCompanyNotifier:
|
|||
content = await page.content()
|
||||
|
||||
# Extract email (look for patterns like email: xxx@yyy.zz or Email: xxx)
|
||||
# Priority: Look for email in table cell context (WG-specific email), exclude footer email
|
||||
email_patterns = [
|
||||
r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})',
|
||||
r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
|
||||
r'email\s*:\s*</font></b></td>\s*<td[^>]*>.*?mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', # Table cell email
|
||||
r'<a href="mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})">', # Any mailto link
|
||||
r'[Ee]-?[Mm]ail[:\s]+([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})', # Plain email: pattern
|
||||
]
|
||||
for pattern in email_patterns:
|
||||
email_match = re.search(pattern, content)
|
||||
if email_match:
|
||||
details["email"] = email_match.group(1)
|
||||
email_matches = re.finditer(pattern, content, re.IGNORECASE | re.DOTALL)
|
||||
for match in email_matches:
|
||||
email = match.group(1)
|
||||
# Exclude the footer/contact email
|
||||
if email != "wgcompany@wgcompany.de":
|
||||
details["email"] = email
|
||||
break
|
||||
if "email" in details:
|
||||
break
|
||||
|
||||
# Extract WG name from URL
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue