import re
import unicodedata

import requests

from scraper.email_spammer.blacklisted import BLACKLISTED_DOMAINS

# Local part, "@", dotted domain, then an alphabetic TLD of two or more
# letters.  The TLD class is [A-Za-z]; a "|" inside a character class is a
# literal pipe, not alternation, so it must not appear there.
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def remove_strikethrough(s: str) -> str:
    """Return *s* with every Unicode combining character removed.

    A character is dropped when ``unicodedata.combining`` reports a non-zero
    canonical combining class for it.  This strips overlay marks such as
    U+0336 (used for "struck-through" text), and also any other combining
    mark (e.g. decomposed accents) present in the input.
    """
    return ''.join(c for c in s if not unicodedata.combining(c))


def _extract_emails(comment):
    """Return the lower-cased, non-blacklisted addresses found in *comment*.

    Combining characters are stripped first so that an address obscured with
    overlay marks is still recognised.  Duplicates are kept and the order of
    appearance is preserved.
    """
    found = []
    for match in EMAIL_PATTERN.findall(remove_strikethrough(comment)):
        email = match.lower()
        # EMAIL_PATTERN allows exactly one "@", so the text after it is the
        # whole domain.
        email_domain = email.rpartition('@')[2]
        if email_domain not in BLACKLISTED_DOMAINS:
            found.append(email)
    return found


def search_4chan(timeout=10) -> list:
    """Collect email addresses posted in /g/ threads whose subject has '/aicg/'.

    The board catalog is fetched first; every thread whose ``sub`` field
    contains ``'/aicg/'`` is then fetched individually and each post's
    ``com`` field is scanned with ``EMAIL_PATTERN``.

    Args:
        timeout: Seconds passed to each ``requests.get`` call, so a stalled
            connection cannot block the caller indefinitely.

    Returns:
        A list of lower-cased addresses whose domain is not in
        ``BLACKLISTED_DOMAINS``.  Duplicates are not removed.

    Raises:
        requests.RequestException: If the catalog request fails, or a thread
            request fails with anything other than HTTP 404.
    """
    catalog_response = requests.get(
        "https://a.4cdn.org/g/catalog.json", timeout=timeout
    )
    catalog_response.raise_for_status()

    email_addresses = []
    for page in catalog_response.json():
        for thread in page['threads']:
            # Threads without a subject are treated as non-matching.
            if '/aicg/' not in thread.get('sub', ''):
                continue

            thread_url = f"https://a.4cdn.org/g/thread/{thread['no']}.json"
            thread_response = requests.get(thread_url, timeout=timeout)
            if thread_response.status_code == 404:
                # The thread is no longer available (e.g. it was removed
                # after the catalog was fetched); skip it rather than abort
                # the whole scan.
                continue
            thread_response.raise_for_status()

            for post in thread_response.json()['posts']:
                # Posts without a comment body have no 'com' key.
                if 'com' in post:
                    email_addresses.extend(_extract_emails(post['com']))

    return email_addresses