# catspam/email_spammer/scrape.py

import re
import unicodedata

import requests

from scraper.email_spammer.blacklisted import BLACKLISTED_DOMAINS

# Matches email-like tokens: a local part, '@', a dotted domain, and an
# alphabetic top-level label of at least two letters. The final class is
# [A-Za-z]: inside a character class '|' is a literal pipe, not alternation,
# so including it would let strings such as "co|uk" pass as a TLD.
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')


def remove_strikethrough(s: str) -> str:
    """Return *s* with every Unicode combining character dropped.

    Any character whose canonical combining class is non-zero is removed,
    which covers strikethrough overlays (e.g. U+0336) but also every other
    combining mark, such as a decomposed accent.
    """
    kept_chars = []
    for char in s:
        # combining() returns 0 for ordinary, non-combining characters.
        if unicodedata.combining(char) == 0:
            kept_chars.append(char)
    return ''.join(kept_chars)


def _emails_in_comment(comment):
    """Return the lower-cased, non-blacklisted addresses found in *comment*."""
    cleaned_comment = remove_strikethrough(comment)
    accepted = []
    for email in EMAIL_PATTERN.findall(cleaned_comment):
        email = email.lower()
        # The pattern admits exactly one '@', so the domain is what follows it.
        email_domain = email.split('@')[1]
        if email_domain not in BLACKLISTED_DOMAINS:
            accepted.append(email)
    return accepted


def search_4chan(timeout=10):
    """Collect email-like strings from '/aicg/' threads in the /g/ catalog.

    Downloads the board catalog, then downloads every thread whose subject
    contains '/aicg/' and scans each post comment with EMAIL_PATTERN after
    stripping combining characters.

    Args:
        timeout: Seconds to wait on each HTTP request. Without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        A list of lower-cased addresses in the order they were found
        (duplicates are kept), excluding those whose domain is in
        BLACKLISTED_DOMAINS.

    Raises:
        requests.RequestException: If the catalog request fails, or a thread
            request fails with anything other than HTTP 404.
    """
    url = "https://a.4cdn.org/g/catalog.json"

    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    email_addresses = []

    for page in data:
        for thread in page['threads']:
            # Only threads that have a subject mentioning '/aicg/' are scanned.
            if '/aicg/' not in thread.get('sub', ''):
                continue

            thread_id = thread['no']
            thread_url = f"https://a.4cdn.org/g/thread/{thread_id}.json"
            thread_response = requests.get(thread_url, timeout=timeout)
            if thread_response.status_code == 404:
                # The thread disappeared between the catalog request and this
                # one; skip it rather than discard everything collected so far.
                continue
            thread_response.raise_for_status()
            thread_data = thread_response.json()

            for post in thread_data['posts']:
                # Posts without a comment body have nothing to scan.
                if 'com' not in post:
                    continue
                email_addresses.extend(_emails_in_comment(post['com']))

    return email_addresses