45 lines
1.7 KiB
Python
45 lines
1.7 KiB
Python
import re
|
|
import unicodedata
|
|
|
|
import requests
|
|
|
|
from scraper.email_spammer.blacklisted import BLACKLISTED_DOMAINS
|
|
|
|
# Matches email-like tokens: local part, '@', dotted domain, and a top-level
# domain of at least two ASCII letters. Inside a character class '|' is a
# literal pipe, not alternation, so the TLD class is spelled [A-Za-z].
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
|
|
|
|
|
|
def remove_strikethrough(s):
    """Return *s* with every Unicode combining character removed.

    A character is kept only when ``unicodedata.combining`` reports a
    canonical combining class of 0 for it, so combining overlays and accents
    attached to a base character are dropped and the base characters remain.
    """
    kept = []
    for ch in s:
        if unicodedata.combining(ch) == 0:
            kept.append(ch)
    return ''.join(kept)
|
|
|
|
|
|
def search_4chan(timeout=10):
    """Collect email addresses posted in /g/ threads whose subject mentions '/aicg/'.

    Downloads the /g/ catalog, then downloads every thread whose subject
    contains ``/aicg/`` and scans each post comment with ``EMAIL_PATTERN``
    after stripping combining characters from it.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely on a
            stalled connection.

    Returns:
        A list of lower-cased email addresses whose domain is not in
        ``BLACKLISTED_DOMAINS``, in the order they were found. Duplicates
        are not removed.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://a.4cdn.org/g/catalog.json"

    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    response.raise_for_status()
    data = response.json()

    email_addresses = []

    for page in data:
        for thread in page['threads']:
            # Only threads that have a subject containing '/aicg/' are scanned.
            if 'sub' not in thread or '/aicg/' not in thread['sub']:
                continue

            thread_id = thread['no']
            thread_url = f"https://a.4cdn.org/g/thread/{thread_id}.json"
            thread_response = requests.get(thread_url, timeout=timeout)
            thread_response.raise_for_status()
            thread_data = thread_response.json()

            for post in thread_data['posts']:
                # Posts without a comment have nothing to scan.
                comment = post.get('com')
                if not comment:
                    continue

                cleaned_comment = remove_strikethrough(comment)
                for email in EMAIL_PATTERN.findall(cleaned_comment):
                    email = email.lower()
                    # The pattern allows exactly one '@', so index 1 is the domain.
                    email_domain = email.split('@')[1]
                    if email_domain not in BLACKLISTED_DOMAINS:
                        email_addresses.append(email)

    return email_addresses
|