This repository has been archived on 2024-06-09. You can view files and clone it, but cannot push or open issues or pull requests.
catspam/email_spammer/scrape.py

45 lines
1.7 KiB
Python

import re
import unicodedata
import requests
from scraper.email_spammer.blacklisted import BLACKLISTED_DOMAINS
# Matches e-mail-like tokens: a local part, '@', a dotted domain, and an
# alphabetic top-level label of at least two letters, bounded by word breaks.
# The final class is [A-Za-z]: inside a character class '|' is a literal
# pipe, not alternation, so it must not appear there.
EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
def remove_strikethrough(s):
    """Return ``s`` with every Unicode combining character removed.

    A character is dropped when ``unicodedata.combining`` reports a non-zero
    combining class for it; all other characters are kept in their original
    order. Note that this removes *all* combining marks, not only
    strikethrough overlays.
    """
    kept = []
    for ch in s:
        if unicodedata.combining(ch) == 0:
            kept.append(ch)
    return ''.join(kept)
def search_4chan():
    """Collect e-mail addresses from /g/ threads whose subject contains '/aicg/'.

    Fetches the /g/ catalog JSON, then fetches each thread whose subject
    contains ``/aicg/`` and scans every post comment (after
    ``remove_strikethrough``) with ``EMAIL_PATTERN``. Matches are lower-cased
    and dropped when their domain is listed in ``BLACKLISTED_DOMAINS``.

    Returns:
        list: Lower-cased addresses in the order they were encountered.
        Duplicates are kept.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (other than a 404 on an individual thread).
    """
    # requests has no default timeout; without one a stalled connection
    # would block this function forever.
    timeout_seconds = 10

    catalog_response = requests.get(
        "https://a.4cdn.org/g/catalog.json", timeout=timeout_seconds
    )
    catalog_response.raise_for_status()
    catalog = catalog_response.json()

    email_addresses = []
    for page in catalog:
        for thread in page['threads']:
            # Only threads that have a subject mentioning '/aicg/' are scanned.
            if '/aicg/' not in thread.get('sub', ''):
                continue

            thread_id = thread['no']
            thread_url = f"https://a.4cdn.org/g/thread/{thread_id}.json"
            # NOTE(review): one request is issued per matching thread with no
            # delay; confirm this respects the API's rate-limit rules.
            thread_response = requests.get(thread_url, timeout=timeout_seconds)
            if thread_response.status_code == 404:
                # Presumably the thread disappeared after the catalog was
                # fetched; skip it instead of aborting the whole run.
                continue
            thread_response.raise_for_status()

            for post in thread_response.json()['posts']:
                # Posts without a comment body have nothing to scan.
                if 'com' not in post:
                    continue
                cleaned_comment = remove_strikethrough(post['com'])
                for match in EMAIL_PATTERN.findall(cleaned_comment):
                    email = match.lower()
                    # The pattern allows exactly one '@', so the domain is
                    # everything after it.
                    email_domain = email.split('@')[1]
                    if email_domain not in BLACKLISTED_DOMAINS:
                        email_addresses.append(email)
    return email_addresses