import concurrent.futures
import email
import hashlib
import imaplib
import logging
import sys
import time
from email.header import decode_header
from email.utils import parsedate_to_datetime
from json import JSONEncoder
from pathlib import Path

import chardet

from iarchiver.email import extract_emails


def md5_chunked(data: bytes, size: int = 1024) -> str:
    """MD5-hash `data` in fixed-size chunks and return the hex digest."""
    m = hashlib.md5()
    for i in range(0, len(data), size):
        m.update(data[i:i + size])
    return m.hexdigest()
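
# Sanity check (illustrative only, not executed by the module): chunked hashing
# must agree with hashing the whole buffer at once, e.g.
#   md5_chunked(data) == hashlib.md5(data).hexdigest()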


class FileAttachment:
    def __init__(self, file_name: str, file_hash: str):
        self.filename = file_name
        self.hash = file_hash

    def to_dict(self):
        return {'filename': self.filename, 'hash': self.hash}


class FileAttachmentEncoder(JSONEncoder):
    """JSONEncoder subclass that serializes FileAttachment objects via to_dict()."""

    def default(self, o):
        if isinstance(o, FileAttachment):
            return o.to_dict()
        return super().default(o)
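
# Hypothetical usage (this module only imports JSONEncoder, so callers would
# `import json` themselves):
#   json.dumps([FileAttachment('report.pdf', filehash)], cls=FileAttachmentEncoder)
#   -> '[{"filename": "report.pdf", "hash": "..."}]'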


class MailConnection:
    def __init__(self, host: str, username: str, password: str, attachments_dir: Path):
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(username, password)
        self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
        self.folder_structure = {}
        self.logger = logging.getLogger('iarchiver.mail_conn')
        self.logger.setLevel(logging.INFO)
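
    # Note that construction is eager: IMAP4_SSL() opens the connection (port 993
    # by default) and login() authenticates immediately, so instantiating
    # MailConnection raises imaplib.IMAP4.error on bad credentials.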

    def load_folders(self):
        # A LIST line typically looks like b'(\\HasNoChildren) "/" "INBOX/Receipts"';
        # keep the quoted mailbox name, drop the quotes, and split into path parts.
        # This assumes the server's hierarchy delimiter is '/'.
        folders = [tuple(f.decode().split(' "/" ')[1].replace('"', '').split('/')) for f in self.mail.list()[1]]
        folder_structure = {}
        for f in folders:
            folder_structure.setdefault(f[0], [])
            if len(f) > 1:
                folder_structure[f[0]].append(f[1])
        self.folder_structure = folder_structure
        return self.folder_structure
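
    # For a mailbox with folders INBOX, INBOX/Receipts and Archive, the result
    # would be {'INBOX': ['Receipts'], 'Archive': []} (illustrative names; only
    # the first two levels of nesting are recorded).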

    def __fetch_email(self, i):
        result, data = self.mail.uid('fetch', str(i), '(BODY[])')  # fetch the raw email
        if data[0] is None:
            # No message with this UID (e.g. it was deleted), nothing to yield.
            return
        raw_email_bytes = data[0][1]
        try:
            detected = chardet.detect(raw_email_bytes)
        except TypeError as e:
            self.logger.critical(f'Failed to decode an email. Timeout? - {e}')
            sys.exit(1)
        encoding = detected['encoding'] or 'utf-8'  # chardet reports None when unsure
        raw_email = raw_email_bytes.decode(encoding, errors='replace')
        email_message = email.message_from_string(raw_email)
        date = parsedate_to_datetime(email_message['Date'])
        unix_timestamp = int(time.mktime(date.timetuple()))
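
        # Caveat: time.mktime() interprets the parsed Date in the machine's local
        # timezone; int(date.timestamp()) would honor the UTC offset carried in
        # the header itself.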

        # Skip messages that have neither sender nor recipient headers.
        from_addr = email_message['From'] or ''
        to_addr = email_message['To'] or ''
        if not from_addr and not to_addr:
            return

        # Reduce the headers to bare addresses; if extraction found nothing that
        # looks like an address, fall back to the raw header value.
        from_header = ', '.join(extract_emails(from_addr))
        to_header = ', '.join(extract_emails(to_addr))
        if '@' not in from_header:
            from_header = from_addr
        if '@' not in to_header:
            to_header = to_addr

        subject_header = email_message['Subject']
        if not subject_header:
            # Skip messages without a Subject header.
            return
        subject = decode_header(subject_header)[0][0]
        if isinstance(subject, bytes):
            try:
                detected = chardet.detect(subject)
                encoding = detected['encoding'] or 'utf-8'
                subject = subject.decode(encoding, errors='replace')
            except UnicodeDecodeError:
                subject = subject.decode('utf-8', errors='replace')
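
        # decode_header() yields (value, charset) fragments for RFC 2047 encoded
        # words such as '=?utf-8?B?aGVsbG8=?='; only the first fragment is kept
        # above, so subjects split across several encoded words are truncated.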

        # Store each attachment under the MD5 of its contents, which de-duplicates
        # identical files, and replace the payload inside the archived email with
        # that hash.
        attachments = []
        if email_message.is_multipart():
            for part in email_message.walk():
                content_disposition = str(part.get('Content-Disposition'))
                if 'attachment' in content_disposition:
                    filename = part.get_filename()
                    if filename:
                        filecontents = part.get_payload(decode=True)
                        filehash = md5_chunked(filecontents)
                        part.set_payload(filehash)  # replace the attachment with its hash
                        filepath = self.attachments_dir / filehash
                        file_obj = FileAttachment(filename, filehash)
                        if not filepath.is_file():
                            with open(filepath, 'wb') as f:
                                f.write(filecontents)
                        attachments.append(file_obj)
        raw_email_clean = email_message.as_string()
        return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments

    def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
        """
        Yield (timestamp, to, from, subject, raw_email, attachments) tuples for
        every matching message in `folder`, newest first.

        Leave max_threads at 1: most mail servers don't allow one client
        connection to multiplex commands, so parallel fetches tend to fail.
        """
        self.mail.select(f'"{folder}"')
        result, data = self.mail.uid('search', None, search_criterion)
        mail_ids = data[0]
        id_list = mail_ids.split()
        if not id_list:
            # Empty folder
            return
        first_email_id = int(id_list[0])
        latest_email_id = int(id_list[-1])

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            # Walk the UID range newest-to-oldest; range() excludes its stop value,
            # so go one past first_email_id to include the oldest message. Gaps in
            # the UID sequence (deleted messages) fetch as None and are skipped.
            futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id - 1, -1)}
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                if result is not None:
                    yield result
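

# A minimal usage sketch (hypothetical host, credentials and paths; the real
# iarchiver entry point may wire this up differently):
#
#   conn = MailConnection('imap.example.com', 'user@example.com', 'secret',
#                         Path('~/mail-attachments'))
#   conn.load_folders()
#   for ts, to_hdr, from_hdr, subject, raw, attachments in conn.fetch_folder('INBOX'):
#       print(ts, from_hdr, subject, [a.filename for a in attachments])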