2024-03-05 23:57:38 -07:00
|
|
|
import concurrent.futures
|
|
|
|
import email
|
|
|
|
import imaplib
|
2024-03-06 00:04:11 -07:00
|
|
|
import logging
|
2024-03-06 14:25:36 -07:00
|
|
|
import traceback
|
2024-03-05 23:57:38 -07:00
|
|
|
from email.header import decode_header
|
|
|
|
from email.utils import parsedate_to_datetime
|
|
|
|
from json import JSONEncoder
|
|
|
|
from pathlib import Path
|
2024-03-06 11:36:43 -07:00
|
|
|
from typing import List
|
2024-03-05 23:57:38 -07:00
|
|
|
|
|
|
|
import chardet
|
|
|
|
|
2024-03-06 11:36:43 -07:00
|
|
|
from iarchiver.email import extract_emails, normalize_for_imap_folder
|
2024-03-06 12:08:12 -07:00
|
|
|
from iarchiver.hash import murmur3_chunked
|
2024-03-05 23:57:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
class FileAttachment:
    """Value object describing one e-mail attachment: its original name,
    the content hash it is stored under, and its MIME content type."""

    def __init__(self, file_name: str, file_hash: str, content_type: str):
        self.filename = file_name
        self.hash = file_hash
        self.content_type = content_type

    def to_dict(self):
        """Return a JSON-serializable dict of this attachment's fields."""
        # Attributes are set in __init__ in the exact key order callers expect,
        # so a shallow copy of the instance namespace is equivalent to building
        # the dict by hand.
        return dict(vars(self))
|
2024-03-05 23:57:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
class FileAttachmentEncoder(JSONEncoder):
    """JSONEncoder that serializes FileAttachment instances via their
    to_dict() representation; everything else defers to the base encoder."""

    def default(self, o):
        # Guard clause: anything that is not a FileAttachment is handled
        # (or rejected with TypeError) by JSONEncoder itself.
        if not isinstance(o, FileAttachment):
            return super().default(o)
        return o.to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
class MailConnection:
    """Wrapper around an IMAP4-over-SSL mailbox used to archive emails.

    Messages are fetched read-only (BODY.PEEK), attachments are stripped out
    of the message body and written to ``attachments_dir`` under a filename
    derived from the murmur3 hash of their content, so identical files are
    stored only once.
    """

    def __init__(self, host: str, username: str, password: str, attachments_dir: Path, save_inline_attachments: bool = False):
        # Connects and authenticates immediately; imaplib raises
        # imaplib.IMAP4.error on bad credentials.
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(username, password)
        self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
        self.save_inline_attachments = save_inline_attachments
        self.folder_structure = {}
        self.logger = logging.getLogger('iarchiver.mail_conn')
        self.logger.setLevel(logging.INFO)

    def load_folders(self):
        """Fetch the folder list from the server, cache it, and return it."""
        # Each LIST response line looks like: (\Flags) "/" "Folder Name".
        # Keep only the name, dropping quoting and escaped apostrophes.
        self.folder_structure = [
            f.decode().split(' "/" ')[1].replace('"', '').replace("\\'", "'")
            for f in self.mail.list()[1]
        ]
        return self.folder_structure

    def __fetch_email(self, i):
        """Fetch and parse a single message by UID.

        Returns a tuple ``(unix_timestamp, to, from, subject, raw_email,
        attachments)`` or ``None`` when the message is missing, malformed,
        addressless, or has no subject.
        """
        try:
            # BODY.PEEK[] fetches the full message without setting \Seen.
            result, data = self.mail.uid('fetch', str(i), '(BODY.PEEK[])')
            if data[0] is None:
                return None
            raw_email_bytes = data[0][1]
            try:
                detected = chardet.detect(raw_email_bytes)
            except TypeError as e:
                # data[0][1] was not bytes (server error / timeout mid-fetch).
                self.logger.critical(f'Failed to decode an email. Timeout? Server error? - "{e}"')
                return None
            encoding = detected['encoding'] or 'utf-8'
            raw_email = raw_email_bytes.decode(encoding, errors='replace')
            email_message = email.message_from_string(raw_email)

            date_header = email_message['Date'] or 'Thu, 1 Jan 1970 00:00:00 +0000'
            try:
                parsed_date = parsedate_to_datetime(date_header)
            except (TypeError, ValueError):
                # BUG FIX: an unparseable Date header used to raise out of this
                # method and drop the whole message via the catch-all below.
                # Fall back to the epoch instead, as for a missing header.
                parsed_date = parsedate_to_datetime('Thu, 1 Jan 1970 00:00:00 +0000')
            unix_timestamp = int(parsed_date.timestamp())

            from_addr = email_message['From']
            to_addr = email_message['To']
            if not from_addr and not to_addr:
                # A message with neither sender nor recipient is not archivable.
                return None
            from_addr = from_addr or ''
            to_addr = to_addr or ''

            from_header = ', '.join(extract_emails(from_addr))
            to_header = ', '.join(extract_emails(to_addr))
            # Fall back to the raw header when no address could be extracted.
            if '@' not in from_header:
                from_header = from_addr
            if '@' not in to_header:
                to_header = to_addr

            subject_header = email_message['Subject']
            if not subject_header:
                # Messages without a subject are deliberately skipped.
                return None
            # NOTE(review): only the first decoded fragment of the subject is
            # kept; multi-fragment RFC 2047 subjects are truncated (matches
            # prior behavior).
            subject = decode_header(subject_header)[0][0]
            if isinstance(subject, bytes):
                try:
                    encoding = chardet.detect(subject)['encoding'] or 'utf-8'
                    subject = subject.decode(encoding, errors='replace')
                except UnicodeDecodeError:
                    subject = subject.decode('utf-8')

            attachments = self.__extract_attachments(email_message)
            # Re-serialize after attachment payloads were replaced by hashes.
            raw_email_clean = email_message.as_string()
            return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed SystemExit
            # and KeyboardInterrupt. Log the traceback and drop the message.
            self.logger.critical(traceback.format_exc())
            return None

    def __extract_attachments(self, email_message):
        """Strip attachment payloads from ``email_message`` (mutates it),
        save each payload to ``attachments_dir`` keyed by content hash, and
        return the list of FileAttachment records."""
        attachments = []
        if not email_message.is_multipart():
            return attachments
        for part in email_message.walk():
            content_disposition = str(part.get("Content-Disposition"))
            if 'attachment' not in content_disposition and not (self.save_inline_attachments and 'inline' in content_disposition):
                continue
            filename = part.get_filename()
            if not filename:
                continue
            filecontents = part.get_payload(decode=True)
            if not filecontents:
                continue
            # The filename of the file is the hash of its content, which
            # should de-duplicate files.
            filehash = murmur3_chunked(filecontents)
            part.set_payload(f'MMH3:{filehash}')  # replace the attachment with its hash
            filepath = self.attachments_dir / f'F{filehash}'
            file_obj = FileAttachment(filename, filehash, part.get_content_type())
            if not filepath.is_file():
                with open(filepath, 'wb') as f:
                    f.write(filecontents)
            attachments.append(file_obj)
        return attachments

    def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1):
        """Select ``folder`` read-only and yield parsed messages, newest first.

        Don't use multiple threads because most mail servers don't allow the
        client to multiplex.
        """
        if not search_criterion:
            search_criterion = ['ALL']
        self.mail.select(normalize_for_imap_folder(folder), readonly=True)
        for search_item in search_criterion:
            result, data = self.mail.uid('search', search_item)
            id_list = data[0].split()
            if not id_list:
                # Empty folder
                return
            first_email_id = int(id_list[0])
            latest_email_id = int(id_list[-1])

            with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
                # BUG FIX: the stop bound was `first_email_id`, which range()
                # excludes, so the oldest message in every folder was silently
                # never fetched. `first_email_id - 1` makes the range inclusive.
                futures = {
                    executor.submit(self.__fetch_email, i)
                    for i in range(latest_email_id, first_email_id - 1, -1)
                }
                for future in concurrent.futures.as_completed(futures):
                    result = future.result()
                    if result is not None:
                        yield result
|