imap-archiver/iarchiver/mail_conn.py

150 lines
6.3 KiB
Python
Raw Permalink Normal View History

2024-03-05 23:57:38 -07:00
import concurrent.futures
import email
import imaplib
2024-03-06 00:04:11 -07:00
import logging
2024-03-06 14:25:36 -07:00
import traceback
2024-03-05 23:57:38 -07:00
from email.header import decode_header
from email.utils import parsedate_to_datetime
from json import JSONEncoder
from pathlib import Path
from typing import List
2024-03-05 23:57:38 -07:00
import chardet
from iarchiver.email import extract_emails, normalize_for_imap_folder
2024-03-06 12:08:12 -07:00
from iarchiver.hash import murmur3_chunked
2024-03-05 23:57:38 -07:00
class FileAttachment:
    """
    Metadata describing a single attachment that was pulled out of an email.
    """

    def __init__(self, file_name: str, file_hash: str, content_type: str):
        """
        :param file_name: name of the attachment, stored as ``filename``.
        :param file_hash: hash of the attachment content, stored as ``hash``.
        :param content_type: MIME content type of the attachment.
        """
        self.filename, self.hash, self.content_type = file_name, file_hash, content_type

    def to_dict(self):
        """
        Return the attachment metadata as a plain, JSON-friendly dict.
        """
        return dict(filename=self.filename, hash=self.hash, content_type=self.content_type)
2024-03-05 23:57:38 -07:00
class FileAttachmentEncoder(JSONEncoder):
    """
    JSON encoder that serialises ``FileAttachment`` objects through their ``to_dict()`` method.
    """

    def default(self, o):
        """
        Convert ``o`` to something JSON can represent; anything that is not a
        ``FileAttachment`` is handed to the base encoder.
        """
        if not isinstance(o, FileAttachment):
            return super().default(o)
        return o.to_dict()
class MailConnection:
    """
    An authenticated IMAP-over-SSL session that downloads messages and stores their attachments on disk.

    Attachment payloads are written into ``attachments_dir`` under a name derived from their
    content hash, and are replaced inside the message text by an ``MMH3:<hash>`` marker.
    """

    def __init__(self, host: str, username: str, password: str, attachments_dir: Path, save_inline_attachments: bool = False):
        """
        Open the SSL connection to ``host`` and log in.

        :param host: hostname of the IMAP server.
        :param username: login name.
        :param password: login password.
        :param attachments_dir: directory attachments are written to; ``~`` is expanded and the path is resolved.
        :param save_inline_attachments: when true, parts with an ``inline`` Content-Disposition are extracted as well.
        """
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(username, password)
        self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
        self.save_inline_attachments = save_inline_attachments
        # load_folders() stores a list here, so start with an empty list rather than a dict.
        self.folder_structure: List[str] = []
        self.logger = logging.getLogger('iarchiver.mail_conn')
        self.logger.setLevel(logging.INFO)

    def load_folders(self):
        """
        Ask the server for its folders and cache their names in ``self.folder_structure``.

        :return: list of folder names with the surrounding double quotes removed.
        """
        folders = []
        for item in self.mail.list()[1]:
            if isinstance(item, tuple):
                # imaplib returns a (header, literal) pair when the server sent data as an
                # IMAP literal; for a LIST reply the literal is presumably the folder name.
                folders.append(item[1].decode())
                continue
            if not item:
                # None (no folders) or the empty remainder that follows a literal.
                continue
            # A LIST line has the shape: (flags) "delimiter" name
            # Split on the structure instead of assuming the delimiter is "/".
            _flags, closing, remainder = item.decode().partition(') ')
            if not closing:
                continue
            _delimiter, _, name = remainder.partition(' ')
            folders.append(name.replace('"', '').replace("\\'", "'"))
        self.folder_structure = folders
        return self.folder_structure

    def __fetch_email(self, i):
        """
        Download one message by UID and extract its metadata and attachments.

        :param i: UID of the message in the currently selected folder.
        :return: ``(unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments)``,
                 or ``None`` when the message is missing, has neither From nor To, has no Subject,
                 or any error occurred (errors are logged, never raised).
        """
        try:
            _, data = self.mail.uid('fetch', str(i), '(BODY.PEEK[])')  # fetch the email without marking it as read
            if data[0] is None:
                return
            raw_email_bytes = data[0][1]
            try:
                detected = chardet.detect(raw_email_bytes)
            except TypeError as e:
                self.logger.critical(f'Failed to decode an email. Timeout? Server error? - "{e}"')
                return
            encoding = detected['encoding']
            if not encoding:
                encoding = 'utf-8'
            raw_email = raw_email_bytes.decode(encoding, errors='replace')
            email_message = email.message_from_string(raw_email)

            # Messages without a Date header are stamped with the Unix epoch.
            date_header = email_message['Date']
            if not date_header:
                date_header = 'Thu, 1 Jan 1970 00:00:00 +0000'
            parsed_date = parsedate_to_datetime(date_header)
            unix_timestamp = int(parsed_date.timestamp())

            from_addr = email_message['From']
            to_addr = email_message['To']
            if not from_addr and not to_addr:
                return
            if not from_addr:
                from_addr = ''
            if not to_addr:
                to_addr = ''
            from_header = ', '.join(extract_emails(from_addr))
            to_header = ', '.join(extract_emails(to_addr))
            # Fall back to the raw header value when no address could be extracted.
            if '@' not in from_header:
                from_header = from_addr
            if '@' not in to_header:
                to_header = to_addr

            subject_header = email_message['Subject']
            if subject_header:
                # Only the first decoded fragment of the header is used.
                subject = decode_header(subject_header)[0][0]
                if isinstance(subject, bytes):
                    try:
                        detected = chardet.detect(subject)
                        encoding = detected['encoding']
                        if not encoding:
                            encoding = 'utf-8'
                        subject = subject.decode(encoding, errors='replace')
                    except UnicodeDecodeError:
                        subject = subject.decode('utf-8')
            else:
                # Messages without a Subject header are skipped.
                return

            attachments = []
            if email_message.is_multipart():
                for part in email_message.walk():
                    content_disposition = str(part.get("Content-Disposition"))
                    if 'attachment' in content_disposition or (self.save_inline_attachments and 'inline' in content_disposition):
                        filename = part.get_filename()
                        if filename:
                            # The filename of the file is the hash of its content, which should de-duplicate files.
                            filecontents = part.get_payload(decode=True)
                            if not filecontents:
                                continue
                            filehash = murmur3_chunked(filecontents)
                            part.set_payload(f'MMH3:{filehash}')  # replace the attachment with its hash
                            filepath = self.attachments_dir / f'F{filehash}'
                            file_obj = FileAttachment(filename, filehash, part.get_content_type())
                            if not filepath.is_file():
                                with open(filepath, 'wb') as f:
                                    f.write(filecontents)
                            attachments.append(file_obj)
            raw_email_clean = email_message.as_string()
            return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
        except Exception:
            # Best effort: one broken message must not abort the whole folder, but do not
            # swallow KeyboardInterrupt/SystemExit the way a bare ``except`` would.
            self.logger.critical(traceback.format_exc())
            return

    def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1):
        """
        Yield the messages of ``folder`` that match the search criteria.

        Don't use multiple threads because most mail servers don't allow the client to multiplex.

        :param folder: folder name; it is passed through ``normalize_for_imap_folder`` and selected read-only.
        :param search_criterion: IMAP SEARCH criteria, each run as its own search. Defaults to ``['ALL']``.
        :param max_threads: number of worker threads used to fetch messages.
        :return: generator of the tuples produced by ``__fetch_email``; messages that could not be
                 fetched or were skipped are omitted. Fetches are submitted highest UID first.
        """
        if not search_criterion:
            search_criterion = ['ALL']
        self.mail.select(normalize_for_imap_folder(folder), readonly=True)
        for search_item in search_criterion:
            _, data = self.mail.uid('search', search_item)
            uids = data[0].split() if data and data[0] else []
            if not uids:
                # Nothing matched this criterion; the remaining criteria may still match.
                continue
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
                # Fetch exactly the UIDs the server reported. Walking an integer range between
                # the first and last UID skips the lowest UID and requests UIDs that did not
                # match the search.
                futures = [executor.submit(self.__fetch_email, int(uid)) for uid in reversed(uids)]
                for future in concurrent.futures.as_completed(futures):
                    fetched = future.result()
                    if fetched is not None:
                        yield fetched