2024-03-05 23:57:38 -07:00
|
|
|
import concurrent.futures
|
|
|
|
import email
|
|
|
|
import imaplib
|
2024-03-06 00:04:11 -07:00
|
|
|
import logging
|
2024-03-06 14:25:36 -07:00
|
|
|
import traceback
|
2024-03-05 23:57:38 -07:00
|
|
|
from email.header import decode_header
|
|
|
|
from email.utils import parsedate_to_datetime
|
|
|
|
from json import JSONEncoder
|
|
|
|
from pathlib import Path
|
2024-03-06 11:36:43 -07:00
|
|
|
from typing import List
|
2024-03-05 23:57:38 -07:00
|
|
|
|
|
|
|
import chardet
|
|
|
|
|
2024-03-06 11:36:43 -07:00
|
|
|
from iarchiver.email import extract_emails, normalize_for_imap_folder
|
2024-03-06 12:08:12 -07:00
|
|
|
from iarchiver.hash import murmur3_chunked
|
2024-03-05 23:57:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
class FileAttachment:
    """Value object describing one e-mail attachment: its original name,
    the content hash it is stored under, and its MIME content type."""

    def __init__(self, file_name: str, file_hash: str, content_type: str):
        self.filename = file_name
        self.hash = file_hash
        self.content_type = content_type

    def to_dict(self):
        """Return a JSON-serializable dict of this attachment's fields."""
        # Attributes are set in __init__ in the exact key order callers expect,
        # so a shallow copy of the instance namespace is equivalent to building
        # the dict by hand.
        return dict(vars(self))
|
2024-03-05 23:57:38 -07:00
|
|
|
|
|
|
|
|
|
|
|
class FileAttachmentEncoder(JSONEncoder):
    """JSONEncoder that serializes FileAttachment instances via their
    to_dict() representation; everything else defers to the base encoder."""

    def default(self, o):
        # Guard clause: anything that is not a FileAttachment is handled
        # (or rejected with TypeError) by JSONEncoder itself.
        if not isinstance(o, FileAttachment):
            return super().default(o)
        return o.to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
class MailConnection:
    """Wrapper around an IMAP4-over-SSL mailbox used to archive emails.

    Messages are fetched read-only (BODY.PEEK), attachments are stripped out
    of the message body and written to ``attachments_dir`` under a filename
    derived from the murmur3 hash of their content, so identical files are
    stored only once.
    """

    def __init__(self, host: str, username: str, password: str, attachments_dir: Path, save_inline_attachments: bool = False):
        # Connects and authenticates immediately; imaplib raises
        # imaplib.IMAP4.error on bad credentials.
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(username, password)
        self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
        self.save_inline_attachments = save_inline_attachments
        self.folder_structure = {}
        self.logger = logging.getLogger('iarchiver.mail_conn')
        self.logger.setLevel(logging.INFO)

    def load_folders(self):
        """Fetch the folder list from the server, cache it, and return it."""
        # Each LIST response line looks like: (\Flags) "/" "Folder Name".
        # Keep only the name, dropping quoting and escaped apostrophes.
        self.folder_structure = [
            f.decode().split(' "/" ')[1].replace('"', '').replace("\\'", "'")
            for f in self.mail.list()[1]
        ]
        return self.folder_structure

    def __fetch_email(self, i):
        """Fetch and parse a single message by UID.

        Returns a tuple ``(unix_timestamp, to, from, subject, raw_email,
        attachments)`` or ``None`` when the message is missing, malformed,
        addressless, or has no subject.
        """
        try:
            # BODY.PEEK[] fetches the full message without setting \Seen.
            result, data = self.mail.uid('fetch', str(i), '(BODY.PEEK[])')
            if data[0] is None:
                return None
            raw_email_bytes = data[0][1]
            try:
                detected = chardet.detect(raw_email_bytes)
            except TypeError as e:
                # data[0][1] was not bytes (server error / timeout mid-fetch).
                self.logger.critical(f'Failed to decode an email. Timeout? Server error? - "{e}"')
                return None
            encoding = detected['encoding'] or 'utf-8'
            raw_email = raw_email_bytes.decode(encoding, errors='replace')
            email_message = email.message_from_string(raw_email)

            date_header = email_message['Date'] or 'Thu, 1 Jan 1970 00:00:00 +0000'
            try:
                parsed_date = parsedate_to_datetime(date_header)
            except (TypeError, ValueError):
                # BUG FIX: an unparseable Date header used to raise out of this
                # method and drop the whole message via the catch-all below.
                # Fall back to the epoch instead, as for a missing header.
                parsed_date = parsedate_to_datetime('Thu, 1 Jan 1970 00:00:00 +0000')
            unix_timestamp = int(parsed_date.timestamp())

            from_addr = email_message['From']
            to_addr = email_message['To']
            if not from_addr and not to_addr:
                # A message with neither sender nor recipient is not archivable.
                return None
            from_addr = from_addr or ''
            to_addr = to_addr or ''

            from_header = ', '.join(extract_emails(from_addr))
            to_header = ', '.join(extract_emails(to_addr))
            # Fall back to the raw header when no address could be extracted.
            if '@' not in from_header:
                from_header = from_addr
            if '@' not in to_header:
                to_header = to_addr

            subject_header = email_message['Subject']
            if not subject_header:
                # Messages without a subject are deliberately skipped.
                return None
            # NOTE(review): only the first decoded fragment of the subject is
            # kept; multi-fragment RFC 2047 subjects are truncated (matches
            # prior behavior).
            subject = decode_header(subject_header)[0][0]
            if isinstance(subject, bytes):
                try:
                    encoding = chardet.detect(subject)['encoding'] or 'utf-8'
                    subject = subject.decode(encoding, errors='replace')
                except UnicodeDecodeError:
                    subject = subject.decode('utf-8')

            attachments = self.__extract_attachments(email_message)
            # Re-serialize after attachment payloads were replaced by hashes.
            raw_email_clean = email_message.as_string()
            return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed SystemExit
            # and KeyboardInterrupt. Log the traceback and drop the message.
            self.logger.critical(traceback.format_exc())
            return None

    def __extract_attachments(self, email_message):
        """Strip attachment payloads from ``email_message`` (mutates it),
        save each payload to ``attachments_dir`` keyed by content hash, and
        return the list of FileAttachment records."""
        attachments = []
        if not email_message.is_multipart():
            return attachments
        for part in email_message.walk():
            content_disposition = str(part.get("Content-Disposition"))
            if 'attachment' not in content_disposition and not (self.save_inline_attachments and 'inline' in content_disposition):
                continue
            filename = part.get_filename()
            if not filename:
                continue
            filecontents = part.get_payload(decode=True)
            if not filecontents:
                continue
            # The filename of the file is the hash of its content, which
            # should de-duplicate files.
            filehash = murmur3_chunked(filecontents)
            part.set_payload(f'MMH3:{filehash}')  # replace the attachment with its hash
            filepath = self.attachments_dir / f'F{filehash}'
            file_obj = FileAttachment(filename, filehash, part.get_content_type())
            if not filepath.is_file():
                with open(filepath, 'wb') as f:
                    f.write(filecontents)
            attachments.append(file_obj)
        return attachments

    def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1):
        """Select ``folder`` read-only and yield parsed messages, newest first.

        Don't use multiple threads because most mail servers don't allow the
        client to multiplex.
        """
        if not search_criterion:
            search_criterion = ['ALL']
        self.mail.select(normalize_for_imap_folder(folder), readonly=True)
        for search_item in search_criterion:
            result, data = self.mail.uid('search', search_item)
            id_list = data[0].split()
            if not id_list:
                # Empty folder
                return
            first_email_id = int(id_list[0])
            latest_email_id = int(id_list[-1])

            with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
                # BUG FIX: the stop bound was `first_email_id`, which range()
                # excludes, so the oldest message in every folder was silently
                # never fetched. `first_email_id - 1` makes the range inclusive.
                futures = {
                    executor.submit(self.__fetch_email, i)
                    for i in range(latest_email_id, first_email_id - 1, -1)
                }
                for future in concurrent.futures.as_completed(futures):
                    result = future.result()
                    if result is not None:
                        yield result
|