diff --git a/iarchiver/mail_conn.py b/iarchiver/mail_conn.py index 779beb9..3a1113a 100644 --- a/iarchiver/mail_conn.py +++ b/iarchiver/mail_conn.py @@ -2,6 +2,7 @@ import concurrent.futures import email import imaplib import logging +import traceback from email.header import decode_header from email.utils import parsedate_to_datetime from json import JSONEncoder @@ -15,12 +16,13 @@ from iarchiver.hash import murmur3_chunked class FileAttachment: - def __init__(self, file_name: str, file_hash: str): + def __init__(self, file_name: str, file_hash: str, content_type: str): self.filename = file_name self.hash = file_hash + self.content_type = content_type def to_dict(self): - return {'filename': self.filename, 'hash': self.hash} + return {'filename': self.filename, 'hash': self.hash, 'content_type': self.content_type} class FileAttachmentEncoder(JSONEncoder): @@ -44,79 +46,82 @@ class MailConnection: return self.folder_structure def __fetch_email(self, i): - result, data = self.mail.uid('fetch', str(i), '(BODY[])') # fetch the raw email - if data[0] is None: - return - raw_email_bytes = data[0][1] try: - detected = chardet.detect(raw_email_bytes) - except TypeError as e: - self.logger.critical(f'Failed to decode an email. Timeout? Server error? - "{e}"') + result, data = self.mail.uid('fetch', str(i), '(BODY[])') # fetch the raw email + if data[0] is None: + return + raw_email_bytes = data[0][1] + try: + detected = chardet.detect(raw_email_bytes) + except TypeError as e: + self.logger.critical(f'Failed to decode an email. Timeout? Server error? 
- "{e}"') + return + encoding = detected['encoding'] + if not encoding: + encoding = 'utf-8' + raw_email = raw_email_bytes.decode(encoding, errors='replace') + email_message = email.message_from_string(raw_email) + date_header = email_message['Date'] + if not date_header: + date_header = 'Thu, 1 Jan 1970 00:00:00 +0000' + parsed_date = email.utils.parsedate_to_datetime(date_header) + unix_timestamp = int(parsed_date.timestamp()) + + from_addr = email_message['From'] + to_addr = email_message['To'] + if not from_addr and not to_addr: + return + if not from_addr: + from_addr = '' + if not to_addr: + to_addr = '' + + from_header = ', '.join(extract_emails(from_addr)) + to_header = ', '.join(extract_emails(to_addr)) + if '@' not in from_header: + from_header = from_addr + if '@' not in to_header: + to_header = to_addr + + subject_header = email_message['Subject'] + if subject_header: + subject = decode_header(subject_header)[0][0] + if isinstance(subject, bytes): + try: + detected = chardet.detect(subject) + encoding = detected['encoding'] + if not encoding: + encoding = 'utf-8' + subject = subject.decode(encoding, errors='replace') + except UnicodeDecodeError: + subject = subject.decode('utf-8') + else: + return + + attachments = [] + if email_message.is_multipart(): + for part in email_message.walk(): + content_disposition = str(part.get("Content-Disposition")) + if "attachment" in content_disposition: + filename = part.get_filename() + if filename: + # The filename of the file is the hash of its content, which should de-duplicate files. 
+ filecontents = part.get_payload(decode=True) + if not filecontents: + continue + filehash = murmur3_chunked(filecontents) + part.set_payload(f'MMH3:{filehash}') # replace the attachment with its hash + filepath = self.attachments_dir / f'F{filehash}' + file_obj = FileAttachment(filename, filehash, part.get_content_type()) + if not filepath.is_file(): + with open(filepath, 'wb') as f: + f.write(filecontents) + attachments.append(file_obj) + raw_email_clean = email_message.as_string() + return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments + except Exception as e: + self.logger.critical(traceback.format_exc()) return - encoding = detected['encoding'] - if not encoding: - encoding = 'utf-8' - raw_email = raw_email_bytes.decode(encoding, errors='replace') - email_message = email.message_from_string(raw_email) - date_header = email_message['Date'] - if not date_header: - date_header = 'Thu, 1 Jan 1970 00:00:00 +0000' - parsed_date = email.utils.parsedate_to_datetime(date_header) - unix_timestamp = int(parsed_date.timestamp()) - - from_addr = email_message['From'] - to_addr = email_message['To'] - if not from_addr and not to_addr: - return - if not from_addr: - from_addr = '' - if not to_addr: - to_addr = '' - - from_header = ', '.join(extract_emails(from_addr)) - to_header = ', '.join(extract_emails(to_addr)) - if '@' not in from_header: - from_header = from_addr - if '@' not in to_header: - to_header = to_addr - - subject_header = email_message['Subject'] - if subject_header: - subject = decode_header(subject_header)[0][0] - if isinstance(subject, bytes): - try: - detected = chardet.detect(subject) - encoding = detected['encoding'] - if not encoding: - encoding = 'utf-8' - subject = subject.decode(encoding, errors='replace') - except UnicodeDecodeError: - subject = subject.decode('utf-8') - else: - return - - attachments = [] - if email_message.is_multipart(): - for part in email_message.walk(): - # content_type = 
part.get_content_type() - content_disposition = str(part.get("Content-Disposition")) - if "attachment" in content_disposition: - filename = part.get_filename() - if filename: - # The filename of the file is the hash of its content, which should de-duplicate files. - filecontents = part.get_payload(decode=True) - if not filecontents: - continue - filehash = murmur3_chunked(filecontents) - part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash - filepath = self.attachments_dir / filehash - file_obj = FileAttachment(filename, filehash) - if not filepath.is_file(): - with open(filepath, 'wb') as f: - f.write(filecontents) - attachments.append(file_obj) - raw_email_clean = email_message.as_string() - return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1): """ @@ -135,7 +140,6 @@ class MailConnection: first_email_id = int(id_list[0]) latest_email_id = int(id_list[-1]) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)} for future in concurrent.futures.as_completed(futures): diff --git a/requirements.txt b/requirements.txt index 58f66a5..430c549 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ pyyaml==6.0.1 chardet==5.2.0 humanize==4.9.0 -mmh3==4.1.0 \ No newline at end of file +mmh3==4.1.0 +flask==3.0.2 \ No newline at end of file diff --git a/server.py b/server.py index e69de29..a10a70d 100644 --- a/server.py +++ b/server.py @@ -0,0 +1,63 @@ +import json +import sqlite3 +from datetime import datetime +from pathlib import Path + +import magic +from flask import Flask, render_template, send_from_directory + +app = Flask(__name__) + + +def get_db_connection(): + conn = sqlite3.connect('emails.db') + conn.row_factory = sqlite3.Row + return conn + + +def dict_from_row(row): + return 
dict(zip(row.keys(), row)) + + +@app.route('/') +def index(): + conn = get_db_connection() + folders = conn.execute('SELECT name, table_name FROM folders_mapping').fetchall() + syncs = conn.execute('SELECT * FROM syncs ORDER BY timestamp DESC').fetchall() + conn.close() + syncs = [dict_from_row(sync) for sync in syncs] + for sync in syncs: + sync['timestamp'] = datetime.fromtimestamp(sync['timestamp']).strftime('%Y-%m-%d %H:%M:%S') + return render_template('index.html', folders=folders, syncs=syncs) + + +@app.route('/folder/<table_name>') +def folder(table_name): + conn = get_db_connection() + emails = conn.execute(f'SELECT * FROM {table_name} ORDER BY timestamp DESC').fetchall() + conn.close() + emails = [dict_from_row(email) for email in emails] + for email in emails: + email['timestamp'] = datetime.fromtimestamp(email['timestamp']).strftime('%Y-%m-%d %H:%M:%S') + return render_template('folder.html', emails=emails, table_name=table_name) + + +@app.route('/email/<table_name>/<id>') +def email(table_name, id): + conn = get_db_connection() + email = conn.execute(f'SELECT * FROM {table_name} WHERE id = ?', (id,)).fetchone() + conn.close() + email = dict_from_row(email) + email['timestamp'] = datetime.fromtimestamp(email['timestamp']).strftime('%Y-%m-%d %H:%M:%S') + attachments = json.loads(email['attachments']) + return render_template('email.html', email=email, attachments=attachments) + + +@app.route('/attachments/<filename>') +def download_file(filename): + mimetype = magic.from_file(str(Path('attachments', filename)), mime=True) + return send_from_directory('attachments', filename, mimetype=mimetype) + + +if __name__ == '__main__': + app.run(host='0.0.0.0', debug=True) diff --git a/templates/email.html b/templates/email.html new file mode 100644 index 0000000..fc2bd10 --- /dev/null +++ b/templates/email.html @@ -0,0 +1,17 @@ + + + + Email + + +

Email

+

Attachments

+ +

Content

+
{{ email.raw_content }}
+ + diff --git a/templates/folder.html b/templates/folder.html new file mode 100644 index 0000000..4b9e686 --- /dev/null +++ b/templates/folder.html @@ -0,0 +1,23 @@ + + + + Emails + + +

Emails

+ + + + diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..5e0ab9c --- /dev/null +++ b/templates/index.html @@ -0,0 +1,40 @@ + + + + Email Folders + + +

Email Folders

+ +

Last Syncs

+ + + + + + + + + + {% for sync in syncs %} + + + + + + + + + {% endfor %} +
TimestampTypeNew EmailsNew AttachmentsNew FoldersDuration
{{ sync.timestamp }}{{ sync.type }}{{ sync.new_emails }}{{ sync.new_attachments }}{{ sync.new_folders }}{{ sync.duration }}
+ + +