diff --git a/iarchiver/database.py b/iarchiver/database.py
index 8b74f3c..bee4b09 100644
--- a/iarchiver/database.py
+++ b/iarchiver/database.py
@@ -5,6 +5,7 @@ import time
 from pathlib import Path
 from typing import List
 
+from iarchiver.hash import murmur3_chunked
 from iarchiver.mail_conn import FileAttachment, FileAttachmentEncoder
 
 
@@ -39,26 +40,33 @@ class EmailDatabase:
         if sanitized_table_name in self.__restricted_strings:
             raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
         cursor = self.conn.cursor()
-        cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
+        cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw_content TEXT, raw_content_hash TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
         cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
         self.conn.commit()
         cursor.close()
 
-    def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
+    def insert_email(self, folder: str, timestamp: int, subject: str, raw_content: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
+        """Insert an email into the folder's table unless it is already stored.
+
+        An existing row is detected by matching ``timestamp`` together with the
+        MurmurHash3 digest of ``raw_content``, so the lookup never has to
+        compare the full message body.
+        """
+        raw_content_hash = murmur3_chunked(raw_content.encode())
         sanitized_table_name = sanitize_table_name(folder)
         self.__create_table(folder)
         cursor = self.conn.cursor()
 
         # Check if record already exists
-        stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw = ?"
-        cursor.execute(stmt_check, (timestamp, raw))
+        stmt_check = f"SELECT 1 FROM {sanitized_table_name} WHERE timestamp = ? AND raw_content_hash = ? LIMIT 1"
+        cursor.execute(stmt_check, (timestamp, raw_content_hash))
         data = cursor.fetchone()
 
         # If record does not exist, insert it
         new_email = False
         if data is None:
-            stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)"
-            cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
+            stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, attachments) VALUES (?, ?, ?, ?, ?, ?, ?)"
+            cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, json.dumps(attachments, cls=FileAttachmentEncoder)))
             self.conn.commit()
             new_email = True
         cursor.close()
diff --git a/iarchiver/hash.py b/iarchiver/hash.py
new file mode 100644
index 0000000..ac4585f
--- /dev/null
+++ b/iarchiver/hash.py
@@ -0,0 +1,21 @@
+import hashlib
+
+import mmh3
+
+
+def md5_chunked(data: bytes, size: int = 1024):
+    """Return the hex MD5 digest of ``data``, hashed ``size`` bytes at a time."""
+    m = hashlib.md5()
+    for i in range(0, len(data), size):
+        m.update(data[i:i + size])
+    return m.hexdigest()
+
+
+def murmur3_chunked(data: bytes, size: int = 1024):
+    """Return the 128-bit MurmurHash3 digest of ``data`` as 32 hex characters.
+
+    The digest names attachment files and de-duplicates emails, so a 32-bit
+    value collides too easily; the unsigned 128-bit form also cannot yield a
+    leading ``-``. ``size`` is accepted for backward compatibility and unused.
+    """
+    return f'{mmh3.hash128(data):032x}'
diff --git a/iarchiver/mail_conn.py b/iarchiver/mail_conn.py
index 8380bb2..f8c9a71 100644
--- a/iarchiver/mail_conn.py
+++ b/iarchiver/mail_conn.py
@@ -1,6 +1,5 @@
 import concurrent.futures
 import email
-import hashlib
 import imaplib
 import logging
 import sys
@@ -13,13 +12,7 @@ from typing import List
 import chardet
 
 from iarchiver.email import extract_emails, normalize_for_imap_folder
-
-
-def md5_chunked(data: bytes, size: int = 1024):
-    m = hashlib.md5()
-    for i in range(0, len(data), size):
-        m.update(data[i:i + size])
-    return m.hexdigest()
+from iarchiver.hash import murmur3_chunked
 
 
 class FileAttachment:
@@ -115,7 +108,8 @@ class MailConnection:
                 filecontents = part.get_payload(decode=True)
                 if not filecontents:
                     continue
-                filehash = md5_chunked(filecontents)
+                filehash = murmur3_chunked(filecontents)
+                # NOTE(review): the 'MD5:' prefix below is now a misnomer (digest is MurmurHash3); confirm no reader depends on it before renaming.
                 part.set_payload(f'MD5:{filehash}')  # replace the attachment with its hash
                 filepath = self.attachments_dir / filehash
                 file_obj = FileAttachment(filename, filehash)
diff --git a/main.py b/main.py
index ec2879a..a328d52 100755
--- a/main.py
+++ b/main.py
@@ -58,8 +58,8 @@ def main(args):
             search_criterion = ['ALL']
 
         for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
-            timestamp, to_email, from_email, subject, raw, attachments = email
-            is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
+            timestamp, to_email, from_email, subject, raw_content, attachments = email
+            is_new_email = database.insert_email(folder_name, timestamp, subject, raw_content, to_email, from_email, attachments)
             if is_new_email:
                 new_emails += 1
                 if len(attachments):
diff --git a/requirements.txt b/requirements.txt
index d218722..58f66a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 pyyaml==6.0.1
 chardet==5.2.0
-humanize==4.9.0
\ No newline at end of file
+humanize==4.9.0
+mmh3==4.1.0
\ No newline at end of file