improve hashing

This commit is contained in:
Cyberes 2024-03-06 12:08:12 -07:00
parent cfacb77777
commit 8643fd247a
5 changed files with 30 additions and 18 deletions

View File

@ -5,6 +5,7 @@ import time
from pathlib import Path
from typing import List
from iarchiver.hash import murmur3_chunked
from iarchiver.mail_conn import FileAttachment, FileAttachmentEncoder
@ -39,26 +40,27 @@ class EmailDatabase:
if sanitized_table_name in self.__restricted_strings:
raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
cursor = self.conn.cursor()
cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw_content TEXT, raw_content_hash TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
self.conn.commit()
cursor.close()
def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
def insert_email(self, folder: str, timestamp: int, subject: str, raw_content: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
raw_content_hash = murmur3_chunked(raw_content.encode())
sanitized_table_name = sanitize_table_name(folder)
self.__create_table(folder)
cursor = self.conn.cursor()
# Check if record already exists
stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw = ?"
cursor.execute(stmt_check, (timestamp, raw))
stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw_content_hash = ?"
cursor.execute(stmt_check, (timestamp, raw_content_hash))
data = cursor.fetchone()
# If record does not exist, insert it
new_email = False
if data is None:
stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)"
cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, attachments) VALUES (?, ?, ?, ?, ?, ?, ?)"
cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, json.dumps(attachments, cls=FileAttachmentEncoder)))
self.conn.commit()
new_email = True
cursor.close()

16
iarchiver/hash.py Normal file
View File

@ -0,0 +1,16 @@
import hashlib
import mmh3
def md5_chunked(data: bytes, size: int = 1024):
    """Return the hexadecimal MD5 digest of ``data``, hashed chunk by chunk.

    Args:
        data: The bytes-like payload to hash.
        size: Number of bytes fed to the hasher per update. Must be positive.

    Returns:
        The MD5 digest of ``data`` as a hexadecimal string.

    Raises:
        ValueError: If ``size`` is zero or negative.
    """
    if size <= 0:
        # A negative step would make range() empty and silently yield the
        # digest of empty input; a zero step already raises ValueError in
        # range(). Reject both up front with one clear error.
        raise ValueError(f'size must be a positive integer, got {size}')
    m = hashlib.md5()
    # Slice through a memoryview so each chunk is a zero-copy window rather
    # than a freshly allocated bytes object.
    view = memoryview(data)
    for i in range(0, len(view), size):
        m.update(view[i:i + size])
    return m.hexdigest()
def murmur3_chunked(data: bytes, size: int = 1024):
    """Return a MurmurHash3-based fingerprint of ``data`` as a decimal string.

    The payload is cut into ``size``-byte chunks, each chunk is hashed with
    ``mmh3.hash``, the decimal renderings of those per-chunk hashes are
    concatenated with no separator, and that string is hashed once more.

    Args:
        data: The payload to fingerprint.
        size: Number of bytes per chunk.

    Returns:
        ``str()`` of the final ``mmh3.hash`` value.

    NOTE(review): ``mmh3.hash`` presumably returns a 32-bit signed integer by
    default (confirm against the mmh3 docs). If so, the result space is small
    for a deduplication key. The separator-less join also lets different
    chunk-hash sequences produce the same intermediate string (e.g. 1,23 and
    12,3). Changing either would alter already-stored hashes, so it is only
    flagged here.
    """
    chunk_digests = []
    for start in range(0, len(data), size):
        chunk = data[start:start + size]
        chunk_digests.append(str(mmh3.hash(chunk)))
    fingerprint = mmh3.hash(''.join(chunk_digests))
    return str(fingerprint)

View File

@ -1,6 +1,5 @@
import concurrent.futures
import email
import hashlib
import imaplib
import logging
import sys
@ -13,13 +12,7 @@ from typing import List
import chardet
from iarchiver.email import extract_emails, normalize_for_imap_folder
def md5_chunked(data: bytes, size: int = 1024):
m = hashlib.md5()
for i in range(0, len(data), size):
m.update(data[i:i + size])
return m.hexdigest()
from iarchiver.hash import murmur3_chunked
class FileAttachment:
@ -115,7 +108,7 @@ class MailConnection:
filecontents = part.get_payload(decode=True)
if not filecontents:
continue
filehash = md5_chunked(filecontents)
filehash = murmur3_chunked(filecontents)
part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash
filepath = self.attachments_dir / filehash
file_obj = FileAttachment(filename, filehash)

View File

@ -58,8 +58,8 @@ def main(args):
search_criterion = ['ALL']
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
timestamp, to_email, from_email, subject, raw, attachments = email
is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
timestamp, to_email, from_email, subject, raw_content, attachments = email
is_new_email = database.insert_email(folder_name, timestamp, subject, raw_content, to_email, from_email, attachments)
if is_new_email:
new_emails += 1
if len(attachments):

View File

@ -1,3 +1,4 @@
pyyaml==6.0.1
chardet==5.2.0
humanize==4.9.0
mmh3==4.1.0