improve hashing
This commit is contained in:
parent
cfacb77777
commit
8643fd247a
|
@ -5,6 +5,7 @@ import time
|
|||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from iarchiver.hash import murmur3_chunked
|
||||
from iarchiver.mail_conn import FileAttachment, FileAttachmentEncoder
|
||||
|
||||
|
||||
|
@ -39,26 +40,27 @@ class EmailDatabase:
|
|||
if sanitized_table_name in self.__restricted_strings:
|
||||
raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
|
||||
cursor = self.conn.cursor()
|
||||
cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
|
||||
cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw_content TEXT, raw_content_hash TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
|
||||
cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
|
||||
self.conn.commit()
|
||||
cursor.close()
|
||||
|
||||
def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
|
||||
def insert_email(self, folder: str, timestamp: int, subject: str, raw_content: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
|
||||
raw_content_hash = murmur3_chunked(raw_content.encode())
|
||||
sanitized_table_name = sanitize_table_name(folder)
|
||||
self.__create_table(folder)
|
||||
cursor = self.conn.cursor()
|
||||
|
||||
# Check if record already exists
|
||||
stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw = ?"
|
||||
cursor.execute(stmt_check, (timestamp, raw))
|
||||
stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw_content_hash = ?"
|
||||
cursor.execute(stmt_check, (timestamp, raw_content_hash))
|
||||
data = cursor.fetchone()
|
||||
|
||||
# If record does not exist, insert it
|
||||
new_email = False
|
||||
if data is None:
|
||||
stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)"
|
||||
cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
|
||||
stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, attachments) VALUES (?, ?, ?, ?, ?, ?, ?)"
|
||||
cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, json.dumps(attachments, cls=FileAttachmentEncoder)))
|
||||
self.conn.commit()
|
||||
new_email = True
|
||||
cursor.close()
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
import hashlib
|
||||
|
||||
import mmh3
|
||||
|
||||
|
||||
def md5_chunked(data: bytes, size: int = 1024) -> str:
    """Return the hexadecimal MD5 digest of *data*, hashed in chunks.

    Args:
        data: The bytes to hash.
        size: Number of bytes fed to the hasher per update; must be positive.

    Returns:
        The MD5 digest of the whole input as a hexadecimal string.

    Raises:
        ValueError: If *size* is not a positive integer.
    """
    # A negative size makes range() empty, which would silently return the
    # digest of empty input; reject it the same way range() rejects zero.
    if size <= 0:
        raise ValueError(f'size must be a positive integer, got {size}')
    m = hashlib.md5()
    # Slice through a memoryview so each chunk is passed on without a copy.
    view = memoryview(data)
    for i in range(0, len(view), size):
        m.update(view[i:i + size])
    return m.hexdigest()
|
||||
|
||||
|
||||
def murmur3_chunked(data: bytes, size: int = 1024) -> str:
    """Return a MurmurHash3-based fingerprint of *data*, hashed in chunks.

    Every ``size``-byte slice of *data* is hashed with ``mmh3.hash``; the
    per-chunk hashes are then rendered as decimal text, joined, and hashed
    once more to produce the final value.

    Args:
        data: The bytes to hash.
        size: Number of bytes per chunk; must be positive.

    Returns:
        The combined hash, converted to a string.

    Raises:
        ValueError: If *size* is not a positive integer.
    """
    # A negative size makes range() empty, so every input would silently
    # hash like empty data; reject it the same way range() rejects zero.
    if size <= 0:
        raise ValueError(f'size must be a positive integer, got {size}')
    hashes = [mmh3.hash(data[i:i + size]) for i in range(0, len(data), size)]
    # Join with a delimiter: without one, distinct chunk-hash sequences such
    # as (1, 23) and (12, 3) collapse into the same string and collide.
    # NOTE(review): this changes the result for inputs longer than one chunk
    # compared with an undelimited join; single-chunk and empty inputs are
    # unaffected.
    combined_hash = mmh3.hash(','.join(map(str, hashes)))
    return str(combined_hash)
|
|
@ -1,6 +1,5 @@
|
|||
import concurrent.futures
|
||||
import email
|
||||
import hashlib
|
||||
import imaplib
|
||||
import logging
|
||||
import sys
|
||||
|
@ -13,13 +12,7 @@ from typing import List
|
|||
import chardet
|
||||
|
||||
from iarchiver.email import extract_emails, normalize_for_imap_folder
|
||||
|
||||
|
||||
def md5_chunked(data: bytes, size: int = 1024) -> str:
    """Return the hexadecimal MD5 digest of *data*, hashed in chunks.

    Args:
        data: The bytes to hash.
        size: Number of bytes fed to the hasher per update; must be positive.

    Returns:
        The MD5 digest of the whole input as a hexadecimal string.

    Raises:
        ValueError: If *size* is not a positive integer.
    """
    # A negative size makes range() empty, which would silently return the
    # digest of empty input; reject it the same way range() rejects zero.
    if size <= 0:
        raise ValueError(f'size must be a positive integer, got {size}')
    m = hashlib.md5()
    # Slice through a memoryview so each chunk is passed on without a copy.
    view = memoryview(data)
    for i in range(0, len(view), size):
        m.update(view[i:i + size])
    return m.hexdigest()
|
||||
from iarchiver.hash import murmur3_chunked
|
||||
|
||||
|
||||
class FileAttachment:
|
||||
|
@ -115,7 +108,7 @@ class MailConnection:
|
|||
filecontents = part.get_payload(decode=True)
|
||||
if not filecontents:
|
||||
continue
|
||||
filehash = md5_chunked(filecontents)
|
||||
filehash = murmur3_chunked(filecontents)
|
||||
part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash
|
||||
filepath = self.attachments_dir / filehash
|
||||
file_obj = FileAttachment(filename, filehash)
|
||||
|
|
4
main.py
4
main.py
|
@ -58,8 +58,8 @@ def main(args):
|
|||
search_criterion = ['ALL']
|
||||
|
||||
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
|
||||
timestamp, to_email, from_email, subject, raw, attachments = email
|
||||
is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
|
||||
timestamp, to_email, from_email, subject, raw_content, attachments = email
|
||||
is_new_email = database.insert_email(folder_name, timestamp, subject, raw_content, to_email, from_email, attachments)
|
||||
if is_new_email:
|
||||
new_emails += 1
|
||||
if len(attachments):
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
pyyaml==6.0.1
|
||||
chardet==5.2.0
|
||||
humanize==4.9.0
|
||||
mmh3==4.1.0
|
Loading…
Reference in New Issue