improve hashing
This commit is contained in:
parent
cfacb77777
commit
8643fd247a
|
@ -5,6 +5,7 @@ import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
from iarchiver.hash import murmur3_chunked
|
||||||
from iarchiver.mail_conn import FileAttachment, FileAttachmentEncoder
|
from iarchiver.mail_conn import FileAttachment, FileAttachmentEncoder
|
||||||
|
|
||||||
|
|
||||||
|
@ -39,26 +40,27 @@ class EmailDatabase:
|
||||||
if sanitized_table_name in self.__restricted_strings:
|
if sanitized_table_name in self.__restricted_strings:
|
||||||
raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
|
raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
|
||||||
cursor = self.conn.cursor()
|
cursor = self.conn.cursor()
|
||||||
cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
|
cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw_content TEXT, raw_content_hash TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
|
||||||
cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
|
cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
|
def insert_email(self, folder: str, timestamp: int, subject: str, raw_content: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
|
||||||
|
raw_content_hash = murmur3_chunked(raw_content.encode())
|
||||||
sanitized_table_name = sanitize_table_name(folder)
|
sanitized_table_name = sanitize_table_name(folder)
|
||||||
self.__create_table(folder)
|
self.__create_table(folder)
|
||||||
cursor = self.conn.cursor()
|
cursor = self.conn.cursor()
|
||||||
|
|
||||||
# Check if record already exists
|
# Check if record already exists
|
||||||
stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw = ?"
|
stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw_content_hash = ?"
|
||||||
cursor.execute(stmt_check, (timestamp, raw))
|
cursor.execute(stmt_check, (timestamp, raw_content_hash))
|
||||||
data = cursor.fetchone()
|
data = cursor.fetchone()
|
||||||
|
|
||||||
# If record does not exist, insert it
|
# If record does not exist, insert it
|
||||||
new_email = False
|
new_email = False
|
||||||
if data is None:
|
if data is None:
|
||||||
stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)"
|
stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, attachments) VALUES (?, ?, ?, ?, ?, ?, ?)"
|
||||||
cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
|
cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw_content, raw_content_hash, json.dumps(attachments, cls=FileAttachmentEncoder)))
|
||||||
self.conn.commit()
|
self.conn.commit()
|
||||||
new_email = True
|
new_email = True
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
import mmh3
|
||||||
|
|
||||||
|
|
||||||
|
def md5_chunked(data: bytes, size: int = 1024):
    """Return the hexadecimal MD5 digest of ``data``, hashing it chunk by chunk.

    Args:
        data: The bytes to hash.
        size: Number of bytes fed to each ``update()`` call. Must be >= 1.

    Returns:
        The digest as a hexadecimal string (``hashlib`` ``hexdigest()``).

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size < 1:
        # A negative step makes range() empty, which would silently return
        # the digest of b'' regardless of the input; fail loudly instead.
        raise ValueError(f'size must be a positive integer, got {size}')
    m = hashlib.md5()
    # Slice through a memoryview so each chunk is a view, not a copy.
    view = memoryview(data)
    for i in range(0, len(view), size):
        m.update(view[i:i + size])
    return m.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def murmur3_chunked(data: bytes, size: int = 1024):
    """Return a MurmurHash3-based fingerprint of ``data`` as a decimal string.

    ``data`` is split into ``size``-byte chunks, each chunk is hashed with
    ``mmh3.hash``, the decimal representations of those chunk hashes are
    concatenated, and the concatenation is hashed once more with
    ``mmh3.hash``.

    Args:
        data: The bytes to hash.
        size: Chunk length in bytes. Must be >= 1.

    Returns:
        ``str()`` of the final ``mmh3.hash`` value.

    Raises:
        ValueError: If ``size`` is smaller than 1.

    NOTE(review): per the mmh3 documentation ``mmh3.hash`` yields a 32-bit
    signed integer by default, so the result space is small. The chunk
    hashes are also joined without a separator. This is not a cryptographic
    digest; confirm that collisions are acceptable wherever the value is
    used as a unique key. The output is deliberately left unchanged here so
    that already-stored hashes keep matching.
    """
    if size < 1:
        # A negative step makes range() empty, so every input would hash
        # exactly like empty data; reject it instead of returning that.
        raise ValueError(f'size must be a positive integer, got {size}')
    hashes = [mmh3.hash(data[i:i + size]) for i in range(0, len(data), size)]
    # Fold the per-chunk hashes into one value by hashing their joined text.
    combined_hash = mmh3.hash(''.join(map(str, hashes)))
    return str(combined_hash)
|
|
@ -1,6 +1,5 @@
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import email
|
import email
|
||||||
import hashlib
|
|
||||||
import imaplib
|
import imaplib
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
@ -13,13 +12,7 @@ from typing import List
|
||||||
import chardet
|
import chardet
|
||||||
|
|
||||||
from iarchiver.email import extract_emails, normalize_for_imap_folder
|
from iarchiver.email import extract_emails, normalize_for_imap_folder
|
||||||
|
from iarchiver.hash import murmur3_chunked
|
||||||
|
|
||||||
def md5_chunked(data: bytes, size: int = 1024):
|
|
||||||
m = hashlib.md5()
|
|
||||||
for i in range(0, len(data), size):
|
|
||||||
m.update(data[i:i + size])
|
|
||||||
return m.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
class FileAttachment:
|
class FileAttachment:
|
||||||
|
@ -115,7 +108,7 @@ class MailConnection:
|
||||||
filecontents = part.get_payload(decode=True)
|
filecontents = part.get_payload(decode=True)
|
||||||
if not filecontents:
|
if not filecontents:
|
||||||
continue
|
continue
|
||||||
filehash = md5_chunked(filecontents)
|
filehash = murmur3_chunked(filecontents)
|
||||||
part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash
|
part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash
|
||||||
filepath = self.attachments_dir / filehash
|
filepath = self.attachments_dir / filehash
|
||||||
file_obj = FileAttachment(filename, filehash)
|
file_obj = FileAttachment(filename, filehash)
|
||||||
|
|
4
main.py
4
main.py
|
@ -58,8 +58,8 @@ def main(args):
|
||||||
search_criterion = ['ALL']
|
search_criterion = ['ALL']
|
||||||
|
|
||||||
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
|
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
|
||||||
timestamp, to_email, from_email, subject, raw, attachments = email
|
timestamp, to_email, from_email, subject, raw_content, attachments = email
|
||||||
is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
|
is_new_email = database.insert_email(folder_name, timestamp, subject, raw_content, to_email, from_email, attachments)
|
||||||
if is_new_email:
|
if is_new_email:
|
||||||
new_emails += 1
|
new_emails += 1
|
||||||
if len(attachments):
|
if len(attachments):
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
pyyaml==6.0.1
|
pyyaml==6.0.1
|
||||||
chardet==5.2.0
|
chardet==5.2.0
|
||||||
humanize==4.9.0
|
humanize==4.9.0
|
||||||
|
mmh3==4.1.0
|
Loading…
Reference in New Issue