remove parent/child folder layout, make work with gmail, fix issues

This commit is contained in:
Cyberes 2024-03-06 11:36:43 -07:00
parent b965538688
commit 184a9a78e1
6 changed files with 70 additions and 52 deletions

View File

@ -20,4 +20,5 @@ A sample systemd service file is included.
## To Do
- [ ] Fix subject decoding. Some character sets aren't detected correctly.
- [ ] Fix subject decoding. Some character sets aren't detected correctly.
- [ ] Sync Gmail categories as folders.

View File

@ -7,4 +7,8 @@ attachments_path: attachments
exclude_folders:
- Trash
- Drafts
- Drafts
- '[Gmail]/Trash'
- '[Gmail]/Drafts'
- '[Gmail]/Starred'
- '[Gmail]/Important'

View File

@ -10,3 +10,8 @@ def extract_emails(field: str):
def unix_timestamp_since_to_imap_timestamp(unix_timestamp: Union[int, float]) -> str:
    """
    Format a Unix timestamp as a ``DD-Mon-YYYY`` date string, moved back one day.

    The timestamp is interpreted by ``datetime.fromtimestamp`` (naive local
    time), one day is subtracted, and the result is rendered with the
    ``%d-%b-%Y`` pattern, e.g. ``08-Sep-2001``.

    :param unix_timestamp: seconds since the Unix epoch.
    :return: the date one day before the given instant, as ``DD-Mon-YYYY``.
    """
    moment = datetime.fromtimestamp(unix_timestamp)
    day_before = moment - timedelta(days=1)
    return day_before.strftime("%d-%b-%Y")
def normalize_for_imap_folder(folder: str) -> str:
    """
    Wrap a mailbox name in double quotes for use as an IMAP command argument.

    Any double quote inside the name is backslash-escaped so it cannot
    terminate the quoted string early. Apostrophes are not special inside an
    IMAP quoted string and are passed through unchanged.

    Backslashes are deliberately left untouched.
    NOTE(review): names taken from LIST responses may still carry the
    server's own backslash escaping; escaping them again here would double
    it. Confirm against the caller before tightening this.

    :param folder: mailbox name, e.g. ``[Gmail]/All Mail``.
    :return: the name surrounded by double quotes.
    """
    # The earlier `folder.replace("'", "\'")` did nothing: in Python the
    # literal "\'" is exactly the one-character string "'".
    escaped = folder.replace('"', '\\"')
    return f'"{escaped}"'

10
iarchiver/gmail.py Normal file
View File

@ -0,0 +1,10 @@
# Gmail `category:` search terms.
# Reference: https://support.google.com/mail/answer/7190?hl=en
# 'category:primary' is commented out below, so it is not part of the list.
gmail_categories = [
    # 'category:primary',
    'category:social',
    'category:promotions',
    'category:updates',
    'category:forums',
    'category:reservations',
    'category:purchases',
]

View File

@ -9,10 +9,11 @@ from email.header import decode_header
from email.utils import parsedate_to_datetime
from json import JSONEncoder
from pathlib import Path
from typing import List
import chardet
from iarchiver.email import extract_emails
from iarchiver.email import extract_emails, normalize_for_imap_folder
def md5_chunked(data: bytes, size: int = 1024):
@ -48,14 +49,7 @@ class MailConnection:
self.logger.setLevel(logging.INFO)
def load_folders(self):
folders = [tuple(f.decode().split(' "/" ')[1].replace('"', '').split('/')) for f in self.mail.list()[1]]
folder_structure = {}
for f in folders:
if not folder_structure.get(f[0]):
folder_structure[f[0]] = []
if len(f) > 1:
folder_structure[f[0]].append(f[1])
self.folder_structure = folder_structure
self.folder_structure = [f.decode().split(' "/" ')[1].replace('"', '').replace("\\'", "'") for f in self.mail.list()[1]]
return self.folder_structure
def __fetch_email(self, i):
@ -74,6 +68,8 @@ class MailConnection:
raw_email = raw_email_bytes.decode(encoding, errors='replace')
email_message = email.message_from_string(raw_email)
date_header = email_message['Date']
if not date_header:
date_header = 0
date = parsedate_to_datetime(date_header)
unix_timestamp = int(time.mktime(date.timetuple()))
@ -118,6 +114,8 @@ class MailConnection:
if filename:
# The filename of the file is the hash of its content, which should de-duplicate files.
filecontents = part.get_payload(decode=True)
if not filecontents:
continue
filehash = md5_chunked(filecontents)
part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash
filepath = self.attachments_dir / filehash
@ -129,23 +127,26 @@ class MailConnection:
raw_email_clean = email_message.as_string()
return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1):
"""
Don't use multiple threads because most mail servers don't allow the client to multiplex.
"""
self.mail.select(f'"{folder}"')
result, data = self.mail.uid('search', None, search_criterion)
mail_ids = data[0]
id_list = mail_ids.split()
if not len(id_list):
# Empty folder
return
first_email_id = int(id_list[0])
latest_email_id = int(id_list[-1])
if not search_criterion:
search_criterion = ['ALL']
self.mail.select(normalize_for_imap_folder(folder))
for search_item in search_criterion:
result, data = self.mail.uid('search', search_item)
mail_ids = data[0]
id_list = mail_ids.split()
if not len(id_list):
# Empty folder
return
first_email_id = int(id_list[0])
latest_email_id = int(id_list[-1])
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)}
for future in concurrent.futures.as_completed(futures):
result = future.result()
if result is not None:
yield result
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)}
for future in concurrent.futures.as_completed(futures):
result = future.result()
if result is not None:
yield result

47
run.py
View File

@ -33,40 +33,37 @@ def main(args):
mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir)
mail.load_folders()
logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...')
if config['server'] == 'imap.gmail.com':
mail.folder_structure.remove('INBOX') # We will use "'[Gmail]/All Mail'" instead
mail.folder_structure.remove('[Gmail]')
num_folders_to_sync = len(mail.folder_structure)
logger.info(f'Syncing {num_folders_to_sync} folders...')
new_emails = 0
new_attachments = 0
did_full_sync = False
sync_start_time = datetime.now()
for parent_folder, subfolders in mail.folder_structure.items():
if parent_folder in exclude_folders:
for folder_name in mail.folder_structure:
if folder_name in exclude_folders:
# Exclude folder
continue
for folder in [parent_folder, *subfolders]:
folder_name = parent_folder + '/' + folder
if folder_name == f'{parent_folder}/{parent_folder}':
folder_name = parent_folder
if folder_name in exclude_folders:
# Exclude folder
continue
logger.info(folder_name)
last_refresh = database.have_we_done_a_full_sync_at_all()
if last_refresh:
date = unix_timestamp_since_to_imap_timestamp(last_refresh)
search_criterion = '(SINCE "' + date + '")'
else:
did_full_sync = True
search_criterion = 'ALL'
logger.info(folder_name)
last_refresh = database.have_we_done_a_full_sync_at_all()
if last_refresh:
date = unix_timestamp_since_to_imap_timestamp(last_refresh)
search_criterion = ['(SINCE "' + date + '")']
else:
did_full_sync = True
search_criterion = ['ALL']
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
timestamp, to_email, from_email, subject, raw, attachments = email
is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
if is_new_email:
new_emails += 1
if len(attachments):
new_attachments += 1
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
timestamp, to_email, from_email, subject, raw, attachments = email
is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
if is_new_email:
new_emails += 1
if len(attachments):
new_attachments += 1
elapsed = datetime.now() - sync_start_time
database.finish_sync('refresh' if not did_full_sync else 'full', new_emails, new_attachments, int(elapsed.total_seconds()))