remove parent/child folder layout, make work with gmail, fix issues
parent b965538688
commit 184a9a78e1
@@ -20,4 +20,5 @@ A sample systemd service file is included.
 
 ## To Do
 
 - [ ] Fix subject decoding. Some character sets aren't detected correctly.
+- [ ] Sync Gmail categories as folders.
@@ -7,4 +7,8 @@ attachments_path: attachments
 
 exclude_folders:
 - Trash
 - Drafts
+- '[Gmail]/Trash'
+- '[Gmail]/Drafts'
+- '[Gmail]/Starred'
+- '[Gmail]/Important'
@@ -10,3 +10,8 @@ def extract_emails(field: str):
 
 def unix_timestamp_since_to_imap_timestamp(unix_timestamp: Union[int, float]) -> str:
     return (datetime.fromtimestamp(unix_timestamp) - timedelta(days=1)).strftime("%d-%b-%Y")
+
+
+def normalize_for_imap_folder(folder: str):
+    folder = folder.replace("'", "\\'")
+    return f'"{folder}"'
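For reference, the new helper escapes single quotes and double-quotes the folder name so names with spaces (like [Gmail]/All Mail) survive an IMAP SELECT. A minimal usage sketch, assuming an already-authenticated imaplib connection named mail:

    # 'mail' is a hypothetical, already-authenticated imaplib.IMAP4_SSL connection.
    folder = "[Gmail]/All Mail"
    mail.select(normalize_for_imap_folder(folder))  # sends: SELECT "[Gmail]/All Mail"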
@@ -0,0 +1,10 @@
+# https://support.google.com/mail/answer/7190?hl=en
+gmail_categories = [
+    # 'category:primary',
+    'category:social',
+    'category:promotions',
+    'category:updates',
+    'category:forums',
+    'category:reservations',
+    'category:purchases'
+]
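These strings are Gmail search operators, which points at how the "Sync Gmail categories as folders" to-do could work: Gmail's IMAP extension X-GM-RAW accepts raw Gmail search queries server-side. A hedged sketch, assuming an authenticated connection (credentials illustrative):

    import imaplib

    mail = imaplib.IMAP4_SSL('imap.gmail.com')
    mail.login('user@gmail.com', 'app-password')  # hypothetical credentials
    mail.select('"[Gmail]/All Mail"')
    for category in gmail_categories:
        # X-GM-RAW runs the Gmail search query on the server side.
        result, data = mail.uid('search', 'X-GM-RAW', f'"{category}"')
        print(category, len(data[0].split()), 'messages')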
@@ -9,10 +9,11 @@ from email.header import decode_header
 from email.utils import parsedate_to_datetime
 from json import JSONEncoder
 from pathlib import Path
+from typing import List
 
 import chardet
 
-from iarchiver.email import extract_emails
+from iarchiver.email import extract_emails, normalize_for_imap_folder
 
 
 def md5_chunked(data: bytes, size: int = 1024):
@@ -48,14 +49,7 @@ class MailConnection:
         self.logger.setLevel(logging.INFO)
 
     def load_folders(self):
-        folders = [tuple(f.decode().split(' "/" ')[1].replace('"', '').split('/')) for f in self.mail.list()[1]]
-        folder_structure = {}
-        for f in folders:
-            if not folder_structure.get(f[0]):
-                folder_structure[f[0]] = []
-            if len(f) > 1:
-                folder_structure[f[0]].append(f[1])
-        self.folder_structure = folder_structure
+        self.folder_structure = [f.decode().split(' "/" ')[1].replace('"', '').replace("\\'", "'") for f in self.mail.list()[1]]
         return self.folder_structure
 
     def __fetch_email(self, i):
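For context, imaplib LIST responses are byte strings such as (\HasNoChildren) "/" "INBOX", which is what the flattened parser splits apart; folder paths now stay as single slash-delimited names instead of the old parent/child dict. A quick sketch with illustrative values:

    # Illustrative imaplib LIST response lines:
    raw = [b'(\\HasNoChildren) "/" "INBOX"', b'(\\HasNoChildren) "/" "Work/Receipts"']
    folders = [f.decode().split(' "/" ')[1].replace('"', '').replace("\\'", "'") for f in raw]
    print(folders)  # ['INBOX', 'Work/Receipts']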
@@ -74,6 +68,8 @@ class MailConnection:
         raw_email = raw_email_bytes.decode(encoding, errors='replace')
         email_message = email.message_from_string(raw_email)
         date_header = email_message['Date']
+        if not date_header:
+            date_header = 'Thu, 01 Jan 1970 00:00:00 +0000'  # fall back to the epoch; parsedate_to_datetime() needs a date string
         date = parsedate_to_datetime(date_header)
         unix_timestamp = int(time.mktime(date.timetuple()))
 
@@ -118,6 +114,8 @@ class MailConnection:
             if filename:
                 # The filename of the file is the hash of its content, which should de-duplicate files.
                 filecontents = part.get_payload(decode=True)
+                if not filecontents:
+                    continue
                 filehash = md5_chunked(filecontents)
                 part.set_payload(f'MD5:{filehash}')  # replace the attachment with its hash
                 filepath = self.attachments_dir / filehash
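The dedup scheme here is content-addressed storage: each attachment is written once under the MD5 of its bytes, and the message keeps only the hash. A standalone sketch of the idea (simplified md5_chunked; path and payload illustrative):

    import hashlib
    from pathlib import Path

    def md5_chunked(data: bytes, size: int = 1024) -> str:
        md5 = hashlib.md5()
        for i in range(0, len(data), size):  # hash in chunks to keep memory flat
            md5.update(data[i:i + size])
        return md5.hexdigest()

    attachments_dir = Path('attachments')
    attachments_dir.mkdir(exist_ok=True)
    filecontents = b'%PDF-1.4 example'
    filepath = attachments_dir / md5_chunked(filecontents)
    if not filepath.exists():  # identical content maps to the same name
        filepath.write_bytes(filecontents)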
@@ -129,23 +127,26 @@ class MailConnection:
         raw_email_clean = email_message.as_string()
         return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
 
-    def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
+    def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1):
         """
         Don't use multiple threads because most mail servers don't allow the client to multiplex.
         """
-        self.mail.select(f'"{folder}"')
-        result, data = self.mail.uid('search', None, search_criterion)
-        mail_ids = data[0]
-        id_list = mail_ids.split()
-        if not len(id_list):
-            # Empty folder
-            return
-        first_email_id = int(id_list[0])
-        latest_email_id = int(id_list[-1])
+        if not search_criterion:
+            search_criterion = ['ALL']
+        self.mail.select(normalize_for_imap_folder(folder))
+        for search_item in search_criterion:
+            result, data = self.mail.uid('search', search_item)
+            mail_ids = data[0]
+            id_list = mail_ids.split()
+            if not len(id_list):
+                # Empty folder; move on to the next search item.
+                continue
+            first_email_id = int(id_list[0])
+            latest_email_id = int(id_list[-1])
 
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
-            futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)}
-            for future in concurrent.futures.as_completed(futures):
-                result = future.result()
-                if result is not None:
-                    yield result
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
+                futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)}
+                for future in concurrent.futures.as_completed(futures):
+                    result = future.result()
+                    if result is not None:
+                        yield result
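A minimal usage sketch of the new signature, assuming a logged-in MailConnection called mail (folder and date illustrative):

    for msg in mail.fetch_folder('[Gmail]/All Mail', search_criterion=['(SINCE "01-Jan-2023")']):
        timestamp, to_email, from_email, subject, raw, attachments = msg
        print(timestamp, subject)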
run.py (47 lines changed)
@@ -33,40 +33,37 @@ def main(args):
     mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir)
     mail.load_folders()
 
-    logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...')
+    if config['server'] == 'imap.gmail.com':
+        mail.folder_structure.remove('INBOX')  # We will use '[Gmail]/All Mail' instead
+        mail.folder_structure.remove('[Gmail]')
+    num_folders_to_sync = len(mail.folder_structure)
+    logger.info(f'Syncing {num_folders_to_sync} folders...')
 
     new_emails = 0
     new_attachments = 0
     did_full_sync = False
     sync_start_time = datetime.now()
 
-    for parent_folder, subfolders in mail.folder_structure.items():
-        if parent_folder in exclude_folders:
+    for folder_name in mail.folder_structure:
+        if folder_name in exclude_folders:
             # Exclude folder
             continue
-        for folder in [parent_folder, *subfolders]:
-            folder_name = parent_folder + '/' + folder
-            if folder_name == f'{parent_folder}/{parent_folder}':
-                folder_name = parent_folder
-            if folder_name in exclude_folders:
-                # Exclude folder
-                continue
-            logger.info(folder_name)
-            last_refresh = database.have_we_done_a_full_sync_at_all()
-            if last_refresh:
-                date = unix_timestamp_since_to_imap_timestamp(last_refresh)
-                search_criterion = '(SINCE "' + date + '")'
-            else:
-                did_full_sync = True
-                search_criterion = 'ALL'
+        logger.info(folder_name)
+        last_refresh = database.have_we_done_a_full_sync_at_all()
+        if last_refresh:
+            date = unix_timestamp_since_to_imap_timestamp(last_refresh)
+            search_criterion = ['(SINCE "' + date + '")']
+        else:
+            did_full_sync = True
+            search_criterion = ['ALL']
 
         for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
             timestamp, to_email, from_email, subject, raw, attachments = email
             is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
             if is_new_email:
                 new_emails += 1
                 if len(attachments):
                     new_attachments += 1
 
     elapsed = datetime.now() - sync_start_time
     database.finish_sync('refresh' if not did_full_sync else 'full', new_emails, new_attachments, int(elapsed.total_seconds()))
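For reference, the SINCE criterion assembled here comes from unix_timestamp_since_to_imap_timestamp in the earlier hunk, which backs the date off by one day so refresh windows overlap the previous sync. A quick sketch of what ends up on the wire (timestamp illustrative):

    from datetime import datetime, timedelta

    last_refresh = 1700000000  # illustrative unix timestamp from the database
    date = (datetime.fromtimestamp(last_refresh) - timedelta(days=1)).strftime("%d-%b-%Y")
    search_criterion = ['(SINCE "' + date + '")']  # e.g. ['(SINCE "13-Nov-2023")']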