diff --git a/README.md b/README.md index c6fa08b..624bbca 100644 --- a/README.md +++ b/README.md @@ -20,4 +20,5 @@ A sample systemd service file is included. ## To Do -- [ ] Fix subject decoding. Some character sets aren't detected correctly. \ No newline at end of file +- [ ] Fix subject decoding. Some character sets aren't detected correctly. +- [ ] Sync Gmail categories as folders. \ No newline at end of file diff --git a/config.yml.sample b/config.yml.sample index 6c6a7d2..d2c1ae4 100644 --- a/config.yml.sample +++ b/config.yml.sample @@ -7,4 +7,8 @@ attachments_path: attachments exclude_folders: - Trash - - Drafts \ No newline at end of file + - Drafts + - '[Gmail]/Trash' + - '[Gmail]/Drafts' + - '[Gmail]/Starred' + - '[Gmail]/Important' \ No newline at end of file diff --git a/iarchiver/email.py b/iarchiver/email.py index ee85179..9c37364 100644 --- a/iarchiver/email.py +++ b/iarchiver/email.py @@ -10,3 +10,8 @@ def extract_emails(field: str): def unix_timestamp_since_to_imap_timestamp(unix_timestamp: Union[int, float]) -> str: return (datetime.fromtimestamp(unix_timestamp) - timedelta(days=1)).strftime("%d-%b-%Y") + + +def normalize_for_imap_folder(folder: str): + folder = folder.replace("'", "\'") + return f'"{folder}"' diff --git a/iarchiver/gmail.py b/iarchiver/gmail.py new file mode 100644 index 0000000..0d0ab62 --- /dev/null +++ b/iarchiver/gmail.py @@ -0,0 +1,10 @@ +# https://support.google.com/mail/answer/7190?hl=en +gmail_categories = [ + # 'category:primary', + 'category:social', + 'category:promotions', + 'category:updates', + 'category:forums', + 'category:reservations', + 'category:purchases' +] diff --git a/iarchiver/mail_conn.py b/iarchiver/mail_conn.py index 8bd4e1f..9f95682 100644 --- a/iarchiver/mail_conn.py +++ b/iarchiver/mail_conn.py @@ -9,10 +9,11 @@ from email.header import decode_header from email.utils import parsedate_to_datetime from json import JSONEncoder from pathlib import Path +from typing import List import chardet -from iarchiver.email import extract_emails +from iarchiver.email import extract_emails, normalize_for_imap_folder def md5_chunked(data: bytes, size: int = 1024): @@ -48,14 +49,7 @@ class MailConnection: self.logger.setLevel(logging.INFO) def load_folders(self): - folders = [tuple(f.decode().split(' "/" ')[1].replace('"', '').split('/')) for f in self.mail.list()[1]] - folder_structure = {} - for f in folders: - if not folder_structure.get(f[0]): - folder_structure[f[0]] = [] - if len(f) > 1: - folder_structure[f[0]].append(f[1]) - self.folder_structure = folder_structure + self.folder_structure = [f.decode().split(' "/" ')[1].replace('"', '').replace("\\'", "'") for f in self.mail.list()[1]] return self.folder_structure def __fetch_email(self, i): @@ -74,6 +68,8 @@ class MailConnection: raw_email = raw_email_bytes.decode(encoding, errors='replace') email_message = email.message_from_string(raw_email) date_header = email_message['Date'] + if not date_header: + date_header = 0 date = parsedate_to_datetime(date_header) unix_timestamp = int(time.mktime(date.timetuple())) @@ -118,6 +114,8 @@ class MailConnection: if filename: # The filename of the file is the hash of its content, which should de-duplicate files. filecontents = part.get_payload(decode=True) + if not filecontents: + continue filehash = md5_chunked(filecontents) part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash filepath = self.attachments_dir / filehash @@ -129,23 +127,26 @@ class MailConnection: raw_email_clean = email_message.as_string() return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments - def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1): + def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1): """ Don't use multiple threads because most mail servers don't allow the client to multiplex. """ - self.mail.select(f'"{folder}"') - result, data = self.mail.uid('search', None, search_criterion) - mail_ids = data[0] - id_list = mail_ids.split() - if not len(id_list): - # Empty folder - return - first_email_id = int(id_list[0]) - latest_email_id = int(id_list[-1]) + if not search_criterion: + search_criterion = ['ALL'] + self.mail.select(normalize_for_imap_folder(folder)) + for search_item in search_criterion: + result, data = self.mail.uid('search', search_item) + mail_ids = data[0] + id_list = mail_ids.split() + if not len(id_list): + # Empty folder + return + first_email_id = int(id_list[0]) + latest_email_id = int(id_list[-1]) - with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: - futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)} - for future in concurrent.futures.as_completed(futures): - result = future.result() - if result is not None: - yield result + with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor: + futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)} + for future in concurrent.futures.as_completed(futures): + result = future.result() + if result is not None: + yield result diff --git a/run.py b/run.py index 682c93d..ec2879a 100755 --- a/run.py +++ b/run.py @@ -33,40 +33,37 @@ def main(args): mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir) mail.load_folders() - logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...') + if config['server'] == 'imap.gmail.com': + mail.folder_structure.remove('INBOX') # We will use "'[Gmail]/All Mail'" instead + mail.folder_structure.remove('[Gmail]') + num_folders_to_sync = len(mail.folder_structure) + logger.info(f'Syncing {num_folders_to_sync} folders...') new_emails = 0 new_attachments = 0 did_full_sync = False sync_start_time = datetime.now() - for parent_folder, subfolders in mail.folder_structure.items(): - if parent_folder in exclude_folders: + for folder_name in mail.folder_structure: + if folder_name in exclude_folders: # Exclude folder continue - for folder in [parent_folder, *subfolders]: - folder_name = parent_folder + '/' + folder - if folder_name == f'{parent_folder}/{parent_folder}': - folder_name = parent_folder - if folder_name in exclude_folders: - # Exclude folder - continue - logger.info(folder_name) - last_refresh = database.have_we_done_a_full_sync_at_all() - if last_refresh: - date = unix_timestamp_since_to_imap_timestamp(last_refresh) - search_criterion = '(SINCE "' + date + '")' - else: - did_full_sync = True - search_criterion = 'ALL' + logger.info(folder_name) + last_refresh = database.have_we_done_a_full_sync_at_all() + if last_refresh: + date = unix_timestamp_since_to_imap_timestamp(last_refresh) + search_criterion = ['(SINCE "' + date + '")'] + else: + did_full_sync = True + search_criterion = ['ALL'] - for email in mail.fetch_folder(folder_name, search_criterion=search_criterion): - timestamp, to_email, from_email, subject, raw, attachments = email - is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments) - if is_new_email: - new_emails += 1 - if len(attachments): - new_attachments += 1 + for email in mail.fetch_folder(folder_name, search_criterion=search_criterion): + timestamp, to_email, from_email, subject, raw, attachments = email + is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments) + if is_new_email: + new_emails += 1 + if len(attachments): + new_attachments += 1 elapsed = datetime.now() - sync_start_time database.finish_sync('refresh' if not did_full_sync else 'full', new_emails, new_attachments, int(elapsed.total_seconds()))