Remove parent/child folder layout, make it work with Gmail, fix issues
This commit is contained in:
parent
b965538688
commit
184a9a78e1
|
@ -21,3 +21,4 @@ A sample systemd service file is included.
|
|||
## To Do
|
||||
|
||||
- [ ] Fix subject decoding. Some character sets aren't detected correctly.
|
||||
- [ ] Sync Gmail categories as folders.
|
|
@ -8,3 +8,7 @@ attachments_path: attachments
|
|||
exclude_folders:
|
||||
- Trash
|
||||
- Drafts
|
||||
- '[Gmail]/Trash'
|
||||
- '[Gmail]/Drafts'
|
||||
- '[Gmail]/Starred'
|
||||
- '[Gmail]/Important'
|
|
@ -10,3 +10,8 @@ def extract_emails(field: str):
|
|||
|
||||
def unix_timestamp_since_to_imap_timestamp(unix_timestamp: Union[int, float]) -> str:
    """Format a Unix timestamp as an IMAP date, shifted one day into the past.

    The timestamp is interpreted in the machine's local time zone (that is
    what ``datetime.fromtimestamp`` does without a ``tz`` argument), one day
    is subtracted, and the result is rendered as ``DD-Mon-YYYY``.

    :param unix_timestamp: seconds since the Unix epoch.
    :return: the date string, e.g. ``"13-Nov-2023"``.
    """
    # IMAP month tokens are fixed English abbreviations. ``strftime("%b")``
    # follows the process locale and could emit e.g. "Mär" or "déc", so the
    # month is looked up in a fixed table instead.
    imap_months = (
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
    )
    # NOTE(review): the one-day step back is presumably a safety margin so a
    # SINCE search does not miss mail around the day boundary -- confirm.
    since = datetime.fromtimestamp(unix_timestamp) - timedelta(days=1)
    return f'{since.day:02d}-{imap_months[since.month - 1]}-{since.year:04d}'
|
||||
|
||||
|
||||
def normalize_for_imap_folder(folder: str) -> str:
    """Wrap a mailbox name in double quotes for use as an IMAP command argument.

    The previous body called ``folder.replace("'", "\'")``; in a normal
    (non-raw) string ``"\'"`` is just ``'``, so that call changed nothing.
    A single quote needs no escaping inside an IMAP quoted string, so it is
    left alone. An embedded double quote, however, would end the quoted
    string early, so it is escaped as ``\"``.

    :param folder: the mailbox name.
    :return: the name surrounded by double quotes.
    """
    # NOTE(review): backslashes are passed through unchanged, exactly as
    # before. Confirm whether callers hand over raw names or names still in
    # wire form before also escaping backslashes here.
    escaped = folder.replace('"', '\\"')
    return f'"{escaped}"'
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
# https://support.google.com/mail/answer/7190?hl=en
|
||||
# Gmail ``category:`` search terms, one per inbox category.
# 'primary' is intentionally left out (it was commented out in the original list).
gmail_categories = [
    f'category:{name}'
    for name in (
        'social',
        'promotions',
        'updates',
        'forums',
        'reservations',
        'purchases',
    )
]
|
|
@ -9,10 +9,11 @@ from email.header import decode_header
|
|||
from email.utils import parsedate_to_datetime
|
||||
from json import JSONEncoder
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import chardet
|
||||
|
||||
from iarchiver.email import extract_emails
|
||||
from iarchiver.email import extract_emails, normalize_for_imap_folder
|
||||
|
||||
|
||||
def md5_chunked(data: bytes, size: int = 1024):
|
||||
|
@ -48,14 +49,7 @@ class MailConnection:
|
|||
self.logger.setLevel(logging.INFO)
|
||||
|
||||
def load_folders(self):
    """Fetch the server's mailbox list and store it as a flat list of names.

    Each LIST response line has the form ``(<flags>) "<delimiter>" <name>``.
    The name is taken from after the delimiter field, its double quotes are
    removed and any ``\\'`` sequence is turned back into ``'``.

    The result is stored on ``self.folder_structure`` and also returned.

    :return: list of mailbox names, e.g. ``['INBOX', '[Gmail]/All Mail']``.
    :raises IndexError: if a response line has no delimiter field.
    """
    import re  # local import so this method does not depend on the file's import block

    # The hierarchy delimiter is a quoted string (or NIL). Matching it
    # generically keeps servers that use "." or another delimiter working,
    # instead of only those that answer with "/".
    delimiter_field = re.compile(r' (?:"(?:\\.|[^"\\])*"|NIL) ')

    names = []
    for entry in self.mail.list()[1]:
        if entry is None:
            # imaplib reports an empty listing as [None]; nothing to parse.
            continue
        raw_name = delimiter_field.split(entry.decode(), maxsplit=1)[1]
        names.append(raw_name.replace('"', '').replace("\\'", "'"))

    self.folder_structure = names
    return self.folder_structure
|
||||
|
||||
def __fetch_email(self, i):
|
||||
|
@ -74,6 +68,8 @@ class MailConnection:
|
|||
raw_email = raw_email_bytes.decode(encoding, errors='replace')
|
||||
email_message = email.message_from_string(raw_email)
|
||||
date_header = email_message['Date']
|
||||
if not date_header:
|
||||
date_header = 0
|
||||
date = parsedate_to_datetime(date_header)
|
||||
unix_timestamp = int(time.mktime(date.timetuple()))
|
||||
|
||||
|
@ -118,6 +114,8 @@ class MailConnection:
|
|||
if filename:
|
||||
# The filename of the file is the hash of its content, which should de-duplicate files.
|
||||
filecontents = part.get_payload(decode=True)
|
||||
if not filecontents:
|
||||
continue
|
||||
filehash = md5_chunked(filecontents)
|
||||
part.set_payload(f'MD5:{filehash}') # replace the attachment with its hash
|
||||
filepath = self.attachments_dir / filehash
|
||||
|
@ -129,12 +127,15 @@ class MailConnection:
|
|||
raw_email_clean = email_message.as_string()
|
||||
return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
|
||||
|
||||
def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
|
||||
def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1):
|
||||
"""
|
||||
Don't use multiple threads because most mail servers don't allow the client to multiplex.
|
||||
"""
|
||||
self.mail.select(f'"{folder}"')
|
||||
result, data = self.mail.uid('search', None, search_criterion)
|
||||
if not search_criterion:
|
||||
search_criterion = ['ALL']
|
||||
self.mail.select(normalize_for_imap_folder(folder))
|
||||
for search_item in search_criterion:
|
||||
result, data = self.mail.uid('search', search_item)
|
||||
mail_ids = data[0]
|
||||
id_list = mail_ids.split()
|
||||
if not len(id_list):
|
||||
|
|
19
run.py
19
run.py
|
@ -33,21 +33,18 @@ def main(args):
|
|||
mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir)
|
||||
mail.load_folders()
|
||||
|
||||
logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...')
|
||||
if config['server'] == 'imap.gmail.com':
|
||||
mail.folder_structure.remove('INBOX') # We will use "'[Gmail]/All Mail'" instead
|
||||
mail.folder_structure.remove('[Gmail]')
|
||||
num_folders_to_sync = len(mail.folder_structure)
|
||||
logger.info(f'Syncing {num_folders_to_sync} folders...')
|
||||
|
||||
new_emails = 0
|
||||
new_attachments = 0
|
||||
did_full_sync = False
|
||||
sync_start_time = datetime.now()
|
||||
|
||||
for parent_folder, subfolders in mail.folder_structure.items():
|
||||
if parent_folder in exclude_folders:
|
||||
# Exclude folder
|
||||
continue
|
||||
for folder in [parent_folder, *subfolders]:
|
||||
folder_name = parent_folder + '/' + folder
|
||||
if folder_name == f'{parent_folder}/{parent_folder}':
|
||||
folder_name = parent_folder
|
||||
for folder_name in mail.folder_structure:
|
||||
if folder_name in exclude_folders:
|
||||
# Exclude folder
|
||||
continue
|
||||
|
@ -55,10 +52,10 @@ def main(args):
|
|||
last_refresh = database.have_we_done_a_full_sync_at_all()
|
||||
if last_refresh:
|
||||
date = unix_timestamp_since_to_imap_timestamp(last_refresh)
|
||||
search_criterion = '(SINCE "' + date + '")'
|
||||
search_criterion = ['(SINCE "' + date + '")']
|
||||
else:
|
||||
did_full_sync = True
|
||||
search_criterion = 'ALL'
|
||||
search_criterion = ['ALL']
|
||||
|
||||
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
|
||||
timestamp, to_email, from_email, subject, raw, attachments = email
|
||||
|
|
Loading…
Reference in New Issue