diff --git a/.gitignore b/.gitignore index 5d381cc..73bee22 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +.idea +attachments/ +config.yml +emails.db +emails.db-journal + # ---> Python # Byte-compiled / optimized / DLL files __pycache__/ @@ -159,4 +165,3 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ - diff --git a/README.md b/README.md index 6f4a36b..3fd5724 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,23 @@ # imap-archiver -Archive the content on your email account. \ No newline at end of file +_Archive the content on your email account._ + +A very simple way to sync your email account with the goal of backing up all your emails. + +Saves attachments to disk and stores the emails in an SQLite database. + +There is no viewer yet, but the database is organized so that one can be built on top of it. Only a single email account is supported. + +## Install + +1. Create a venv +2. `pip install -r requirements.txt` +3. `cp config.yml.sample config.yml` +4. Edit `config.yml` and configure your login info. +5. `python3 run.py` + +A systemd service is included. + +## To Do + +- [ ] Fix subject decoding. Some character sets aren't detected correctly. 
def is_valid_table_name(table_name):
    """Return True if *table_name* is a plain ASCII identifier.

    The name must start with a letter or underscore and contain only
    letters, digits and underscores.
    """
    return re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', table_name) is not None


def sanitize_table_name(name):
    """Turn an IMAP folder path into a string usable as an SQLite table name.

    ``/`` is replaced by ``_``, every other non-word character is dropped and
    a leading digit is prefixed with ``_``. The result is empty when *name*
    contains no word characters at all.
    """
    sanitized_name = re.sub(r'\W+', '', name.replace('/', '_'))
    # If the first character is a digit, prepend an underscore
    if sanitized_name and sanitized_name[0].isdigit():
        sanitized_name = '_' + sanitized_name
    return sanitized_name


class EmailDatabase:
    """SQLite store for archived emails.

    Every mail folder is stored in its own table. ``folders_mapping`` records
    which folder name maps to which table and ``syncs`` records the time and
    type of every finished sync.
    """

    __restricted_strings = ['folders_mapping', 'syncs']

    def __init__(self, filepath: Path):
        """Open (or create) the database at *filepath* and create the system tables."""
        filepath = filepath.expanduser().absolute().resolve()
        self.conn = sqlite3.connect(filepath)
        cursor = self.conn.cursor()
        try:
            cursor.execute('CREATE TABLE IF NOT EXISTS folders_mapping (name TEXT UNIQUE, table_name TEXT UNIQUE)')
            cursor.execute('CREATE TABLE IF NOT EXISTS syncs (timestamp INTEGER UNIQUE, type TEXT)')
            self.conn.commit()
        finally:
            cursor.close()

    def __create_table(self, table_name: str):
        """Create the table backing folder *table_name* if it does not exist yet.

        Returns the sanitized table name.

        Raises:
            ValueError: if the sanitized name is empty, is reserved by SQLite
                or collides with one of the system tables.
        """
        sanitized_table_name = sanitize_table_name(table_name)
        if not sanitized_table_name:
            raise ValueError(f'Invalid table name, nothing left after sanitizing: {table_name}')
        # SQLite compares table names case-insensitively, so compare lowercased.
        lowered = sanitized_table_name.lower()
        if lowered in self.__restricted_strings or lowered.startswith('sqlite_'):
            raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
        cursor = self.conn.cursor()
        try:
            # The identifier is quoted so that folders named like SQL keywords
            # ("Order", "Group", ...) work. The sanitized name consists of word
            # characters only, so it cannot contain a double quote.
            cursor.execute(f'CREATE TABLE IF NOT EXISTS "{sanitized_table_name}" (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
            cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
            self.conn.commit()
        finally:
            cursor.close()
        return sanitized_table_name

    def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: 'List[FileAttachment]'):
        """Store one email in the table of *folder* unless it is already there.

        An email counts as already stored when a row with the same
        ``timestamp`` and ``raw`` text exists. *attachments* is serialized to
        JSON with ``FileAttachmentEncoder``.

        Returns:
            True if a new row was inserted, False if the email already existed.
        """
        sanitized_table_name = self.__create_table(folder)
        cursor = self.conn.cursor()
        try:
            # Existence check only: do not pull the whole raw message back out.
            cursor.execute(f'SELECT 1 FROM "{sanitized_table_name}" WHERE timestamp = ? AND raw = ? LIMIT 1', (timestamp, raw))
            if cursor.fetchone() is not None:
                return False
            stmt = f'INSERT INTO "{sanitized_table_name}" (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)'
            cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
            self.conn.commit()
            return True
        finally:
            cursor.close()

    def finish_sync(self, sync_type: str):
        """Record that a sync of type *sync_type* finished now.

        Returns the recorded Unix timestamp (whole seconds).
        """
        now = int(time.time())
        cursor = self.conn.cursor()
        try:
            # ``timestamp`` is UNIQUE; OR REPLACE keeps two syncs finishing in
            # the same second from raising IntegrityError.
            cursor.execute('INSERT OR REPLACE INTO syncs (timestamp, type) VALUES (?, ?)', (now, sync_type))
            self.conn.commit()
        finally:
            cursor.close()
        return now

    def have_we_done_a_full_sync_at_all(self):
        """Return the timestamp of the most recent recorded sync.

        Returns None when no sync has been recorded yet. The caller turns the
        value into an IMAP ``SINCE`` date, so it has to be the newest sync,
        not the oldest one.
        """
        cursor = self.conn.cursor()
        try:
            cursor.execute('SELECT timestamp FROM syncs ORDER BY timestamp DESC LIMIT 1')
            row = cursor.fetchone()
        finally:
            cursor.close()
        return row[0] if row is not None else None
def md5_chunked(data: bytes, size: int = 1024):
    """Return the hex MD5 digest of *data*, hashing *size* bytes at a time."""
    m = hashlib.md5()
    for i in range(0, len(data), size):
        m.update(data[i:i + size])
    return m.hexdigest()


class FileAttachment:
    """Original file name of an attachment plus the hash of its content."""

    def __init__(self, file_name: str, file_hash: str):
        self.filename = file_name
        self.hash = file_hash

    def to_dict(self):
        """Return the attachment as a JSON-serializable dict."""
        return {'filename': self.filename, 'hash': self.hash}


class FileAttachmentEncoder(JSONEncoder):
    """JSON encoder that serializes FileAttachment objects via to_dict()."""

    def default(self, o):
        if isinstance(o, FileAttachment):
            return o.to_dict()
        return super().default(o)


class MailConnection:
    """Logged-in IMAP session used to list folders and download messages."""

    def __init__(self, host: str, username: str, password: str, attachments_dir: Path):
        """Connect to *host* over SSL and log in.

        Attachments of fetched messages are written to *attachments_dir*.
        """
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(username, password)
        self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
        self.folder_structure = {}

    def load_folders(self):
        """Read the folder list from the server.

        Stores and returns a dict mapping every top-level folder to the list
        of folders below it. Entries are paths relative to the top-level
        folder (``'2023'``, ``'2023/Jan'``), so folders nested more than one
        level deep are kept. Parsing assumes the server reports ``/`` as its
        hierarchy delimiter.
        """
        folder_structure = {}
        for line in self.mail.list()[1]:
            path = line.decode().split(' "/" ')[1].replace('"', '').split('/')
            subfolders = folder_structure.setdefault(path[0], [])
            if len(path) > 1:
                relative = '/'.join(path[1:])
                if relative not in subfolders:
                    subfolders.append(relative)
        self.folder_structure = folder_structure
        return self.folder_structure

    @staticmethod
    def _decode_subject(subject_header):
        """Decode a raw Subject header (RFC 2047 encoded words) to text.

        Every part of the header is decoded with the charset it declares.
        chardet is only consulted when that charset is missing, unknown or
        does not fit the bytes. Returns '' for a missing or empty header.
        """
        if not subject_header:
            return ''
        try:
            chunks = decode_header(str(subject_header))
        except email.errors.HeaderParseError:
            # Malformed encoded word: keep the header as it was sent.
            return str(subject_header)
        pieces = []
        for value, charset in chunks:
            if isinstance(value, bytes):
                try:
                    value = value.decode(charset or 'utf-8')
                except (LookupError, UnicodeDecodeError):
                    guessed = chardet.detect(value)['encoding'] or 'utf-8'
                    try:
                        value = value.decode(guessed, errors='replace')
                    except LookupError:
                        value = value.decode('utf-8', errors='replace')
            pieces.append(value)
        return ''.join(pieces)

    def __fetch_email(self, i):
        """Download the message with UID *i* from the selected folder.

        Attachments are written to ``attachments_dir`` under the MD5 hash of
        their content and replaced by that hash inside the message.

        Returns:
            ``(unix_timestamp, to, from, subject, raw_message, attachments)``
            or None when the server returned no data for the UID.
            The timestamp is 0 when the Date header is missing or unparsable.
            A missing subject is returned as ''.
        """
        # BODY.PEEK[] instead of BODY[]: archiving must not mark mail as read.
        result, data = self.mail.uid('fetch', str(i), '(BODY.PEEK[])')  # fetch the raw email
        if not data or data[0] is None:
            return
        raw_email_bytes = data[0][1]
        encoding = chardet.detect(raw_email_bytes)['encoding'] or 'utf-8'
        try:
            raw_email = raw_email_bytes.decode(encoding, errors='replace')
        except LookupError:
            # chardet named a codec Python does not know.
            raw_email = raw_email_bytes.decode('utf-8', errors='replace')
        email_message = email.message_from_string(raw_email)

        try:
            # datetime.timestamp() honours the UTC offset of the Date header.
            unix_timestamp = int(parsedate_to_datetime(email_message['Date']).timestamp())
        except (TypeError, ValueError, OverflowError, OSError):
            unix_timestamp = 0

        # Headers may be absent (e.g. Bcc-only mail has no To).
        from_header = ', '.join(extract_emails(str(email_message['From'] or '')))
        raw_to = str(email_message['To'] or '')
        to_header = ', '.join(extract_emails(raw_to))
        if '@' not in to_header:
            to_header = raw_to

        subject = self._decode_subject(email_message['Subject'])

        attachments = []
        if email_message.is_multipart():
            for part in email_message.walk():
                if 'attachment' not in str(part.get('Content-Disposition')):
                    continue
                filename = part.get_filename()
                if not filename:
                    continue
                filecontents = part.get_payload(decode=True)
                if filecontents is None:
                    # Multipart container (e.g. an attached message): no bytes to store.
                    continue
                # The filename of the file is the hash of its content, which should de-duplicate files.
                filehash = md5_chunked(filecontents)
                part.set_payload(filehash)  # replace the attachment with its hash
                filepath = self.attachments_dir / filehash
                if not filepath.is_file():
                    with open(filepath, 'wb') as f:
                        f.write(filecontents)
                attachments.append(FileAttachment(filename, filehash))
        raw_email_clean = email_message.as_string()
        return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments

    def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
        """
        Yield every message of *folder* that matches *search_criterion*.

        Exactly the UIDs returned by the search are fetched, newest first.
        Each yielded item is the tuple produced for one message.

        Don't use multiple threads because most mail servers don't allow the client to multiplex.
        """
        self.mail.select(f'"{folder}"')
        result, data = self.mail.uid('search', None, search_criterion)
        id_list = (data[0] or b'').split() if data else []
        if not id_list:
            # Empty folder
            return

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            futures = [executor.submit(self.__fetch_email, int(uid)) for uid in reversed(id_list)]
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                if result is not None:
                    yield result
def _folder_names(folder_structure, exclude_folders):
    """Yield the full path of every folder to sync, skipping excluded ones.

    Excluding a top-level folder also excludes everything below it.
    """
    for parent_folder, subfolders in folder_structure.items():
        if parent_folder in exclude_folders:
            continue
        yield parent_folder
        for subfolder in subfolders:
            folder_name = f'{parent_folder}/{subfolder}'
            if folder_name not in exclude_folders:
                yield folder_name


def main(args):
    """Sync every non-excluded folder of the configured account into the archive.

    Reads the YAML config at ``args.config``, downloads either everything
    (first run) or the messages since the last recorded sync, stores them in
    the database and records the finished sync. Exits with status 1 when a
    required config value is missing.
    """
    logging.basicConfig()
    logger = logging.getLogger('iarchiver')
    logger.setLevel(logging.INFO)
    with open(args.config) as f:
        config = yaml.safe_load(f)

    # An empty file loads as None; treat anything but a mapping as a bad config.
    required_keys = ('server', 'username', 'password', 'database_path', 'attachments_path')
    if not isinstance(config, dict) or not all(config.get(key) for key in required_keys):
        logger.critical('Bad config file.')
        sys.exit(1)

    # "exclude_folders:" with no entries loads as None.
    exclude_folders = config.get('exclude_folders') or []

    attachments_dir = Path(config['attachments_path'])
    attachments_dir.mkdir(parents=True, exist_ok=True)

    database = EmailDatabase(Path(config['database_path']))
    mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir)
    mail.load_folders()

    logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...')

    new_emails = 0
    sync_start_time = datetime.now()

    # The sync table only changes when this run finishes, so ask once.
    last_refresh = database.have_we_done_a_full_sync_at_all()
    did_full_sync = not last_refresh
    if did_full_sync:
        search_criterion = 'ALL'
    else:
        date = unix_timestamp_since_to_imap_timestamp(last_refresh)
        search_criterion = '(SINCE "' + date + '")'

    for folder_name in _folder_names(mail.folder_structure, exclude_folders):
        logger.info(folder_name)
        for message in mail.fetch_folder(folder_name, search_criterion=search_criterion):
            timestamp, to_email, from_email, subject, raw, attachments = message
            if database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments):
                new_emails += 1

    database.finish_sync('refresh' if not did_full_sync else 'full')

    elapsed = datetime.now() - sync_start_time
    logger.info(f'Finished email {"refresh" if not did_full_sync else "sync"} in {humanize.naturaldelta(elapsed)} and added {new_emails} new emails.')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Sync and archive your IMAP server.')
    parser.add_argument('--config', default='config.yml', help='Path to config file.')
    args = parser.parse_args()
    main(args)