Cyberes 2024-03-05 23:57:38 -07:00
parent eff0c832fe
commit 878ec708e1
10 changed files with 362 additions and 2 deletions

7
.gitignore vendored

@@ -1,3 +1,9 @@
.idea
attachments/
config.yml
emails.db
emails.db-journal
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -159,4 +165,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

22
README.md

@@ -1,3 +1,23 @@
# imap-archiver
_Archive the content on your email account._
A very simple way to sync your email account and back up all your emails.
Attachments are saved to disk and emails are stored in an SQLite database.
There is no viewer yet, but the database is organized so one can be built (see the query sketch below). Only a single account is supported.
## Install
1. Create a venv
2. `pip install -r requirements.txt`
3. `cp config.yml.sample config.yml`
4. Edit `config.yml` and configure your login info.
5. `python3 run.py`
A systemd service is included.
## To Do
- [ ] Fix subject decoding. Some character sets aren't detected correctly.
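There is no viewer yet, but the schema created by `iarchiver/database.py` (below) is straightforward to query by hand. A minimal sketch, assuming the default `emails.db` path from `config.yml.sample`:

```python
# List each archived folder and how many emails it holds.
# Assumes the tables created by iarchiver/database.py and a database_path of
# emails.db; adjust the path to match your config.yml.
import sqlite3

conn = sqlite3.connect('emails.db')
cur = conn.cursor()
cur.execute('SELECT name, table_name FROM folders_mapping')
for name, table_name in cur.fetchall():
    count = conn.execute(f'SELECT COUNT(*) FROM {table_name}').fetchone()[0]
    print(f'{name}: {count} emails')
conn.close()
```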

10
config.yml.sample Normal file

@@ -0,0 +1,10 @@
server: imap.example.com
username: bob@example.com
password: password123
database_path: emails.db
attachments_path: attachments
exclude_folders:
- Trash
- Drafts

0
iarchiver/__init__.py Normal file

83
iarchiver/database.py Normal file

@@ -0,0 +1,83 @@
import json
import re
import sqlite3
import time
from pathlib import Path
from typing import List
from iarchiver.mail import FileAttachment, FileAttachmentEncoder
def is_valid_table_name(table_name):
return re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', table_name) is not None
def sanitize_table_name(name):
name = name.replace('/', '_')
# Remove any non-alphanumeric characters
sanitized_name = re.sub(r'\W+', '', name)
# If the first character is a digit, prepend an underscore
if sanitized_name and sanitized_name[0].isdigit():
sanitized_name = '_' + sanitized_name
return sanitized_name
class EmailDatabase:
__restricted_strings = ['folders_mapping', 'syncs']
def __init__(self, filepath: Path):
filepath = filepath.expanduser().absolute().resolve()
self.conn = sqlite3.connect(filepath)
cursor = self.conn.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS folders_mapping (name TEXT UNIQUE, table_name TEXT UNIQUE)')
        cursor.execute('CREATE TABLE IF NOT EXISTS syncs (timestamp INTEGER UNIQUE, type TEXT)')
self.conn.commit()
cursor.close()
def __create_table(self, table_name: str):
sanitized_table_name = sanitize_table_name(table_name)
if sanitized_table_name in self.__restricted_strings:
raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
cursor = self.conn.cursor()
cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
self.conn.commit()
cursor.close()
def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
sanitized_table_name = sanitize_table_name(folder)
self.__create_table(folder)
cursor = self.conn.cursor()
# Check if record already exists
stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw = ?"
cursor.execute(stmt_check, (timestamp, raw))
data = cursor.fetchone()
# If record does not exist, insert it
new_email = False
if data is None:
stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)"
cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
self.conn.commit()
new_email = True
cursor.close()
return new_email
def finish_sync(self, sync_type: str):
now = int(time.time())
cursor = self.conn.cursor()
cursor.execute('INSERT INTO syncs (timestamp, type) VALUES (?, ?)', (now, sync_type))
self.conn.commit()
cursor.close()
return now
def have_we_done_a_full_sync_at_all(self):
cursor = self.conn.cursor()
cursor.execute("SELECT * FROM syncs ORDER BY timestamp LIMIT 1")
row = cursor.fetchone()
cursor.close()
if row is not None:
return row[0]
else:
return None
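A minimal usage sketch of `EmailDatabase` as `run.py` drives it; the folder name, timestamp, and addresses below are placeholder values, not output from a real mailbox:

```python
from pathlib import Path

from iarchiver.database import EmailDatabase

db = EmailDatabase(Path('emails.db'))

# insert_email() creates the per-folder table on demand and returns False
# when an identical (timestamp, raw) pair is already stored.
is_new = db.insert_email('INBOX', 1709500000, 'Hello', 'raw RFC 822 text here',
                         'bob@example.com', 'alice@example.com', attachments=[])
print(is_new)

# Record the completed sync; have_we_done_a_full_sync_at_all() returns the
# timestamp of the earliest recorded sync, or None if no sync has finished yet.
db.finish_sync('full')
print(db.have_we_done_a_full_sync_at_all())
```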

12
iarchiver/email.py Normal file

@@ -0,0 +1,12 @@
import re
from datetime import datetime, timedelta
from typing import Union
def extract_emails(field: str):
matches = re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', field.lower())
return tuple({str(x) for x in matches})
def unix_timestamp_since_to_imap_timestamp(unix_timestamp: Union[int, float]) -> str:
return (datetime.fromtimestamp(unix_timestamp) - timedelta(days=1)).strftime("%d-%b-%Y")
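The two helpers in `iarchiver/email.py` can be exercised directly; a quick sketch (the exact date printed depends on the local timezone):

```python
from iarchiver.email import extract_emails, unix_timestamp_since_to_imap_timestamp

# Pull bare addresses out of a header with display names; the result is built
# from a set, so the order is not guaranteed.
print(extract_emails('Alice Example <Alice@example.com>, bob@example.com'))
# e.g. ('alice@example.com', 'bob@example.com')

# Convert a Unix timestamp to the day-before date string used for IMAP SINCE
# searches, e.g. '02-Mar-2024'.
print(unix_timestamp_since_to_imap_timestamp(1709500000))
```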

132
iarchiver/mail.py Normal file

@@ -0,0 +1,132 @@
import concurrent.futures
import email
import hashlib
import imaplib
import time
from email.header import decode_header
from email.utils import parsedate_to_datetime
from json import JSONEncoder
from pathlib import Path
import chardet
from iarchiver.email import extract_emails
def md5_chunked(data: bytes, size: int = 1024):
m = hashlib.md5()
for i in range(0, len(data), size):
m.update(data[i:i + size])
return m.hexdigest()
class FileAttachment:
def __init__(self, file_name: str, file_hash: str):
self.filename = file_name
self.hash = file_hash
def to_dict(self):
return {'filename': self.filename, 'hash': self.hash}
class FileAttachmentEncoder(JSONEncoder):
def default(self, o):
if isinstance(o, FileAttachment):
return o.to_dict()
return super().default(o)
class MailConnection:
def __init__(self, host: str, username: str, password: str, attachments_dir: Path):
self.mail = imaplib.IMAP4_SSL(host)
self.mail.login(username, password)
self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
self.folder_structure = {}
def load_folders(self):
folders = [tuple(f.decode().split(' "/" ')[1].replace('"', '').split('/')) for f in self.mail.list()[1]]
folder_structure = {}
for f in folders:
if not folder_structure.get(f[0]):
folder_structure[f[0]] = []
if len(f) > 1:
folder_structure[f[0]].append(f[1])
self.folder_structure = folder_structure
return self.folder_structure
def __fetch_email(self, i):
result, data = self.mail.uid('fetch', str(i), '(BODY[])') # fetch the raw email
if data[0] is None:
return
raw_email_bytes = data[0][1]
detected = chardet.detect(raw_email_bytes)
encoding = detected['encoding']
if not encoding:
encoding = 'utf-8'
raw_email = raw_email_bytes.decode(encoding, errors='replace')
email_message = email.message_from_string(raw_email)
date_header = email_message['Date']
date = parsedate_to_datetime(date_header)
unix_timestamp = int(time.mktime(date.timetuple()))
        from_header = ', '.join(extract_emails(email_message.get('From') or ''))
        to_header = ', '.join(extract_emails(email_message.get('To') or ''))
        if '@' not in to_header:
            # Fall back to the raw header when no bare address could be extracted.
            to_header = email_message['To']
subject_header = email_message['Subject']
if subject_header:
subject = decode_header(subject_header)[0][0]
if isinstance(subject, bytes):
try:
detected = chardet.detect(subject)
encoding = detected['encoding']
if not encoding:
encoding = 'utf-8'
subject = subject.decode(encoding, errors='replace')
except UnicodeDecodeError:
subject = subject.decode('utf-8')
else:
return
attachments = []
if email_message.is_multipart():
for part in email_message.walk():
content_type = part.get_content_type()
content_disposition = str(part.get("Content-Disposition"))
if "attachment" in content_disposition:
filename = part.get_filename()
if filename:
# The filename of the file is the hash of its content, which should de-duplicate files.
filecontents = part.get_payload(decode=True)
filehash = md5_chunked(filecontents)
part.set_payload(filehash) # replace the attachment with its hash
filepath = self.attachments_dir / filehash
file_obj = FileAttachment(filename, filehash)
if not filepath.is_file():
with open(filepath, 'wb') as f:
f.write(filecontents)
attachments.append(file_obj)
raw_email_clean = email_message.as_string()
return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
"""
Don't use multiple threads because most mail servers don't allow the client to multiplex.
"""
self.mail.select(f'"{folder}"')
result, data = self.mail.uid('search', None, search_criterion)
mail_ids = data[0]
id_list = mail_ids.split()
if not len(id_list):
# Empty folder
return
first_email_id = int(id_list[0])
latest_email_id = int(id_list[-1])
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            # Walk the IDs from newest to oldest, including the first (oldest) one.
            futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id - 1, -1)}
for future in concurrent.futures.as_completed(futures):
result = future.result()
if result is not None:
yield result
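A minimal usage sketch of `MailConnection`; the server and credentials are the placeholders from `config.yml.sample`, and the attachments directory must exist before fetching (as `run.py` ensures):

```python
from pathlib import Path

from iarchiver.mail import MailConnection

attachments_dir = Path('attachments')
attachments_dir.mkdir(parents=True, exist_ok=True)

mail = MailConnection('imap.example.com', 'bob@example.com', 'password123', attachments_dir)
print(mail.load_folders())  # e.g. {'INBOX': [], 'Archive': ['2023', '2024']}

# fetch_folder() is a generator; each item is the tuple that
# EmailDatabase.insert_email() expects (in a slightly different argument order).
for timestamp, to_email, from_email, subject, raw, attachments in mail.fetch_folder('INBOX'):
    print(timestamp, subject, [a.filename for a in attachments])
```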

14
imaparchiver.service Normal file

@@ -0,0 +1,14 @@
[Unit]
Description=IMAP archiver service
Wants=network-online.target
After=network-online.target
[Service]
User=emailsync
Group=emailsync
ExecStart=/srv/email/imap-archiver/venv/bin/python3 /srv/email/imap-archiver/run.py --config /srv/email/imap-archiver/config.yml
SyslogIdentifier=imap-archiver
Restart=on-failure
[Install]
WantedBy=multi-user.target

3
requirements.txt Normal file

@@ -0,0 +1,3 @@
pyyaml==6.0.1
chardet==5.2.0
humanize==4.9.0

81
run.py Executable file

@@ -0,0 +1,81 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
from datetime import datetime
from pathlib import Path
import humanize
import yaml
from iarchiver.database import EmailDatabase
from iarchiver.email import unix_timestamp_since_to_imap_timestamp
from iarchiver.mail import MailConnection
def main(args):
logging.basicConfig()
logger = logging.getLogger('iarchiver')
logger.setLevel(logging.INFO)
with open(args.config) as f:
config = yaml.safe_load(f)
exclude_folders = config.get('exclude_folders', [])
    if not all(config.get(k) for k in ('server', 'username', 'password', 'database_path', 'attachments_path')):
logger.critical('Bad config file.')
sys.exit(1)
attachments_dir = Path(config['attachments_path'])
attachments_dir.mkdir(parents=True, exist_ok=True)
database = EmailDatabase(Path(config['database_path']))
mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir)
mail.load_folders()
logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...')
new_emails = 0
new_attachments = 0
did_full_sync = False
sync_start_time = datetime.now()
for parent_folder, subfolders in mail.folder_structure.items():
if parent_folder in exclude_folders:
# Exclude folder
continue
for folder in [parent_folder, *subfolders]:
folder_name = parent_folder + '/' + folder
if folder_name == f'{parent_folder}/{parent_folder}':
folder_name = parent_folder
if folder_name in exclude_folders:
# Exclude folder
continue
logger.info(folder_name)
last_refresh = database.have_we_done_a_full_sync_at_all()
if last_refresh:
date = unix_timestamp_since_to_imap_timestamp(last_refresh)
search_criterion = '(SINCE "' + date + '")'
else:
did_full_sync = True
search_criterion = 'ALL'
for email in mail.fetch_folder(folder_name, search_criterion=search_criterion):
timestamp, to_email, from_email, subject, raw, attachments = email
is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
if is_new_email:
new_emails += 1
                    if len(attachments):
                        new_attachments += len(attachments)
database.finish_sync('refresh' if not did_full_sync else 'full')
elapsed = datetime.now() - sync_start_time
    logger.info(f'Finished email {"refresh" if not did_full_sync else "sync"} in {humanize.naturaldelta(elapsed)} and added {new_emails} new emails and {new_attachments} new attachments.')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Sync and archive your IMAP server.')
parser.add_argument('--config', default='config.yml', help='Path to config file.')
args = parser.parse_args()
main(args)