add code
This commit is contained in:
parent
eff0c832fe
commit
878ec708e1
|
@ -1,3 +1,9 @@
|
||||||
|
.idea
|
||||||
|
attachments/
|
||||||
|
config.yml
|
||||||
|
emails.db
|
||||||
|
emails.db-journal
|
||||||
|
|
||||||
# ---> Python
|
# ---> Python
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
@ -159,4 +165,3 @@ cython_debug/
|
||||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
|
22
README.md
22
README.md
|
@ -1,3 +1,23 @@
|
||||||
# imap-archiver
|
# imap-archiver
|
||||||
|
|
||||||
Archive the content on your email account.
|
_Archive the content on your email account._
|
||||||
|
|
||||||
|
A very simple way to sync your email account with the goal of backing up all your emails.
|
||||||
|
|
||||||
|
Saves attachments to disk and stores the emails in an SQLite database.
|
||||||
|
|
||||||
|
No viewer yet, but the database is structured to support one. Multiple accounts are not supported.
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
1. Create a venv
|
||||||
|
2. `pip install -r requirements.txt`
|
||||||
|
3. `cp config.yml.sample config.yml`
|
||||||
|
4. Edit `config.yml` and configure your login info.
|
||||||
|
5. `python3 run.py`
|
||||||
|
|
||||||
|
A systemd service is included.
|
||||||
|
|
||||||
|
## To Do
|
||||||
|
|
||||||
|
- [ ] Fix subject decoding. Some character sets aren't detected correctly.
|
|
@ -0,0 +1,10 @@
|
||||||
|
server: imap.example.com
|
||||||
|
username: bob@example.com
|
||||||
|
password: password123
|
||||||
|
|
||||||
|
database_path: emails.db
|
||||||
|
attachments_path: attachments
|
||||||
|
|
||||||
|
exclude_folders:
|
||||||
|
- Trash
|
||||||
|
- Drafts
|
|
@ -0,0 +1,83 @@
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from iarchiver.mail import FileAttachment, FileAttachmentEncoder
|
||||||
|
|
||||||
|
|
||||||
|
def is_valid_table_name(table_name):
    """Report whether *table_name* is a usable SQL identifier.

    Valid names start with a letter or underscore and contain only
    letters, digits, and underscores.
    """
    match = re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', table_name)
    return match is not None
|
||||||
|
|
||||||
|
|
||||||
|
def sanitize_table_name(name):
    """Turn an IMAP folder name into a safe SQL identifier.

    Slashes become underscores, every remaining non-word character is
    dropped, and a leading digit gets an underscore prefix.
    """
    candidate = re.sub(r'\W+', '', name.replace('/', '_'))
    # A leading digit is not a valid identifier start; prefix it.
    if candidate[:1].isdigit():
        candidate = '_' + candidate
    return candidate
|
||||||
|
|
||||||
|
|
||||||
|
class EmailDatabase:
    """SQLite-backed archive of emails.

    Each IMAP folder gets its own table (name sanitized into a valid SQL
    identifier); the original-name -> table-name mapping lives in
    ``folders_mapping`` and completed syncs are recorded in ``syncs``.
    """

    # Table names reserved for the archiver's own bookkeeping.
    __restricted_strings = ['folders_mapping', 'syncs']

    def __init__(self, filepath: Path):
        """Open (or create) the database file and ensure the bookkeeping tables exist."""
        filepath = filepath.expanduser().absolute().resolve()
        self.conn = sqlite3.connect(filepath)
        cursor = self.conn.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS folders_mapping (name TEXT UNIQUE, table_name TEXT UNIQUE)')
        cursor.execute('CREATE TABLE IF NOT EXISTS syncs (timestamp INTEGER UNIQUE, type TEXT)')
        self.conn.commit()
        cursor.close()

    def __create_table(self, table_name: str):
        """Create the per-folder table if missing and record its name mapping.

        Raises:
            ValueError: if the sanitized name collides with a bookkeeping table.
        """
        sanitized_table_name = sanitize_table_name(table_name)
        if sanitized_table_name in self.__restricted_strings:
            raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
        cursor = self.conn.cursor()
        # Table names cannot be bound as SQL parameters; sanitize_table_name()
        # restricts the identifier to [A-Za-z0-9_], so interpolation is safe here.
        cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} '
                       '(timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, '
                       'raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
        cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)',
                       (table_name, sanitized_table_name))
        self.conn.commit()
        cursor.close()

    def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: 'List[FileAttachment]'):
        """Insert an email into the folder's table unless an identical one
        (same timestamp and same raw body) is already stored.

        Returns:
            True if a new row was inserted, False if it was a duplicate.
        """
        sanitized_table_name = sanitize_table_name(folder)
        self.__create_table(folder)
        cursor = self.conn.cursor()

        # Duplicate detection: same timestamp AND same raw content.
        cursor.execute(f'SELECT 1 FROM {sanitized_table_name} WHERE timestamp = ? AND raw = ?',
                       (timestamp, raw))
        new_email = False
        if cursor.fetchone() is None:
            cursor.execute(
                f'INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)',
                (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
            self.conn.commit()
            new_email = True
        cursor.close()
        return new_email

    def finish_sync(self, sync_type: str):
        """Record a completed sync of the given type and return its unix timestamp."""
        now = int(time.time())
        cursor = self.conn.cursor()
        cursor.execute('INSERT INTO syncs (timestamp, type) VALUES (?, ?)', (now, sync_type))
        self.conn.commit()
        cursor.close()
        return now

    def have_we_done_a_full_sync_at_all(self):
        """Return the timestamp of the most recent recorded sync, or None.

        Bug fix: the query previously sorted ascending and returned the
        OLDEST sync, which made every incremental run re-fetch all mail
        since the very first sync. Callers treat this value as the "last
        refresh" cutoff, so order descending to get the latest.
        """
        cursor = self.conn.cursor()
        cursor.execute("SELECT * FROM syncs ORDER BY timestamp DESC LIMIT 1")
        row = cursor.fetchone()
        cursor.close()
        return row[0] if row is not None else None
|
|
@ -0,0 +1,12 @@
|
||||||
|
import re
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
|
||||||
|
def extract_emails(field: str):
    """Pull every email address out of a header field.

    The field is lowercased first, duplicates are collapsed, and the
    unique addresses are returned as a tuple.
    """
    pattern = r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)'
    unique_addresses = set()
    for address in re.findall(pattern, field.lower()):
        unique_addresses.add(str(address))
    return tuple(unique_addresses)
|
||||||
|
|
||||||
|
|
||||||
|
def unix_timestamp_since_to_imap_timestamp(unix_timestamp: Union[int, float]) -> str:
    """Convert a unix timestamp to an IMAP SEARCH date (e.g. "01-Jan-2024").

    One day is subtracted as a safety margin so mail arriving around the
    cutoff is not missed (IMAP SINCE has only day granularity).

    Bug fix: strftime('%b') is locale-dependent, but RFC 3501 requires
    English month abbreviations; the month name is now looked up in a
    fixed table so the result is correct under any locale.
    """
    months = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
    moment = datetime.fromtimestamp(unix_timestamp) - timedelta(days=1)
    return f'{moment.day:02d}-{months[moment.month - 1]}-{moment.year}'
|
|
@ -0,0 +1,132 @@
|
||||||
|
import concurrent.futures
|
||||||
|
import email
|
||||||
|
import hashlib
|
||||||
|
import imaplib
|
||||||
|
import time
|
||||||
|
from email.header import decode_header
|
||||||
|
from email.utils import parsedate_to_datetime
|
||||||
|
from json import JSONEncoder
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import chardet
|
||||||
|
|
||||||
|
from iarchiver.email import extract_emails
|
||||||
|
|
||||||
|
|
||||||
|
def md5_chunked(data: bytes, size: int = 1024):
    """Return the MD5 hex digest of *data*, feeding the hasher in
    *size*-byte slices to keep peak memory low for large attachments."""
    digest = hashlib.md5()
    offset = 0
    total = len(data)
    while offset < total:
        digest.update(data[offset:offset + size])
        offset += size
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
class FileAttachment:
    """A single email attachment: its original filename plus the MD5 hash
    under which its contents are stored on disk."""

    def __init__(self, file_name: str, file_hash: str):
        self.filename = file_name
        self.hash = file_hash

    def to_dict(self):
        """Return a JSON-serializable view of this attachment."""
        return {
            'filename': self.filename,
            'hash': self.hash,
        }
|
||||||
|
|
||||||
|
|
||||||
|
class FileAttachmentEncoder(JSONEncoder):
    """JSON encoder that serializes FileAttachment objects via to_dict();
    everything else falls through to the default JSONEncoder behavior."""

    def default(self, o):
        if not isinstance(o, FileAttachment):
            return super().default(o)
        return o.to_dict()
|
||||||
|
|
||||||
|
|
||||||
|
class MailConnection:
    """Wrapper around an IMAP4-over-SSL mailbox that enumerates folders and
    fetches emails, saving attachments to disk de-duplicated by MD5 hash."""

    def __init__(self, host: str, username: str, password: str, attachments_dir: Path):
        """Connect to *host* over SSL and log in. *attachments_dir* is where
        attachment bodies are written (filename = content hash)."""
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(username, password)
        self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
        # {top_level_folder: [subfolder, ...]} — populated by load_folders().
        self.folder_structure = {}

    def load_folders(self):
        """Query the server's folder list and build self.folder_structure.

        NOTE(review): assumes the server uses "/" as the hierarchy delimiter
        and at most one level of nesting — confirm against the target server.
        """
        folders = [tuple(f.decode().split(' "/" ')[1].replace('"', '').split('/')) for f in self.mail.list()[1]]
        folder_structure = {}
        for parts in folders:
            folder_structure.setdefault(parts[0], [])
            if len(parts) > 1:
                folder_structure[parts[0]].append(parts[1])
        self.folder_structure = folder_structure
        return self.folder_structure

    def __fetch_email(self, i):
        """Fetch one email by UID and parse it.

        Returns (unix_timestamp, to, from, subject, raw_without_attachments,
        [FileAttachment, ...]), or None when the UID does not exist on the
        server. Attachment bodies are replaced in the stored email text by
        their MD5 hash and written to attachments_dir (de-duplicated).
        """
        result, data = self.mail.uid('fetch', str(i), '(BODY[])')  # fetch the raw email
        if data[0] is None:
            # Not every UID in the numeric range exists; skip the gaps.
            return
        raw_email_bytes = data[0][1]
        detected = chardet.detect(raw_email_bytes)
        encoding = detected['encoding'] or 'utf-8'
        raw_email = raw_email_bytes.decode(encoding, errors='replace')
        email_message = email.message_from_string(raw_email)
        date_header = email_message['Date']
        date = parsedate_to_datetime(date_header)
        unix_timestamp = int(time.mktime(date.timetuple()))

        # Bug fix: a missing From/To header used to crash extract_emails(None).
        from_header = ', '.join(extract_emails(email_message['From'] or ''))
        to_header = ', '.join(extract_emails(email_message['To'] or ''))
        if '@' not in to_header:
            # Fall back to the raw header when no address could be extracted.
            to_header = email_message['To']

        # Bug fix: emails with no Subject header were silently skipped
        # (the method returned None); archive them with an empty subject.
        subject = ''
        subject_header = email_message['Subject']
        if subject_header:
            subject = decode_header(subject_header)[0][0]
            if isinstance(subject, bytes):
                try:
                    encoding = chardet.detect(subject)['encoding'] or 'utf-8'
                    subject = subject.decode(encoding, errors='replace')
                except UnicodeDecodeError:
                    subject = subject.decode('utf-8')

        attachments = []
        if email_message.is_multipart():
            for part in email_message.walk():
                content_disposition = str(part.get("Content-Disposition"))
                if "attachment" in content_disposition:
                    filename = part.get_filename()
                    if filename:
                        # The file is stored under the hash of its content,
                        # which de-duplicates identical attachments.
                        filecontents = part.get_payload(decode=True)
                        filehash = md5_chunked(filecontents)
                        part.set_payload(filehash)  # replace the attachment body with its hash
                        filepath = self.attachments_dir / filehash
                        file_obj = FileAttachment(filename, filehash)
                        if not filepath.is_file():
                            with open(filepath, 'wb') as f:
                                f.write(filecontents)
                        attachments.append(file_obj)
        raw_email_clean = email_message.as_string()
        return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments

    def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
        """Yield parsed emails from *folder* matching *search_criterion*.

        Don't use multiple threads because most mail servers don't allow the
        client to multiplex.
        """
        self.mail.select(f'"{folder}"')
        result, data = self.mail.uid('search', None, search_criterion)
        id_list = data[0].split()
        if not id_list:
            # Empty folder
            return
        first_email_id = int(id_list[0])
        latest_email_id = int(id_list[-1])

        # NOTE(review): this walks the numeric UID range rather than id_list,
        # relying on __fetch_email() returning None for gaps.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            # Bug fix: the range previously stopped at first_email_id + 1,
            # so the oldest email in the folder was never fetched.
            futures = {executor.submit(self.__fetch_email, i)
                       for i in range(latest_email_id, first_email_id - 1, -1)}
            for future in concurrent.futures.as_completed(futures):
                fetched = future.result()
                if fetched is not None:
                    yield fetched
|
|
@ -0,0 +1,14 @@
|
||||||
|
[Unit]
|
||||||
|
Description=IMAP archiver service
|
||||||
|
Wants=network-online.target
|
||||||
|
After=network-online.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
User=emailsync
|
||||||
|
Group=emailsync
|
||||||
|
ExecStart=/srv/email/imap-archiver/venv/bin/python3 /srv/email/imap-archiver/run.py --config /srv/email/imap-archiver/config.yml
|
||||||
|
SyslogIdentifier=imap-archiver
|
||||||
|
Restart=on-failure
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
|
@ -0,0 +1,3 @@
|
||||||
|
pyyaml==6.0.1
|
||||||
|
chardet==5.2.0
|
||||||
|
humanize==4.9.0
|
|
@ -0,0 +1,81 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import humanize
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from iarchiver.database import EmailDatabase
|
||||||
|
from iarchiver.email import unix_timestamp_since_to_imap_timestamp
|
||||||
|
from iarchiver.mail import MailConnection
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Entry point: read the YAML config, connect to the IMAP server, and
    sync every non-excluded folder into the local SQLite archive."""
    logging.basicConfig()
    logger = logging.getLogger('iarchiver')
    logger.setLevel(logging.INFO)
    with open(args.config) as f:
        config = yaml.safe_load(f)

    exclude_folders = config.get('exclude_folders', [])

    # All of these keys are required for a sync to make sense.
    if not config.get('server') or not config.get('username') or not config.get('password') or not config.get('database_path') or not config.get('attachments_path'):
        logger.critical('Bad config file.')
        sys.exit(1)

    attachments_dir = Path(config['attachments_path'])
    attachments_dir.mkdir(parents=True, exist_ok=True)

    database = EmailDatabase(Path(config['database_path']))
    mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir)
    mail.load_folders()

    logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...')

    new_emails = 0
    new_attachments = 0  # counts new emails that carry at least one attachment
    did_full_sync = False
    sync_start_time = datetime.now()

    # Hoisted out of the folder loop: the last-sync timestamp cannot change
    # until finish_sync() below, so query it once instead of per folder.
    last_refresh = database.have_we_done_a_full_sync_at_all()
    if last_refresh:
        # Incremental refresh: only fetch mail since the previous sync.
        date = unix_timestamp_since_to_imap_timestamp(last_refresh)
        search_criterion = '(SINCE "' + date + '")'
    else:
        did_full_sync = True
        search_criterion = 'ALL'

    for parent_folder, subfolders in mail.folder_structure.items():
        if parent_folder in exclude_folders:
            # Exclude folder
            continue
        for folder in [parent_folder, *subfolders]:
            folder_name = parent_folder + '/' + folder
            if folder_name == f'{parent_folder}/{parent_folder}':
                # This iteration is the parent itself, not a subfolder.
                folder_name = parent_folder
            if folder_name in exclude_folders:
                # Exclude folder
                continue
            logger.info(folder_name)

            # Loop variable renamed from `email` to avoid shadowing the
            # stdlib module name.
            for message in mail.fetch_folder(folder_name, search_criterion=search_criterion):
                timestamp, to_email, from_email, subject, raw, attachments = message
                is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
                if is_new_email:
                    new_emails += 1
                    if len(attachments):
                        new_attachments += 1

    database.finish_sync('refresh' if not did_full_sync else 'full')

    elapsed = datetime.now() - sync_start_time
    logger.info(f'Finished email {"refresh" if not did_full_sync else "sync"} in {humanize.naturaldelta(elapsed)} and added {new_emails} new emails.')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # CLI entry point: parse arguments and hand off to main().
    arg_parser = argparse.ArgumentParser(description='Sync and archive your IMAP server.')
    arg_parser.add_argument('--config', default='config.yml', help='Path to config file.')
    main(arg_parser.parse_args())
|
Loading…
Reference in New Issue