add code
This commit is contained in:
parent
eff0c832fe
commit
878ec708e1
|
@ -1,3 +1,9 @@
|
|||
.idea
|
||||
attachments/
|
||||
config.yml
|
||||
emails.db
|
||||
emails.db-journal
|
||||
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
@ -159,4 +165,3 @@ cython_debug/
|
|||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
|
|
22
README.md
22
README.md
|
@ -1,3 +1,23 @@
|
|||
# imap-archiver
|
||||
|
||||
Archive the content on your email account.
|
||||
_Archive the content on your email account._
|
||||
|
||||
A very simple way to sync your email account with the goal of backing up all your emails.
|
||||
|
||||
Saves attachments to disk and stores the emails in an SQLite database.
|
||||
|
||||
No viewer yet, but the database is structured so one can be built. Multiple accounts are not supported.
|
||||
|
||||
## Install
|
||||
|
||||
1. Create a venv
|
||||
2. `pip install -r requirements.txt`
|
||||
3. `cp config.yml.sample config.yml`
|
||||
4. Edit `config.yml` and configure your login info.
|
||||
5. `python3 run.py`
|
||||
|
||||
A systemd service is included.
|
||||
|
||||
## To Do
|
||||
|
||||
- [ ] Fix subject decoding. Some character sets aren't detected correctly.
|
|
@ -0,0 +1,10 @@
|
|||
server: imap.example.com
|
||||
username: bob@example.com
|
||||
password: password123
|
||||
|
||||
database_path: emails.db
|
||||
attachments_path: attachments
|
||||
|
||||
exclude_folders:
|
||||
- Trash
|
||||
- Drafts
|
|
@ -0,0 +1,83 @@
|
|||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from iarchiver.mail import FileAttachment, FileAttachmentEncoder
|
||||
|
||||
|
||||
def is_valid_table_name(table_name):
    """Return True when `table_name` is a bare SQL identifier.

    A valid name is a letter or underscore followed by letters, digits or
    underscores only. Uses re.fullmatch rather than re.match with `$`
    because `$` also matches just before a trailing newline, which would
    let a name like 'abc\\n' slip through into SQL.
    """
    return re.fullmatch(r'[a-zA-Z_][a-zA-Z0-9_]*', table_name) is not None
|
||||
|
||||
|
||||
def sanitize_table_name(name):
    """Turn an IMAP folder name into a safe SQLite table identifier.

    '/' separators become underscores, every other non-word character is
    dropped, and a leading digit gets an underscore prefix. The result may
    be '' when the name contains no word characters at all.
    """
    cleaned = re.sub(r'\W+', '', name.replace('/', '_'))
    # A leading digit is not a valid identifier start; prefix it.
    # (Slicing with [:1] also handles the empty-string case safely.)
    if cleaned[:1].isdigit():
        cleaned = '_' + cleaned
    return cleaned
|
||||
|
||||
|
||||
class EmailDatabase:
    """SQLite-backed store for archived emails.

    Each IMAP folder gets its own table (named via sanitize_table_name());
    the folder-name -> table-name mapping lives in `folders_mapping`, and
    each completed sync run is recorded in `syncs`.
    """

    # Table names reserved for internal bookkeeping; a user folder must not
    # sanitize down to one of these.
    __restricted_strings = ['folders_mapping', 'syncs']

    def __init__(self, filepath: Path):
        """Open (or create) the database at `filepath` and ensure the two
        bookkeeping tables exist."""
        filepath = filepath.expanduser().absolute().resolve()
        self.conn = sqlite3.connect(filepath)
        cursor = self.conn.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS folders_mapping (name TEXT UNIQUE, table_name TEXT UNIQUE)')
        cursor.execute('CREATE TABLE IF NOT EXISTS syncs (timestamp INTEGER UNIQUE, type TEXT)')
        self.conn.commit()
        cursor.close()

    def __create_table(self, table_name: str):
        """Create the per-folder table for `table_name` if needed and record
        the folder -> table mapping.

        Raises:
            ValueError: if the folder name sanitizes to an empty string or
                collides with an internal bookkeeping table.
        """
        sanitized_table_name = sanitize_table_name(table_name)
        # A folder name made up entirely of non-word characters sanitizes to
        # '', which would produce invalid SQL below -- reject it explicitly.
        if not sanitized_table_name:
            raise ValueError(f'Invalid table name, sanitizes to empty string: {table_name}')
        if sanitized_table_name in self.__restricted_strings:
            raise ValueError(f'Invalid table name, conflicts with system tables: {table_name}')
        cursor = self.conn.cursor()
        # An identifier cannot be bound as a ? parameter; interpolation is
        # safe here because sanitize_table_name() strips everything outside
        # [A-Za-z0-9_].
        cursor.execute(f'CREATE TABLE IF NOT EXISTS {sanitized_table_name} (timestamp INTEGER, to_email TEXT, from_email TEXT, subject TEXT, raw TEXT, attachments TEXT, id INTEGER PRIMARY KEY AUTOINCREMENT)')
        cursor.execute('INSERT OR IGNORE INTO folders_mapping (name, table_name) VALUES (?, ?)', (table_name, sanitized_table_name))
        self.conn.commit()
        cursor.close()

    def insert_email(self, folder: str, timestamp: int, subject: str, raw: str, to_email: str, from_email: str, attachments: List[FileAttachment]):
        """Insert one email into the folder's table unless an identical
        (timestamp, raw) row already exists.

        Returns:
            True when a new row was inserted, False when it was a duplicate.
        """
        sanitized_table_name = sanitize_table_name(folder)
        self.__create_table(folder)
        cursor = self.conn.cursor()

        # De-duplicate on (timestamp, raw body) before inserting.
        stmt_check = f"SELECT * FROM {sanitized_table_name} WHERE timestamp = ? AND raw = ?"
        cursor.execute(stmt_check, (timestamp, raw))
        data = cursor.fetchone()

        new_email = False
        if data is None:
            stmt = f"INSERT INTO {sanitized_table_name} (timestamp, to_email, from_email, subject, raw, attachments) VALUES (?, ?, ?, ?, ?, ?)"
            # Attachments are stored as a JSON list of {filename, hash} dicts.
            cursor.execute(stmt, (timestamp, to_email, from_email, subject, raw, json.dumps(attachments, cls=FileAttachmentEncoder)))
            self.conn.commit()
            new_email = True
        cursor.close()
        return new_email

    def finish_sync(self, sync_type: str):
        """Record the completion of a sync run of the given type ('full' or
        'refresh') and return its Unix timestamp."""
        now = int(time.time())
        cursor = self.conn.cursor()
        cursor.execute('INSERT INTO syncs (timestamp, type) VALUES (?, ?)', (now, sync_type))
        self.conn.commit()
        cursor.close()
        return now

    def have_we_done_a_full_sync_at_all(self):
        """Return the timestamp of the OLDEST recorded sync, or None when no
        sync has ever completed.

        NOTE(review): despite the name, this does not filter on type='full';
        it relies on the first recorded sync always being a full one (see
        run.py, which only records a sync after completing the whole run).
        """
        cursor = self.conn.cursor()
        cursor.execute("SELECT * FROM syncs ORDER BY timestamp LIMIT 1")
        row = cursor.fetchone()
        cursor.close()
        if row is not None:
            return row[0]
        else:
            return None
|
|
@ -0,0 +1,12 @@
|
|||
import re
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Union
|
||||
|
||||
|
||||
def extract_emails(field: str):
    """Extract all email addresses from a header field.

    Addresses are lowercased and de-duplicated. The original built the
    result from a set, so the order of the returned tuple changed from call
    to call; dict.fromkeys de-duplicates while preserving first-occurrence
    order, making the stored header strings deterministic.

    Returns:
        Tuple of unique address strings (empty tuple when none match).
    """
    matches = re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', field.lower())
    return tuple(dict.fromkeys(matches))
|
||||
|
||||
|
||||
def unix_timestamp_since_to_imap_timestamp(unix_timestamp: Union[int, float]) -> str:
|
||||
return (datetime.fromtimestamp(unix_timestamp) - timedelta(days=1)).strftime("%d-%b-%Y")
|
|
@ -0,0 +1,132 @@
|
|||
import concurrent.futures
|
||||
import email
|
||||
import hashlib
|
||||
import imaplib
|
||||
import time
|
||||
from email.header import decode_header
|
||||
from email.utils import parsedate_to_datetime
|
||||
from json import JSONEncoder
|
||||
from pathlib import Path
|
||||
|
||||
import chardet
|
||||
|
||||
from iarchiver.email import extract_emails
|
||||
|
||||
|
||||
def md5_chunked(data: bytes, size: int = 1024):
    """Return the hex MD5 digest of `data`, feeding the hasher in
    `size`-byte slices instead of one large update."""
    digest = hashlib.md5()
    offset = 0
    total = len(data)
    while offset < total:
        digest.update(data[offset:offset + size])
        offset += size
    return digest.hexdigest()
|
||||
|
||||
|
||||
class FileAttachment:
    """Metadata for a stored attachment: its original filename and the
    content hash under which its bytes were saved on disk."""

    def __init__(self, file_name: str, file_hash: str):
        self.filename = file_name
        self.hash = file_hash

    def to_dict(self):
        """Return a JSON-serializable representation of this attachment."""
        return dict(filename=self.filename, hash=self.hash)
|
||||
|
||||
|
||||
class FileAttachmentEncoder(JSONEncoder):
    """JSON encoder that serializes FileAttachment objects via to_dict()."""

    def default(self, o):
        # Anything that is not a FileAttachment falls through to the base
        # implementation (which raises TypeError for unknown types).
        if not isinstance(o, FileAttachment):
            return super().default(o)
        return o.to_dict()
|
||||
|
||||
|
||||
class MailConnection:
    """Wrapper around a single IMAP4-over-SSL connection.

    Logs in on construction and exposes folder listing plus a generator
    that fetches every message in a folder, saving attachments to disk.
    """

    def __init__(self, host: str, username: str, password: str, attachments_dir: Path):
        # Connects and authenticates immediately; imaplib errors propagate.
        self.mail = imaplib.IMAP4_SSL(host)
        self.mail.login(username, password)
        self.attachments_dir = attachments_dir.expanduser().absolute().resolve()
        # Parent folder name -> list of sub-folder names; set by load_folders().
        self.folder_structure = {}

    def load_folders(self):
        """Fetch the folder list and build a one-level parent -> [children]
        mapping in self.folder_structure; returns that mapping.

        NOTE(review): parses the LIST response textually and assumes the
        hierarchy delimiter is "/" -- verify against servers that use "."
        or a NIL delimiter.
        """
        folders = [tuple(f.decode().split(' "/" ')[1].replace('"', '').split('/')) for f in self.mail.list()[1]]
        folder_structure = {}
        for f in folders:
            if not folder_structure.get(f[0]):
                folder_structure[f[0]] = []
            if len(f) > 1:
                # Only one level of nesting (parent/child) is preserved.
                folder_structure[f[0]].append(f[1])
        self.folder_structure = folder_structure
        return self.folder_structure

    def __fetch_email(self, i):
        """Fetch the message with UID `i` from the currently selected folder.

        Saves each attachment to self.attachments_dir under its content
        hash and replaces the attachment body with that hash before
        serializing the message back to text.

        Returns (unix_timestamp, to, from, subject, raw_email, attachments)
        or None when the UID is missing or the message has no Subject.
        """
        result, data = self.mail.uid('fetch', str(i), '(BODY[])')  # fetch the raw email
        if data[0] is None:
            # UID does not exist (e.g. message deleted); skip it.
            return
        raw_email_bytes = data[0][1]
        # Guess the byte encoding of the whole message; fall back to UTF-8.
        detected = chardet.detect(raw_email_bytes)
        encoding = detected['encoding']
        if not encoding:
            encoding = 'utf-8'
        raw_email = raw_email_bytes.decode(encoding, errors='replace')
        email_message = email.message_from_string(raw_email)
        date_header = email_message['Date']
        # NOTE(review): raises if the Date header is missing or unparsable,
        # and time.mktime() re-interprets the parsed date in local time,
        # discarding the header's own UTC offset -- date.timestamp() would
        # preserve it. Confirm which behavior is intended.
        date = parsedate_to_datetime(date_header)
        unix_timestamp = int(time.mktime(date.timetuple()))

        from_header = ', '.join(extract_emails(email_message['From']))
        to_header = ', '.join(extract_emails(email_message['To']))
        if '@' not in to_header:
            # Nothing address-like was extracted; keep the raw header value.
            to_header = email_message['To']

        subject_header = email_message['Subject']
        if subject_header:
            # NOTE(review): only the first encoded-word of the subject is
            # decoded; multi-part encoded subjects lose their tail.
            subject = decode_header(subject_header)[0][0]
            if isinstance(subject, bytes):
                try:
                    # Guess the charset of the subject bytes; fall back to UTF-8.
                    detected = chardet.detect(subject)
                    encoding = detected['encoding']
                    if not encoding:
                        encoding = 'utf-8'
                    subject = subject.decode(encoding, errors='replace')
                except UnicodeDecodeError:
                    subject = subject.decode('utf-8')
        else:
            # Messages without a Subject header are skipped entirely.
            return

        attachments = []
        if email_message.is_multipart():
            for part in email_message.walk():
                content_type = part.get_content_type()
                content_disposition = str(part.get("Content-Disposition"))
                if "attachment" in content_disposition:
                    filename = part.get_filename()
                    if filename:
                        # The filename of the file is the hash of its content, which should de-duplicate files.
                        filecontents = part.get_payload(decode=True)
                        filehash = md5_chunked(filecontents)
                        part.set_payload(filehash)  # replace the attachment with its hash
                        filepath = self.attachments_dir / filehash
                        file_obj = FileAttachment(filename, filehash)
                        # Identical content was already saved under this hash.
                        if not filepath.is_file():
                            with open(filepath, 'wb') as f:
                                f.write(filecontents)
                        attachments.append(file_obj)
        # Serialized form with attachment bodies replaced by their hashes.
        raw_email_clean = email_message.as_string()
        return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments

    def fetch_folder(self, folder: str, search_criterion: str = 'ALL', max_threads: int = 1):
        """
        Yield fetched messages from `folder`, highest UID first.

        Don't use multiple threads because most mail servers don't allow the
        client to multiplex (and a single imaplib connection is not
        thread-safe).

        NOTE(review): range()'s end is exclusive, so the message with the
        lowest matching UID is never fetched -- confirm whether that is
        intentional.
        """
        self.mail.select(f'"{folder}"')
        result, data = self.mail.uid('search', None, search_criterion)
        mail_ids = data[0]
        id_list = mail_ids.split()
        if not len(id_list):
            # Empty folder
            return
        first_email_id = int(id_list[0])
        latest_email_id = int(id_list[-1])

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
            # Walks UIDs numerically between the endpoints; gaps (expunged
            # UIDs) are handled by __fetch_email returning None.
            futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)}
            for future in concurrent.futures.as_completed(futures):
                result = future.result()
                if result is not None:
                    yield result
|
|
@ -0,0 +1,14 @@
|
|||
[Unit]
|
||||
Description=IMAP archiver service
|
||||
Wants=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
User=emailsync
|
||||
Group=emailsync
|
||||
ExecStart=/srv/email/imap-archiver/venv/bin/python3 /srv/email/imap-archiver/run.py --config /srv/email/imap-archiver/config.yml
|
||||
SyslogIdentifier=imap-archiver
|
||||
Restart=on-failure
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
|
@ -0,0 +1,3 @@
|
|||
pyyaml==6.0.1
|
||||
chardet==5.2.0
|
||||
humanize==4.9.0
|
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import humanize
|
||||
import yaml
|
||||
|
||||
from iarchiver.database import EmailDatabase
|
||||
from iarchiver.email import unix_timestamp_since_to_imap_timestamp
|
||||
from iarchiver.mail import MailConnection
|
||||
|
||||
|
||||
def main(args):
    """Load the YAML config, connect to the IMAP server and sync every
    non-excluded folder into the local archive database.

    Exits with status 1 when a required config key is missing.
    """
    logging.basicConfig()
    logger = logging.getLogger('iarchiver')
    logger.setLevel(logging.INFO)
    with open(args.config) as f:
        config = yaml.safe_load(f)

    exclude_folders = config.get('exclude_folders', [])

    # All of these keys are required for a usable run.
    if not config.get('server') or not config.get('username') or not config.get('password') or not config.get('database_path') or not config.get('attachments_path'):
        logger.critical('Bad config file.')
        sys.exit(1)

    attachments_dir = Path(config['attachments_path'])
    attachments_dir.mkdir(parents=True, exist_ok=True)

    database = EmailDatabase(Path(config['database_path']))
    mail = MailConnection(config['server'], config['username'], config['password'], attachments_dir)
    mail.load_folders()

    logger.info(f'Syncing {len(mail.folder_structure.keys())} folders...')

    new_emails = 0
    new_attachments = 0
    did_full_sync = False
    sync_start_time = datetime.now()

    # The recorded-sync timestamp only changes when finish_sync() runs (at
    # the very end), so the search criterion is loop-invariant: compute it
    # once instead of re-querying the database for every folder.
    last_refresh = database.have_we_done_a_full_sync_at_all()
    if last_refresh:
        date = unix_timestamp_since_to_imap_timestamp(last_refresh)
        search_criterion = '(SINCE "' + date + '")'
    else:
        did_full_sync = True
        search_criterion = 'ALL'

    for parent_folder, subfolders in mail.folder_structure.items():
        if parent_folder in exclude_folders:
            # Exclude folder
            continue
        for folder in [parent_folder, *subfolders]:
            folder_name = parent_folder + '/' + folder
            # The parent itself appears in its own iteration; undo the
            # 'Parent/Parent' doubling.
            if folder_name == f'{parent_folder}/{parent_folder}':
                folder_name = parent_folder
            if folder_name in exclude_folders:
                # Exclude folder
                continue
            logger.info(folder_name)

            for message in mail.fetch_folder(folder_name, search_criterion=search_criterion):
                timestamp, to_email, from_email, subject, raw, attachments = message
                is_new_email = database.insert_email(folder_name, timestamp, subject, raw, to_email, from_email, attachments)
                if is_new_email:
                    new_emails += 1
                    # Count every attachment (the original added 1 per email
                    # that had any, which undercounted).
                    new_attachments += len(attachments)

    database.finish_sync('refresh' if not did_full_sync else 'full')

    elapsed = datetime.now() - sync_start_time
    logger.info(f'Finished email {"refresh" if not did_full_sync else "sync"} in {humanize.naturaldelta(elapsed)} and added {new_emails} new emails and {new_attachments} new attachments.')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # CLI entry point: the only option is the path to the YAML config file.
    arg_parser = argparse.ArgumentParser(description='Sync and archive your IMAP server.')
    arg_parser.add_argument('--config', default='config.yml', help='Path to config file.')
    args = arg_parser.parse_args()
    main(args)
|
Loading…
Reference in New Issue