add viewer server

This commit is contained in:
Cyberes 2024-03-06 14:25:36 -07:00
parent be1083e9b3
commit 94aadf1246
6 changed files with 223 additions and 75 deletions

View File

@ -2,6 +2,7 @@ import concurrent.futures
import email
import imaplib
import logging
import traceback
from email.header import decode_header
from email.utils import parsedate_to_datetime
from json import JSONEncoder
@ -15,12 +16,13 @@ from iarchiver.hash import murmur3_chunked
class FileAttachment:
    """Metadata describing one attachment that was split out of an email.

    Attributes:
        filename: Name of the attachment as given by the caller.
        hash: Hash string identifying the attachment's content.
        content_type: Content type reported for the attachment.
    """

    def __init__(self, file_name: str, file_hash: str, content_type: str):
        self.filename = file_name
        self.hash = file_hash
        self.content_type = content_type

    def to_dict(self):
        """Return the attachment metadata as a plain dict."""
        return {'filename': self.filename, 'hash': self.hash, 'content_type': self.content_type}
class FileAttachmentEncoder(JSONEncoder):
@ -44,79 +46,82 @@ class MailConnection:
return self.folder_structure
def __fetch_email(self, i):
    """
    Fetch one message by UID and reduce it to the fields the archiver stores.

    Attachments that have a filename and a non-empty payload are written to
    ``self.attachments_dir`` as ``F<hash>`` (skipped when that file already
    exists) and their payload inside the message is replaced by ``MMH3:<hash>``.

    :param i: UID of the message to fetch.
    :return: ``(unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments)``,
        or ``None`` when the server returned no data, the message has neither a From nor a
        To header, the message has no Subject header, or any unexpected error occurred
        (the traceback is logged at critical level).
    """
    try:
        _, data = self.mail.uid('fetch', str(i), '(BODY[])')  # fetch the raw email
        if data[0] is None:
            return
        raw_email_bytes = data[0][1]
        try:
            detected = chardet.detect(raw_email_bytes)
        except TypeError as e:
            self.logger.critical(f'Failed to decode an email. Timeout? Server error? - "{e}"')
            return
        encoding = detected['encoding'] or 'utf-8'
        raw_email = raw_email_bytes.decode(encoding, errors='replace')
        email_message = email.message_from_string(raw_email)

        # A missing Date header is archived as the Unix epoch. A malformed one is treated the
        # same way instead of discarding the whole message.
        epoch_header = 'Thu, 1 Jan 1970 00:00:00 +0000'
        date_header = email_message['Date'] or epoch_header
        try:
            parsed_date = email.utils.parsedate_to_datetime(date_header)
        except (TypeError, ValueError):
            parsed_date = email.utils.parsedate_to_datetime(epoch_header)
        unix_timestamp = int(parsed_date.timestamp())

        from_addr = email_message['From']
        to_addr = email_message['To']
        if not from_addr and not to_addr:
            return
        from_addr = from_addr or ''
        to_addr = to_addr or ''
        from_header = ', '.join(extract_emails(from_addr))
        to_header = ', '.join(extract_emails(to_addr))
        # Keep the raw header when no address could be extracted from it.
        if '@' not in from_header:
            from_header = from_addr
        if '@' not in to_header:
            to_header = to_addr

        subject_header = email_message['Subject']
        if not subject_header:
            return
        # decode_header() yields one (fragment, charset) pair per encoded word or plain run.
        # Decode every fragment with its declared charset, guessing with chardet only when
        # none is declared, so that multi-fragment subjects are not truncated.
        subject_parts = []
        for fragment, charset in decode_header(subject_header):
            if isinstance(fragment, bytes):
                charset = charset or chardet.detect(fragment)['encoding'] or 'utf-8'
                try:
                    fragment = fragment.decode(charset, errors='replace')
                except LookupError:
                    # Declared charset is unknown to Python (e.g. "unknown-8bit").
                    fragment = fragment.decode('utf-8', errors='replace')
            subject_parts.append(fragment)
        subject = ''.join(subject_parts)

        attachments = []
        if email_message.is_multipart():
            for part in email_message.walk():
                content_disposition = str(part.get("Content-Disposition"))
                if "attachment" not in content_disposition:
                    continue
                filename = part.get_filename()
                if not filename:
                    continue
                # The filename of the file is the hash of its content, which should de-duplicate files.
                filecontents = part.get_payload(decode=True)
                if not filecontents:
                    continue
                filehash = murmur3_chunked(filecontents)
                part.set_payload(f'MMH3:{filehash}')  # replace the attachment with its hash
                filepath = self.attachments_dir / f'F{filehash}'
                file_obj = FileAttachment(filename, filehash, part.get_content_type())
                if not filepath.is_file():
                    with open(filepath, 'wb') as f:
                        f.write(filecontents)
                attachments.append(file_obj)
        raw_email_clean = email_message.as_string()
        return unix_timestamp, to_header, from_header, subject, raw_email_clean, attachments
    except Exception:
        # Best-effort: one broken message must not abort the whole folder sync.
        self.logger.critical(traceback.format_exc())
        return
def fetch_folder(self, folder: str, search_criterion: List[str] = None, max_threads: int = 1):
"""
@ -135,7 +140,6 @@ class MailConnection:
first_email_id = int(id_list[0])
latest_email_id = int(id_list[-1])
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads) as executor:
futures = {executor.submit(self.__fetch_email, i) for i in range(latest_email_id, first_email_id, -1)}
for future in concurrent.futures.as_completed(futures):

View File

@ -1,4 +1,5 @@
pyyaml==6.0.1
chardet==5.2.0
humanize==4.9.0
mmh3==4.1.0
flask==3.0.2
python-magic==0.4.27

View File

@ -0,0 +1,63 @@
import json
import sqlite3
from datetime import datetime
from pathlib import Path
import magic
from flask import Flask, render_template, send_from_directory
# Flask application object; the @app.route decorators in this module register their views on it.
app = Flask(__name__)
def get_db_connection(db_path='emails.db'):
    """Open a SQLite connection whose rows can be addressed by column name.

    Args:
        db_path: Database to open. Defaults to ``emails.db``, which SQLite
            resolves against the current working directory.

    Returns:
        An open ``sqlite3.Connection`` with ``row_factory`` set to
        ``sqlite3.Row``. The caller is responsible for closing it.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    return conn
def dict_from_row(row):
    """Copy a row into a plain, mutable dict keyed by the names from ``row.keys()``."""
    columns = row.keys()
    return {column: value for column, value in zip(columns, row)}
@app.route('/')
def index():
    """Render the landing page: the archived folders and the sync history, newest sync first."""
    conn = get_db_connection()
    try:
        folders = conn.execute('SELECT name, table_name FROM folders_mapping').fetchall()
        sync_rows = conn.execute('SELECT * FROM syncs ORDER BY timestamp DESC').fetchall()
    finally:
        # Release the connection even when a query fails (e.g. a table is missing).
        conn.close()
    # sqlite3.Row is read-only, so copy each row before rewriting its timestamp for display.
    syncs = [dict_from_row(sync) for sync in sync_rows]
    for sync in syncs:
        sync['timestamp'] = datetime.fromtimestamp(sync['timestamp']).strftime('%Y-%m-%d %H:%M:%S')
    return render_template('index.html', folders=folders, syncs=syncs)
@app.route('/folder/<table_name>')
def folder(table_name):
    """List every email stored in one folder table, newest first.

    ``table_name`` comes straight from the URL and table names cannot be bound
    as SQL parameters, so it is only used after it has been matched against
    ``folders_mapping``. Unknown names get a 404 instead of reaching the query.
    """
    from flask import abort  # local import: the module-level flask import does not pull in abort

    conn = get_db_connection()
    try:
        known = conn.execute(
            'SELECT 1 FROM folders_mapping WHERE table_name = ?', (table_name,)
        ).fetchone()
        if known is None:
            abort(404)
        # Quote the identifier as well, so unusual but legitimate table names stay valid SQL.
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        rows = conn.execute(f'SELECT * FROM {quoted_table} ORDER BY timestamp DESC').fetchall()
    finally:
        conn.close()
    # sqlite3.Row is read-only, so copy each row before rewriting its timestamp for display.
    emails = [dict_from_row(row) for row in rows]
    for message in emails:
        message['timestamp'] = datetime.fromtimestamp(message['timestamp']).strftime('%Y-%m-%d %H:%M:%S')
    return render_template('folder.html', emails=emails, table_name=table_name)
@app.route('/email/<table_name>/<id>')
def email(table_name, id):
    """Show one archived email together with its attachment list.

    ``table_name`` comes straight from the URL and table names cannot be bound
    as SQL parameters, so it is only used after it has been matched against
    ``folders_mapping``. An unknown folder or an unknown ``id`` gets a 404.
    The parameter is named ``id`` because the URL rule and the templates'
    ``url_for`` calls pass it under that name.
    """
    from flask import abort  # local import: the module-level flask import does not pull in abort

    conn = get_db_connection()
    try:
        known = conn.execute(
            'SELECT 1 FROM folders_mapping WHERE table_name = ?', (table_name,)
        ).fetchone()
        if known is None:
            abort(404)
        quoted_table = '"' + table_name.replace('"', '""') + '"'
        row = conn.execute(f'SELECT * FROM {quoted_table} WHERE id = ?', (id,)).fetchone()
    finally:
        conn.close()
    if row is None:
        abort(404)
    message = dict_from_row(row)
    message['timestamp'] = datetime.fromtimestamp(message['timestamp']).strftime('%Y-%m-%d %H:%M:%S')
    # NOTE(review): presumably the column holds a JSON list; an empty/NULL value is shown as
    # "no attachments" rather than failing in json.loads - confirm against the writer.
    attachments = json.loads(message['attachments'] or '[]')
    return render_template('email.html', email=message, attachments=attachments)
@app.route('/attachments/<path:filename>')
def download_file(filename):
    """Serve a stored attachment with a MIME type sniffed from its content.

    The requested path is resolved and checked to be an existing file inside
    the attachments directory *before* libmagic opens it; anything else is a
    404. The directory is anchored at ``app.root_path`` so that the sniffed
    file and the file ``send_from_directory`` serves are always the same one.
    """
    from flask import abort  # local import: the module-level flask import does not pull in abort

    base_dir = Path(app.root_path, 'attachments').resolve()
    target = (base_dir / filename).resolve()
    if base_dir not in target.parents or not target.is_file():
        abort(404)
    mimetype = magic.from_file(str(target), mime=True)
    return send_from_directory(base_dir, filename, mimetype=mimetype)
if __name__ == '__main__':
    import os

    # The server listens on every interface, and the Werkzeug debugger lets anyone who can
    # reach it execute arbitrary Python. Debug mode is therefore opt-in via FLASK_DEBUG
    # instead of being always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', debug=debug_enabled)

17
templates/email.html Normal file
View File

@ -0,0 +1,17 @@
<!doctype html>
{# Detail page for one email: links to its stored attachments, then the raw message text. #}
<html>
<head>
    <title>Email</title>
</head>
<body>
<h1>Email</h1>
<h2>Attachments</h2>
<ul>
    {% for attachment in attachments %}
        {# Stored files are named "F" + hash, so the link is built from the hash, not the filename. #}
        <li><a href="{{ url_for('download_file', filename='F' + attachment.hash) }}">{{ attachment.filename }}</a></li>
    {% endfor %}
</ul>
<h2>Content</h2>
<pre>{{ email.raw_content }}</pre>
</body>
</html>

23
templates/folder.html Normal file
View File

@ -0,0 +1,23 @@
<!doctype html>
{# Listing page for one folder: one link per email, in the order the view supplies them. #}
<html>
<head>
    <title>Emails</title>
    <style>
        ul {
            list-style-type: none;
        }
    </style>
</head>
<body>
<h1>Emails</h1>
<ul>
    {% for email in emails %}
        <li>
            <a href="{{ url_for('email', table_name=table_name, id=email.id) }}">
                {{ email.timestamp }} | <i>{{ email.from_email }}</i> - <strong>{{ email.subject }}</strong>
            </a>
        </li>
    {% endfor %}
</ul>
</body>
</html>

40
templates/index.html Normal file
View File

@ -0,0 +1,40 @@
<!doctype html>
{# Landing page: links to every archived folder, followed by a table of past syncs. #}
<html>
<head>
    <title>Email Folders</title>
    <style>
        ul {
            list-style-type: none;
        }
    </style>
</head>
<body>
<h1>Email Folders</h1>
<ul>
    {% for folder in folders %}
        <li><a href="{{ url_for('folder', table_name=folder.table_name) }}">{{ folder.name }}</a></li>
    {% endfor %}
</ul>
<h1>Last Syncs</h1>
<table>
    <tr>
        <th>Timestamp</th>
        <th>Type</th>
        <th>New Emails</th>
        <th>New Attachments</th>
        <th>New Folders</th>
        <th>Duration</th>
    </tr>
    {% for sync in syncs %}
        <tr>
            <td>{{ sync.timestamp }}</td>
            <td>{{ sync.type }}</td>
            <td>{{ sync.new_emails }}</td>
            <td>{{ sync.new_attachments }}</td>
            <td>{{ sync.new_folders }}</td>
            <td>{{ sync.duration }}</td>
        </tr>
    {% endfor %}
</table>
</body>
</html>