initial prototype

This commit is contained in:
Cyberes 2023-06-06 00:55:26 -06:00
parent be55fac31c
commit a9a84ca3c0
4 changed files with 275 additions and 2 deletions

4
.gitignore vendored
View File

@ -1,3 +1,6 @@
matrix_rooms.db*
.idea
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
@ -159,4 +162,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

View File

@ -1,3 +1,6 @@
# matrix-spider-bot
A bot that explores the Matrix federation web.
_A bot that explores the Matrix federation web._
This is a Matrix bot that crawls through historical messages in rooms, looking for other Matrix room IDs. It then tries to join those rooms and continues the process.
The bot stores messages it finds in its database (including system metadata messages like edits and joins) along with the users in a room, plus various other metadata.

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
matrix-nio[e2e]

267
spider.py Normal file
View File

@ -0,0 +1,267 @@
import asyncio
import json
import re
import sqlite3
import time
from nio import (AsyncClient, CallEvent, JoinError, MegolmEvent, PowerLevelsEvent, RedactionEvent, RoomAvatarEvent, RoomCreateEvent, RoomEncryptedAudio, RoomEncryptedFile, RoomEncryptedImage, RoomEncryptedVideo, RoomEncryptionEvent, RoomGuestAccessEvent, RoomHistoryVisibilityEvent,
RoomJoinRulesEvent, RoomMemberEvent, RoomMessageAudio, RoomMessageEmote, RoomMessageFile, RoomMessageImage, RoomMessageNotice, RoomMessageText, RoomMessageUnknown, RoomMessageVideo, RoomMessagesError, RoomNameEvent, RoomTopicEvent, RoomUpgradeEvent, StickerEvent,
UnknownEncryptedEvent, UnknownEvent)
# SQLite database setup
conn = sqlite3.connect("matrix_rooms.db")
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS rooms (room_id TEXT PRIMARY KEY, server_hostname TEXT)''')
c.execute('''CREATE TABLE IF NOT EXISTS messages (
id INTEGER PRIMARY KEY,
event_id TEXT UNIQUE,
room_id TEXT,
sender TEXT,
event_type TEXT,
content TEXT,
timestamp INTEGER
)''')
c.execute('''CREATE TABLE IF NOT EXISTS room_members (
id INTEGER PRIMARY KEY,
room_id TEXT,
user_id TEXT,
server_hostname TEXT,
room_name TEXT,
topic TEXT,
snapshot_timestamp INTEGER,
current_timestamp INTEGER,
UNIQUE (room_id, user_id)
)''')
conn.commit()
# Matrix account credentials
USERNAME = "user1"
PASSWORD = "jkldskjldsajklsda"
HOMESERVER = "https://matrix-client.matrix.org"
# Regular expression to match room IDs
ROOM_ID_REGEX = re.compile(r"([!#][A-Za-z0-9]+:[A-Za-z0-9.-]+)")
def handle_room_message(event, room_id, client):
event_type = None
content = None
if isinstance(event, UnknownEvent):
event_type = 'unknown'
content = event.source
elif isinstance(event, UnknownEncryptedEvent):
event_type = 'unknown_encrypted'
content = event.source
elif isinstance(event, MegolmEvent):
event_type = 'megolm'
content = event.source
elif isinstance(event, CallEvent):
event_type = 'call'
content = event.source
elif isinstance(event, RoomEncryptionEvent):
event_type = 'encryption_enabled'
content = event.source
elif isinstance(event, RoomCreateEvent):
event_type = 'room_create'
content = event.source
elif isinstance(event, RoomGuestAccessEvent):
event_type = 'guest_access'
content = event.guest_access
elif isinstance(event, RoomJoinRulesEvent):
event_type = 'join_rules'
content = event.join_rule
elif isinstance(event, RoomHistoryVisibilityEvent):
event_type = 'history_visibility'
content = event.history_visibility
elif isinstance(event, RoomNameEvent):
event_type = 'room_name'
content = event.source
elif isinstance(event, RoomTopicEvent):
event_type = 'room_name'
content = event.source
elif isinstance(event, RoomAvatarEvent):
event_type = 'room_avatar'
content = event.avatar_url
elif isinstance(event, RoomMemberEvent):
event_type = 'member'
content = event.source
elif isinstance(event, RoomMessageText):
event_type = 'message'
content = event.body
# Add any room IDs we find in the message to our database
room_ids = re.findall(ROOM_ID_REGEX, event.body)
for new_room_id in room_ids:
insert_room(new_room_id)
# new_room_id = new_room_id[0] if new_room_id[0] else new_room_id[1]
if new_room_id not in client.rooms:
asyncio.create_task(join_room(client, new_room_id))
elif isinstance(event, RoomMessageEmote):
event_type = 'emote'
content = event.body
elif isinstance(event, RoomMessageNotice):
event_type = 'notice'
content = event.body
elif isinstance(event, RoomMessageUnknown):
event_type = 'unknown_msg'
content = event.source
elif isinstance(event, PowerLevelsEvent):
event_type = 'power_levels'
content = str(event.power_levels)
elif isinstance(event, RedactionEvent):
event_type = 'redaction'
content = {'redacts': event.redacts, 'reason': event.reason}
elif isinstance(event, StickerEvent):
event_type = 'sticker'
content = event.source
elif isinstance(event, RoomUpgradeEvent):
event_type = 'room_upgrade'
content = {'body': event.body, 'replacement_room': event.replacement_room}
elif isinstance(event, RoomMessageImage):
event_type = 'message_image'
content = {'url': event.url, 'body': event.body}
elif isinstance(event, RoomMessageAudio):
event_type = 'message_audio'
content = {'url': event.url, 'body': event.body}
elif isinstance(event, RoomMessageVideo):
event_type = 'message_video'
content = {'url': event.url, 'body': event.body}
elif isinstance(event, RoomMessageFile):
event_type = 'message_file'
content = {'url': event.url, 'body': event.body}
elif isinstance(event, RoomEncryptedImage):
event_type = 'message_enc_image'
content = event.source
elif isinstance(event, RoomEncryptedAudio):
event_type = 'message_enc_audio'
content = event.source
elif isinstance(event, RoomEncryptedVideo):
event_type = 'message_enc_video'
content = event.source
elif isinstance(event, RoomEncryptedFile):
event_type = 'message_enc_file'
content = event.source
else:
event_type = 'event_type_not_found'
content = event.source
if isinstance(content, dict):
content = json.dumps(content)
room_id, server_hostname = sanitize_room_id(room_id)
# print(event.event_id, room_id, event.sender, event_type, content, event.server_timestamp)
if event_type:
try:
c.execute("INSERT INTO messages (event_id, room_id, sender, event_type, content, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
(event.event_id, room_id, event.sender, event_type, content, event.server_timestamp))
conn.commit()
print(f"Added message: {event.event_id}")
except sqlite3.IntegrityError:
# print(f"Message {event.event_id} already exists in the database.")
pass
def store_room_members(room_id, room):
snapshot_timestamp = int(time.time())
room_name = room.display_name
topic = room.topic
for user_id in room.users:
server_hostname = user_id.split(":")[1]
try:
c.execute("INSERT INTO room_members (room_id, user_id, server_hostname, room_name, topic, snapshot_timestamp, current_timestamp) VALUES (?, ?, ?, ?, ?, ?, ?)",
(room_id, user_id, server_hostname, room_name, topic, snapshot_timestamp, snapshot_timestamp))
conn.commit()
print(f"Added member {user_id} to room {room_id}")
except sqlite3.IntegrityError:
pass
async def join_room(client, room_id):
response = await client.join(room_id)
if isinstance(response, JoinError):
print(f"Error while joining room {room_id}: {response.message}")
else:
print(f"Joined room {room_id}")
def sanitize_room_id(room_id: str):
if isinstance(room_id, tuple):
room_id = list(filter(None, list(room_id)))
if len(room_id) == 1:
room_id = room_id[0]
if room_id.startswith('#/#'):
room_id.replace('#/#', '#')
elif room_id.startswith('#/!'):
room_id.replace('#/!', '!')
server_hostname = room_id.split(":")[1]
else:
server_hostname = room_id[0].split(":")[1]
room_id = json.dumps(room_id)
else:
server_hostname = room_id.split(":")[1]
return room_id, server_hostname
def insert_room(room_id: str):
room_id, server_hostname = sanitize_room_id(room_id)
try:
c.execute("INSERT INTO rooms (room_id, server_hostname) VALUES (?, ?)", (room_id, server_hostname))
conn.commit()
print(f"Added room: {room_id}")
except sqlite3.IntegrityError:
pass
async def crawl_room_history(client, room_id):
prev_batch = ''
while True:
response = await client.room_messages(room_id=room_id, start=prev_batch, limit=100)
if isinstance(response, RoomMessagesError):
print(f"Error while fetching room messages: {response.message}")
break
if not response.chunk:
break
# Store room members in the database
room = client.rooms[room_id]
store_room_members(room_id, room)
insert_room(room_id)
for event in response.chunk:
handle_room_message(event, room_id, client)
prev_batch = response.end
async def main():
client = AsyncClient(HOMESERVER, USERNAME)
try:
await client.login(PASSWORD)
print(f"Logged in as {USERNAME}")
while True:
# Sync with the server to get the joined rooms
await client.sync(timeout=30000)
# Crawl through the history of each joined room
for room_id in client.rooms:
print(f"Crawling room history: {room_id}")
await crawl_room_history(client, room_id)
print('Crawl complete!')
print('===============================')
print('Sleeping 1 minute...')
time.sleep(60)
finally:
await client.close()
if __name__ == "__main__":
asyncio.run(main())