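Shuffle some things around: wrap long lines, tighten the room-ID regex, wrap join_room in try/except, and pass room fields to insert_room explicitly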

This commit is contained in:
Cyberes 2023-06-08 22:46:23 -06:00
parent 3448e4d683
commit afaac4247a
1 changed file with 30 additions and 23 deletions

53
spider.py Executable file → Normal file
View File

@ -6,8 +6,12 @@ import re
import sqlite3
import time
from nio import (AsyncClient, CallEvent, JoinError, MatrixRoom, MegolmEvent, PowerLevelsEvent, RedactionEvent, RoomAvatarEvent, RoomCreateEvent, RoomEncryptedAudio, RoomEncryptedFile, RoomEncryptedImage, RoomEncryptedVideo, RoomEncryptionEvent, RoomGuestAccessEvent, RoomHistoryVisibilityEvent,
RoomJoinRulesEvent, RoomMemberEvent, RoomMessageAudio, RoomMessageEmote, RoomMessageFile, RoomMessageImage, RoomMessageNotice, RoomMessageText, RoomMessageUnknown, RoomMessageVideo, RoomMessagesError, RoomNameEvent, RoomTopicEvent, RoomUpgradeEvent, StickerEvent,
from nio import (AsyncClient, CallEvent, JoinError, MatrixRoom, MegolmEvent, PowerLevelsEvent, RedactionEvent,
RoomAvatarEvent, RoomCreateEvent, RoomEncryptedAudio, RoomEncryptedFile, RoomEncryptedImage,
RoomEncryptedVideo, RoomEncryptionEvent, RoomGuestAccessEvent, RoomHistoryVisibilityEvent,
RoomJoinRulesEvent, RoomMemberEvent, RoomMessageAudio, RoomMessageEmote, RoomMessageFile,
RoomMessageImage, RoomMessageNotice, RoomMessageText, RoomMessageUnknown, RoomMessageVideo,
RoomMessagesError, RoomNameEvent, RoomTopicEvent, RoomUpgradeEvent, StickerEvent,
UnknownEncryptedEvent, UnknownEvent)
# SQLite database setup
@ -45,7 +49,7 @@ PASSWORD = "jkldskjldsajklsda"
HOMESERVER = "https://matrix-client.matrix.org"
# Regular expression to match room IDs
ROOM_ID_REGEX = re.compile(r"([!#][A-Za-z0-9]+:[A-Za-z0-9.-]+)")
ROOM_ID_REGEX = re.compile(r"([!#][A-Za-z0-9]+:(?:[\w-]+\.)*[\w-]{1,63}(?:\.(?:\w{3}|\w{2})))")
def handle_room_message(event, room_id, client):
@ -97,7 +101,6 @@ def handle_room_message(event, room_id, client):
# Add any room IDs we find in the message to our database
room_ids = re.findall(ROOM_ID_REGEX, event.body)
for new_room_id in room_ids:
# print(new_room_id, room_id)
if new_room_id not in client.rooms:
print(f'Found a new room: {room_id}')
asyncio.create_task(join_room(client, new_room_id))
@ -157,8 +160,9 @@ def handle_room_message(event, room_id, client):
if event_type:
try:
c.execute("INSERT INTO messages (event_id, room_id, sender, event_type, content, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
(event.event_id, room_id, event.sender, event_type, content, event.server_timestamp))
c.execute(
"INSERT INTO messages (event_id, room_id, sender, event_type, content, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
(event.event_id, room_id, event.sender, event_type, content, event.server_timestamp))
conn.commit()
print(f"Added message: {event.event_id}")
except sqlite3.IntegrityError:
@ -171,8 +175,9 @@ def store_room_members(room_id, room):
for user_id in room.users:
server_hostname = user_id.split(":")[1]
try:
c.execute("INSERT INTO room_members (room_id, user_id, server_hostname, snapshot_timestamp) VALUES (?, ?, ?, ?)",
(room_id, user_id, server_hostname, snapshot_timestamp))
c.execute(
"INSERT INTO room_members (room_id, user_id, server_hostname, snapshot_timestamp) VALUES (?, ?, ?, ?)",
(room_id, user_id, server_hostname, snapshot_timestamp))
conn.commit()
print(f"Added member {user_id} to room {room_id}")
except sqlite3.IntegrityError:
@ -180,13 +185,16 @@ def store_room_members(room_id, room):
async def join_room(client, room_id):
response = await client.join(room_id)
if isinstance(response, JoinError):
print(f"Error while joining room {room_id}: {response.message}")
else:
print(f"Joined room {room_id}")
await client.sync(timeout=30000)
insert_room(client.rooms[room_id])
try:
response = await client.join(room_id)
if isinstance(response, JoinError):
print(f"Error while joining room {room_id}: {response.message}")
else:
print(f"Joined room {room_id}")
await client.sync(timeout=30000)
except Exception as e:
print(f'Failed to join room {room_id}:', vars(e))
insert_room(room_id, topic=None, room_name=None)
def sanitize_room_id(room_id: str):
@ -207,13 +215,13 @@ def sanitize_room_id(room_id: str):
return room_id, server_hostname
def insert_room(room: MatrixRoom):
def insert_room(room_id: str, room_name, topic):
snapshot_timestamp = int(time.time())
room_name = room.display_name
topic = room.topic
room_id, server_hostname = sanitize_room_id(room.room_id)
room_id, server_hostname = sanitize_room_id(room_id)
try:
c.execute("INSERT INTO rooms (room_id, server_hostname, room_name, topic, snapshot_timestamp) VALUES (?, ?, ?, ?, ?)", (room_id, server_hostname, room_name, topic, snapshot_timestamp))
c.execute(
"INSERT INTO rooms (room_id, server_hostname, room_name, topic, snapshot_timestamp) VALUES (?, ?, ?, ?, ?)",
(room_id, server_hostname, room_name, topic, snapshot_timestamp))
conn.commit()
except sqlite3.IntegrityError:
pass
@ -231,10 +239,9 @@ async def crawl_room_history(client, room_id):
if not response.chunk:
break
room = client.rooms[room_id]
store_room_members(room_id, room)
store_room_members(room_id, client.rooms[room_id])
insert_room(room)
insert_room(room_id, topic=client.rooms[room_id].topic, room_name=client.rooms[room_id].name)
for event in response.chunk:
handle_room_message(event, room_id, client)