fix runtime error
This commit is contained in:
parent
b83a5d7814
commit
066af5d3b6
14
spider.py
14
spider.py
|
@ -1,5 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import copy
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
|
@ -96,7 +97,7 @@ def handle_room_message(event, room_id, client):
|
|||
# Add any room IDs we find in the message to our database
|
||||
room_ids = re.findall(ROOM_ID_REGEX, event.body)
|
||||
for new_room_id in room_ids:
|
||||
print(new_room_id, room_id)
|
||||
# print(new_room_id, room_id)
|
||||
if new_room_id not in client.rooms:
|
||||
print(f'Found a new room: {room_id}')
|
||||
asyncio.create_task(join_room(client, new_room_id))
|
||||
|
@ -154,8 +155,6 @@ def handle_room_message(event, room_id, client):
|
|||
|
||||
room_id, server_hostname = sanitize_room_id(room_id)
|
||||
|
||||
# print(event.event_id, room_id, event.sender, event_type, content, event.server_timestamp)
|
||||
|
||||
if event_type:
|
||||
try:
|
||||
c.execute("INSERT INTO messages (event_id, room_id, sender, event_type, content, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
|
||||
|
@ -232,7 +231,6 @@ async def crawl_room_history(client, room_id):
|
|||
if not response.chunk:
|
||||
break
|
||||
|
||||
# Store room members in the database
|
||||
room = client.rooms[room_id]
|
||||
store_room_members(room_id, room)
|
||||
|
||||
|
@ -254,12 +252,16 @@ async def main():
|
|||
# Sync with the server to get the joined rooms
|
||||
await client.sync(timeout=30000)
|
||||
|
||||
# We are going to be joining rooms between crawl iterations.
|
||||
# Freezing the joined rooms prevents "RuntimeError: dictionary changed size during iteration"
|
||||
client_rooms_frozen = copy.deepcopy(client.rooms)
|
||||
|
||||
# Crawl through the history of each joined room
|
||||
for room_id in client.rooms:
|
||||
for room_id in client_rooms_frozen:
|
||||
print(f"Crawling room history: {room_id}")
|
||||
await crawl_room_history(client, room_id)
|
||||
|
||||
# TODO: parse https://matrix-client.matrix.org/_matrix/client/r0/publicRooms?limit=1000 to get more rooms to crawl
|
||||
# TODO: parse https://{HOMESERVER}/_matrix/client/r0/publicRooms?limit=1000 to get more rooms to crawl
|
||||
# TODO: thread the crawlers for each room
|
||||
|
||||
print('Crawl complete!')
|
||||
|
|
Loading…
Reference in New Issue