fix runtime error
This commit is contained in:
parent
b83a5d7814
commit
066af5d3b6
14
spider.py
14
spider.py
|
@ -1,5 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import copy
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
@ -96,7 +97,7 @@ def handle_room_message(event, room_id, client):
|
||||||
# Add any room IDs we find in the message to our database
|
# Add any room IDs we find in the message to our database
|
||||||
room_ids = re.findall(ROOM_ID_REGEX, event.body)
|
room_ids = re.findall(ROOM_ID_REGEX, event.body)
|
||||||
for new_room_id in room_ids:
|
for new_room_id in room_ids:
|
||||||
print(new_room_id, room_id)
|
# print(new_room_id, room_id)
|
||||||
if new_room_id not in client.rooms:
|
if new_room_id not in client.rooms:
|
||||||
print(f'Found a new room: {room_id}')
|
print(f'Found a new room: {room_id}')
|
||||||
asyncio.create_task(join_room(client, new_room_id))
|
asyncio.create_task(join_room(client, new_room_id))
|
||||||
|
@ -154,8 +155,6 @@ def handle_room_message(event, room_id, client):
|
||||||
|
|
||||||
room_id, server_hostname = sanitize_room_id(room_id)
|
room_id, server_hostname = sanitize_room_id(room_id)
|
||||||
|
|
||||||
# print(event.event_id, room_id, event.sender, event_type, content, event.server_timestamp)
|
|
||||||
|
|
||||||
if event_type:
|
if event_type:
|
||||||
try:
|
try:
|
||||||
c.execute("INSERT INTO messages (event_id, room_id, sender, event_type, content, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
|
c.execute("INSERT INTO messages (event_id, room_id, sender, event_type, content, timestamp) VALUES (?, ?, ?, ?, ?, ?)",
|
||||||
|
@ -232,7 +231,6 @@ async def crawl_room_history(client, room_id):
|
||||||
if not response.chunk:
|
if not response.chunk:
|
||||||
break
|
break
|
||||||
|
|
||||||
# Store room members in the database
|
|
||||||
room = client.rooms[room_id]
|
room = client.rooms[room_id]
|
||||||
store_room_members(room_id, room)
|
store_room_members(room_id, room)
|
||||||
|
|
||||||
|
@ -254,12 +252,16 @@ async def main():
|
||||||
# Sync with the server to get the joined rooms
|
# Sync with the server to get the joined rooms
|
||||||
await client.sync(timeout=30000)
|
await client.sync(timeout=30000)
|
||||||
|
|
||||||
|
# We are going to be joining rooms between crawl iterations.
|
||||||
|
# Freezing the joined rooms prevents "RuntimeError: dictionary changed size during iteration"
|
||||||
|
client_rooms_frozen = copy.deepcopy(client.rooms)
|
||||||
|
|
||||||
# Crawl through the history of each joined room
|
# Crawl through the history of each joined room
|
||||||
for room_id in client.rooms:
|
for room_id in client_rooms_frozen:
|
||||||
print(f"Crawling room history: {room_id}")
|
print(f"Crawling room history: {room_id}")
|
||||||
await crawl_room_history(client, room_id)
|
await crawl_room_history(client, room_id)
|
||||||
|
|
||||||
# TODO: parse https://matrix-client.matrix.org/_matrix/client/r0/publicRooms?limit=1000 to get more rooms to crawl
|
# TODO: parse https://{HOMESERVER}/_matrix/client/r0/publicRooms?limit=1000 to get more rooms to crawl
|
||||||
# TODO: thread the crawlers for each room
|
# TODO: thread the crawlers for each room
|
||||||
|
|
||||||
print('Crawl complete!')
|
print('Crawl complete!')
|
||||||
|
|
Loading…
Reference in New Issue