32 lines
1.1 KiB
Python
32 lines
1.1 KiB
Python
import logging
|
|
import re
|
|
import time
|
|
|
|
import coloredlogs
|
|
import scrapy
|
|
|
|
from ..pipelines import FoundItem
|
|
|
|
|
|
class HFSearchSpider(scrapy.Spider):
    """Spider that scans Hugging Face full-text search pages for ``sk-`` strings.

    It requests the search endpoint for the queries ``sk-a``, ``sk-h``,
    ``sk-m`` and ``sk-q`` (ten pages of 100 results each) and yields one
    ``FoundItem`` per response that contains at least one pattern match.
    """

    name = "huggingface_search"
    allowed_domains = ["huggingface.co"]

    # Four query suffixes x ten result pages (skip = 0, 100, ..., 900).
    start_urls = [
        'https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(suffix, page * 100)
        for suffix in ['a', 'h', 'm', 'q']
        for page in range(10)
    ]

    # Class-level logger; this assignment replaces whatever ``logger``
    # attribute the base class provides.  The coloredlogs handler is
    # installed once, when the class body is executed (i.e. at import time).
    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    def __init__(self, *args, **kwargs):
        """Initialise the spider and pick a timestamped output file name.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            **kwargs: Keyword arguments forwarded to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        # Not read anywhere in this class; presumably consumed by the item
        # pipeline -- verify against ``pipelines``.
        self.filename = f'found-keys-{int(time.time())}.txt'

    def parse(self, response):
        """Extract pattern matches from one search-result page.

        A match is ``sk-`` plus one character, immediately followed by a
        literal ``</span>`` and then 47 ASCII letters/digits; the two captured
        parts are concatenated (the ``</span>`` itself is dropped).
        NOTE(review): the ``</span>`` is presumably the end of the search-term
        highlight in the page markup -- confirm against a live response.

        Args:
            response: The downloaded page; ``status``, ``url`` and ``text``
                are read from it.

        Yields:
            FoundItem: ``url`` is the response URL and ``matches`` is the set
            of de-duplicated matched strings.  Nothing is yielded when the
            page contains no match.
        """
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")

        # A set de-duplicates strings that occur several times on the page.
        found_keys = {
            hit.group(1) + hit.group(2)
            for hit in re.finditer(r'(sk-.)</span>([a-zA-Z0-9]{47})', response.text)
        }
        if not found_keys:
            return

        # The matched strings are written to the log verbatim, at ERROR level.
        self.logger.error(f"{found_keys}")
        yield FoundItem(url=response.url, matches=found_keys)
|