import logging
import re
import time
from string import ascii_lowercase, ascii_uppercase

import coloredlogs
import scrapy

from ..pipelines import FoundItem


class HFSearchSpider(scrapy.Spider):
    """Spider that scans Hugging Face full-text search pages for ``sk-`` strings.

    Each start URL queries ``sk-<suffix>`` with ``limit=100`` and a ``skip``
    offset. Every response body is searched for strings made of ``sk-``, one
    arbitrary character, and 47 ASCII alphanumerics. Pages with at least one
    match produce a single ``FoundItem``.
    """

    name = "huggingface_search"
    allowed_domains = ["huggingface.co"]

    # One query per suffix (a-z, A-Z, 0-9, plus 'etk' and 'ant'), each fetched
    # at ten offsets: skip = 0, 100, ..., 900.
    start_urls = [
        'https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100)
        for i in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant']
        for j in range(10)
    ]

    # Class-level logger with colored DEBUG output installed at class creation.
    # NOTE(review): this attribute replaces the ``logger`` that scrapy.Spider
    # normally provides for the spider -- confirm that is intended.
    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    # Compiled once instead of on every parse() call. The two groups are
    # adjacent and cover the whole match, so joining them yields the full text.
    _KEY_PATTERN = re.compile(r'(sk-.)([a-zA-Z0-9]{47})')

    def __init__(self, *args, **kwargs):
        """Initialise the spider and pick a timestamped output file name.

        Args:
            *args: Positional arguments forwarded to ``scrapy.Spider``.
            **kwargs: Keyword arguments forwarded to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        # Not read anywhere in this class; presumably consumed by a pipeline
        # -- verify against the pipelines module.
        self.filename = f'found-keys-{int(time.time())}.txt'

    def parse(self, response):
        """Extract candidate keys from one search-result page.

        Args:
            response: The response for one of the search URLs; its ``status``,
                ``url`` and ``text`` attributes are read.

        Yields:
            FoundItem: One item carrying the page URL and the set of distinct
            matched strings. Nothing is yielded when the page has no match.
        """
        self.logger.debug("Status: %s, URL: %s", response.status, response.url)

        # A set collapses strings that appear more than once on the page.
        found_keys = {
            ''.join(parts) for parts in self._KEY_PATTERN.findall(response.text)
        }

        if found_keys:
            self.logger.debug("%s", found_keys)
            yield FoundItem(url=response.url, matches=found_keys)