This repository has been archived on 2023-10-20. You can view files and clone it, but cannot push or open issues or pull requests.
hf-key-scraper/huggingface/huggingface/spiders/search.py

32 lines
1.2 KiB
Python

import logging
import re
import time
import coloredlogs
import scrapy
from string import ascii_lowercase, ascii_uppercase
from ..pipelines import FoundItem
class HFSearchSpider(scrapy.Spider):
    """Spider that queries Hugging Face full-text search for ``sk-`` strings.

    Each start URL searches for ``sk-<prefix>`` and the returned page text is
    scanned for tokens made of ``sk-`` plus one character, a closing
    ``</span>`` tag, and 47 further alphanumeric characters. The tag is
    dropped, so every reported match is a 51-character string.
    """

    name = "huggingface_search"
    allowed_domains = ["huggingface.co"]

    # One query per prefix (a-z, A-Z, 0-9, 'etk', 'ant'), each requested as
    # ten pages of 100 results (skip = 0, 100, ..., 900).
    start_urls = [
        'https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(prefix, page * 100)
        for prefix in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant']
        for page in range(10)
    ]

    # Class-level logger with coloured DEBUG output.
    # NOTE(review): scrapy.Spider already exposes a ``logger`` attribute; this
    # assignment takes its place for this spider -- confirm that is intended.
    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    def __init__(self, *args, **kwargs):
        """Initialise the spider and pick a timestamped output filename.

        All arguments are forwarded unchanged to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        # Not read anywhere in this class; presumably consumed by a pipeline
        # (verify against the pipeline code).
        self.filename = f'found-keys-{int(time.time())}.txt'

    def parse(self, response):
        """Scan one search-result page and yield the distinct matches.

        Args:
            response: The response for one of ``start_urls``; its ``status``,
                ``url`` and ``text`` attributes are read.

        Yields:
            A single ``FoundItem`` carrying the page URL and the set of
            matched strings. Nothing is yielded when the page has no match.
        """
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")

        # Group 1 is the 'sk-' head plus one character, group 2 is the
        # 47-character tail; joining them removes the '</span>' in between.
        hits = re.finditer(r'(sk-.)</span>([a-zA-Z0-9]{47})', response.text)
        unique_keys = {hit.group(1) + hit.group(2) for hit in hits}

        if not unique_keys:
            return

        self.logger.debug(f"{unique_keys}")
        yield FoundItem(url=response.url, matches=unique_keys)