This repository has been archived on 2023-10-20. You can view files and clone it, but cannot push or open issues or pull requests.
hf-key-scraper/huggingface/huggingface/spiders/search.py

32 lines
1.2 KiB
Python
Raw Normal View History

2023-07-23 18:11:48 -06:00
import logging
import re
import time
import coloredlogs
import scrapy
2023-10-20 18:54:11 -06:00
from string import ascii_lowercase, ascii_uppercase
2023-07-23 18:11:48 -06:00
from ..pipelines import FoundItem
class HFSearchSpider(scrapy.Spider):
    """Crawl huggingface.co full-text search for strings that start with ``sk-``.

    Each start URL searches for ``sk-`` followed by one search term and
    requests one page of 100 results.  ``parse`` extracts key-shaped matches
    from every response and yields them as a ``FoundItem``.
    """

    name = "huggingface_search"
    allowed_domains = ["huggingface.co"]

    # One URL per (term, page) pair, ordered term-major.  The terms are the
    # 26 lowercase letters, the 26 uppercase letters, the digits 0-9, and the
    # two literal strings 'etk' and 'ant' (64 terms).  Each term is fetched
    # for 10 pages, with ``skip`` going 0, 100, ..., 900 (640 URLs in total).
    start_urls = [
        'https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(term, page * 100)
        for term in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant']
        for page in range(10)
    ]

    # Dedicated DEBUG-level logger with coloured output, set up once when the
    # class body is executed.
    # NOTE(review): scrapy.Spider also exposes a ``logger`` attribute; this
    # class attribute takes its place -- confirm that is intended.
    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and choose a timestamped output file name.

        All positional and keyword arguments are forwarded unchanged to the
        base class.
        """
        super().__init__(*args, **kwargs)
        # Whole-second Unix timestamp in the file name.  ``filename`` is not
        # read anywhere in this class; presumably another component (e.g. an
        # item pipeline) uses it -- verify against the pipeline code.
        self.filename = f'found-keys-{int(time.time())}.txt'

    def parse(self, response):
        """Extract key-shaped strings from one search-result response.

        Logs the response status and URL, then yields a single ``FoundItem``
        carrying the response URL and the set of distinct matches.  Nothing is
        yielded when the response contains no match.
        """
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")

        # The pattern captures ``sk-`` plus exactly one character, requires a
        # literal ``</span>`` right after it, and then captures exactly 47
        # alphanumeric characters.  Joining the two groups drops the
        # ``</span>``; the set removes duplicates within the page.
        # NOTE(review): only one character is allowed between ``sk-`` and
        # ``</span>``; whether the multi-character search terms ('etk', 'ant')
        # can ever match depends on the page markup -- confirm.
        found_keys = {
            head + tail
            for head, tail in re.findall(r'(sk-.)</span>([a-zA-Z0-9]{47})', response.text)
        }
        if not found_keys:
            return

        self.logger.debug(f"{found_keys}")
        yield FoundItem(url=response.url, matches=found_keys)