"""Scrapy CrawlSpider that scans huggingface.co pages for leaked OpenAI-style API keys."""
import logging
|
|
import re
|
|
import time
|
|
|
|
import coloredlogs
|
|
from scrapy.linkextractors import LinkExtractor
|
|
from scrapy.spiders import CrawlSpider, Rule
|
|
|
|
from ..pipelines import FoundItem
|
|
|
|
|
|
class HuggingfaceSpider(CrawlSpider):
|
|
name = "huggingface"
|
|
allowed_domains = ["huggingface.co"]
|
|
start_urls = ["https://huggingface.co/"]
|
|
logger = logging.getLogger('HFSearchSpider')
|
|
logger.setLevel(logging.DEBUG)
|
|
coloredlogs.install(logger=logger, level=logging.DEBUG)
|
|
|
|
rules = (
|
|
Rule(LinkExtractor(allow_domains=allowed_domains), callback='parse_start_url', follow=True),
|
|
)
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
super(HuggingfaceSpider, self).__init__(*args, **kwargs)
|
|
self.regex = re.compile(r'sk-[a-zA-Z0-9]{47}')
|
|
self.filename = f'found-keys-site-{int(time.time())}.txt'
|
|
|
|
def parse_start_url(self, response):
|
|
self.logger.debug(f"Status: {response.status}, URL: {response.url}")
|
|
text = response.text
|
|
matches = self.regex.findall(text)
|
|
if matches:
|
|
self.logger.debug(f"{matches}")
|
|
return FoundItem(url=response.url, matches=matches)
|