import logging
import re
import time

import coloredlogs
from scrapy.http import TextResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from ..pipelines import FoundItem


class HuggingfaceSpider(CrawlSpider):
    """Crawl huggingface.co and report pages whose text matches a key pattern.

    Every link on an allowed domain is followed, and each downloaded page is
    passed to :meth:`parse_start_url`, which scans the page text with
    ``self.regex`` and emits a ``FoundItem`` when at least one match is found.
    """

    name = "huggingface"
    allowed_domains = ["huggingface.co"]
    start_urls = ["https://huggingface.co/"]

    # NOTE: this class attribute replaces the ``logger`` property that Scrapy's
    # ``Spider`` base class normally provides, so ``self.logger`` below is this
    # plain ``logging.Logger`` with coloured output installed on it.
    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    # Follow every link within the allowed domains and scan each fetched page
    # with the same callback that handles the start URLs.
    rules = (
        Rule(
            LinkExtractor(allow_domains=allowed_domains),
            callback='parse_start_url',
            follow=True,
        ),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and its per-run state.

        Args:
            *args: Positional arguments forwarded to ``CrawlSpider``.
            **kwargs: Keyword arguments forwarded to ``CrawlSpider``.
        """
        super().__init__(*args, **kwargs)
        # Literal "sk-" followed by 47 ASCII letters/digits.
        self.regex = re.compile(r'sk-[a-zA-Z0-9]{47}')
        # Timestamped output name; not used in this module -- presumably read
        # by an item pipeline (TODO confirm against ..pipelines).
        self.filename = f'found-keys-site-{int(time.time())}.txt'

    def parse_start_url(self, response):
        """Scan one response for pattern matches.

        Args:
            response: The downloaded Scrapy response.

        Returns:
            A ``FoundItem`` carrying the response URL and the list of matches
            when at least one match is found, otherwise ``None``.
        """
        self.logger.debug("Status: %s, URL: %s", response.status, response.url)

        # Only text responses expose ``.text``; a plain ``Response`` (e.g. a
        # binary download reached by following a link) raises AttributeError
        # on access, so skip it instead of failing the callback.
        if not isinstance(response, TextResponse):
            self.logger.debug("Skipping non-text response: %s", response.url)
            return None

        matches = self.regex.findall(response.text)
        if not matches:
            return None

        self.logger.error("%s", matches)
        return FoundItem(url=response.url, matches=matches)