This repository has been archived on 2023-10-20. You can view files and clone it, but cannot push or open issues or pull requests.
hf-key-scraper/huggingface/huggingface/spiders/huggingface.py

36 lines
1.1 KiB
Python

import logging
import re
import time
import coloredlogs
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..pipelines import FoundItem
class HuggingfaceSpider(CrawlSpider):
    """Crawl huggingface.co and report pages whose text matches a token pattern.

    Each response routed to ``parse_start_url`` is searched for the literal
    prefix ``sk-`` followed by exactly 47 ASCII letters or digits. A page with
    at least one match produces a ``FoundItem``; a page without matches
    produces nothing.
    """

    name = "huggingface"
    allowed_domains = ["huggingface.co"]
    start_urls = ["https://huggingface.co/"]

    # Class-level logger at DEBUG level with coloredlogs installed on it;
    # shared by every instance of the spider.
    # NOTE(review): scrapy.Spider exposes its own `logger`; this attribute
    # takes its place on this class - confirm that is intended.
    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    # One rule: extract links on the allowed domain, keep following them, and
    # hand each resulting response to parse_start_url.
    rules = (
        Rule(
            LinkExtractor(allow_domains=allowed_domains),
            callback='parse_start_url',
            follow=True,
        ),
    )

    def __init__(self, *args, **kwargs):
        """Pass all arguments through to ``CrawlSpider`` and set per-run state."""
        super().__init__(*args, **kwargs)
        # Compiled once here and reused for every response.
        self.regex = re.compile(r'sk-[a-zA-Z0-9]{47}')
        # Name carries the start time in whole seconds. It is not read
        # anywhere in this class - presumably consumed by the item pipeline;
        # verify against ..pipelines.
        self.filename = f'found-keys-site-{int(time.time())}.txt'

    def parse_start_url(self, response):
        """Search one response body for pattern matches.

        Args:
            response: The response to inspect; its ``status``, ``url`` and
                ``text`` attributes are read.

        Returns:
            A ``FoundItem`` holding the page URL and the list of matched
            strings, or ``None`` when the page contains no match.
        """
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")
        hits = self.regex.findall(response.text)
        if not hits:
            return None
        # Matches are reported at ERROR level before the item is returned.
        self.logger.error(f"{hits}")
        return FoundItem(url=response.url, matches=hits)