"""Scrapy CrawlSpider that scans huggingface.co pages for leaked OpenAI-style API keys."""
import logging
|
|
import re
|
|
import time
|
|
|
|
import coloredlogs
|
|
from scrapy.linkextractors import LinkExtractor
|
|
from scrapy.spiders import CrawlSpider, Rule
|
|
|
|
from ..pipelines import FoundItem
|
|
|
|
|
|
class HuggingfaceSpider(CrawlSpider):
|
|
name = "huggingface"
|
|
allowed_domains = ["huggingface.co"]
|
|
start_urls = ["https://huggingface.co/"]
|
|
logger = logging.getLogger('HFSearchSpider')
|
|
logger.setLevel(logging.DEBUG)
|
|
coloredlogs.install(logger=logger, level=logging.DEBUG)
|
|
|
|
rules = (
|
|
Rule(LinkExtractor(allow_domains=allowed_domains), callback='parse_start_url', follow=True),
|
|
)
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
super(HuggingfaceSpider, self).__init__(*args, **kwargs)
|
|
self.regex = re.compile(r'sk-[a-zA-Z0-9]{47}')
|
|
self.filename = f'found-keys-site-{int(time.time())}.txt'
|
|
|
|
def parse_start_url(self, response):
|
|
self.logger.debug(f"Status: {response.status}, URL: {response.url}")
|
|
text = response.text
|
|
matches = self.regex.findall(text)
|
|
if matches:
|
|
self.logger.debug(f"{matches}")
|
|
return FoundItem(url=response.url, matches=matches)
|