diff --git a/.gitignore b/.gitignore
index 309e3a1..5c222a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 .idea
+found-keys*.txt
+
 
 # ---> Python
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index e650915..49cc2c8 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
 # hf-key-scraper
 
-Scrape for OpenAI keys on Hugging Face
\ No newline at end of file
+Scrape for OpenAI keys on Hugging Face.
+
+This requires at least one Squid proxy to function. Define your proxies in `settings.py` -> `PROXIES`.
diff --git a/huggingface/check-keys.py b/huggingface/check-keys.py
index 3ae0035..358fe06 100644
--- a/huggingface/check-keys.py
+++ b/huggingface/check-keys.py
@@ -1,8 +1,9 @@
 import argparse
-import concurrent
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 
+from tqdm import tqdm
+
 from keychecker.check import check_key
 
 
@@ -17,26 +18,29 @@ def process_key(key):
 
 def main():
     parser = argparse.ArgumentParser(description='Scrape data from chub.ai.')
-    parser.add_argument('input_file', help='Path to the file containing the keys.')
+    parser.add_argument('input_file', nargs='*', help='Path to the file containing the keys.')
    args = parser.parse_args()
 
-    input_file = Path(args.input_file).resolve().absolute()
-
-    if not input_file.exists():
-        print('Input file does not exist:', input_file)
-        quit(1)
-
-    with open(input_file) as f:
-        content = set(f.read().splitlines())
+    keys = set()
+    for file in args.input_file:
+        input_file = Path(file).resolve().expanduser().absolute()
+        if not input_file.exists():
+            print('Input file does not exist:', input_file)
+            quit(1)
+        data = set(input_file.read_text().splitlines())
+        keys = keys | data
 
     # content = ['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']
 
+    print('Checking', len(keys), 'keys...')
+
     gpt_4 = set()
     gpt_4_32k = set()
     gpt_3 = set()
 
+    pbar = tqdm(total=len(keys))
     with ThreadPoolExecutor(max_workers=50) as executor:
-        results = executor.map(process_key, content)
+        results = executor.map(process_key, keys)
         for result in results:
             if result is not None:
                 if result['has_gpt_4']:
@@ -45,6 +49,8 @@ def main():
                     gpt_4_32k.add(result['api_key'])
                 if result['has_only_turbo']:
                     gpt_3.add(result['api_key'])
+            pbar.update(1)
+    pbar.close()
 
     print('')
     print('GPT4 KEYS:')
diff --git a/huggingface/huggingface/settings.py b/huggingface/huggingface/settings.py
index 0f63c92..6bc19e7 100644
--- a/huggingface/huggingface/settings.py
+++ b/huggingface/huggingface/settings.py
@@ -92,10 +92,7 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"
 
 PROXIES = [
-    "http://172.0.4.7:3128",
-    "http://172.0.4.8:3128",
-    "http://172.0.4.10:3128",
-    "http://172.0.4.12:3128",
+    "http://127.0.0.1:3128"
 ]
 
 LOG_LEVEL = 'CRITICAL'
diff --git a/huggingface/huggingface/spiders/huggingface.py b/huggingface/huggingface/spiders/huggingface.py
index ff36b84..60972d7 100644
--- a/huggingface/huggingface/spiders/huggingface.py
+++ b/huggingface/huggingface/spiders/huggingface.py
@@ -31,5 +31,5 @@ class HuggingfaceSpider(CrawlSpider):
         text = response.text
         matches = self.regex.findall(text)
         if matches:
-            self.logger.error(f"{matches}")
+            self.logger.debug(f"{matches}")
             return FoundItem(url=response.url, matches=matches)
diff --git a/huggingface/huggingface/spiders/search.py b/huggingface/huggingface/spiders/search.py
index 70b7286..2b1a018 100644
--- a/huggingface/huggingface/spiders/search.py
+++ b/huggingface/huggingface/spiders/search.py
@@ -4,14 +4,14 @@ import time
 
 import coloredlogs
 import scrapy
-
+from string import ascii_lowercase, ascii_uppercase
 from ..pipelines import FoundItem
 
 
 class HFSearchSpider(scrapy.Spider):
     name = "huggingface_search"
     allowed_domains = ["huggingface.co"]
-    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in ['a', 'h', 'm', 'q'] for j in range(10)]
+    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant'] for j in range(10)]
     logger = logging.getLogger('HFSearchSpider')
     logger.setLevel(logging.DEBUG)
     coloredlogs.install(logger=logger, level=logging.DEBUG)
@@ -27,5 +27,5 @@ class HFSearchSpider(scrapy.Spider):
         for key in keys:
             found_keys.add(''.join(key))
         if len(found_keys):
-            self.logger.error(f"{found_keys}")
+            self.logger.debug(f"{found_keys}")
             yield FoundItem(url=response.url, matches=found_keys)