final commit

This commit is contained in:
Cyberes 2023-10-20 18:54:11 -06:00
parent 15759586e4
commit c13c224b36
6 changed files with 27 additions and 20 deletions

2
.gitignore vendored
View File

@ -1,4 +1,6 @@
.idea
found-keys*.txt
# ---> Python
# Byte-compiled / optimized / DLL files

View File

@ -1,3 +1,5 @@
# hf-key-scraper
Scrape for OpenAI keys on Hugging Face
Scrape for OpenAI keys on Hugging Face.
This requires at least one Squid proxy to function. Define your proxies in the `PROXIES` list in `settings.py`.

View File

@ -1,8 +1,9 @@
import argparse
import concurrent
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from tqdm import tqdm
from keychecker.check import check_key
@ -17,26 +18,29 @@ def process_key(key):
def main():
parser = argparse.ArgumentParser(description='Scrape data from chub.ai.')
parser.add_argument('input_file', help='Path to the file containing the keys.')
parser.add_argument('input_file', nargs='*', help='Path to the file containing the keys.')
args = parser.parse_args()
input_file = Path(args.input_file).resolve().absolute()
if not input_file.exists():
print('Input file does not exist:', input_file)
quit(1)
with open(input_file) as f:
content = set(f.read().splitlines())
keys = set()
for file in args.input_file:
input_file = Path(file).resolve().expanduser().absolute()
if not input_file.exists():
print('Input file does not exist:', input_file)
quit(1)
data = set(input_file.read_text().splitlines())
keys = keys | data
# content = ['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']
print('Checking', len(keys), 'keys...')
gpt_4 = set()
gpt_4_32k = set()
gpt_3 = set()
pbar = tqdm(total=len(keys))
with ThreadPoolExecutor(max_workers=50) as executor:
results = executor.map(process_key, content)
results = executor.map(process_key, keys)
for result in results:
if result is not None:
if result['has_gpt_4']:
@ -45,6 +49,8 @@ def main():
gpt_4_32k.add(result['api_key'])
if result['has_only_turbo']:
gpt_3.add(result['api_key'])
pbar.update(1)
pbar.close()
print('')
print('GPT4 KEYS:')

View File

@ -92,10 +92,7 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
PROXIES = [
"http://172.0.4.7:3128",
"http://172.0.4.8:3128",
"http://172.0.4.10:3128",
"http://172.0.4.12:3128",
"http://127.0.0.1:3128"
]
LOG_LEVEL = 'CRITICAL'

View File

@ -31,5 +31,5 @@ class HuggingfaceSpider(CrawlSpider):
text = response.text
matches = self.regex.findall(text)
if matches:
self.logger.error(f"{matches}")
self.logger.debug(f"{matches}")
return FoundItem(url=response.url, matches=matches)

View File

@ -4,14 +4,14 @@ import time
import coloredlogs
import scrapy
from string import ascii_lowercase, ascii_uppercase
from ..pipelines import FoundItem
class HFSearchSpider(scrapy.Spider):
name = "huggingface_search"
allowed_domains = ["huggingface.co"]
start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in ['a', 'h', 'm', 'q'] for j in range(10)]
start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant'] for j in range(10)]
logger = logging.getLogger('HFSearchSpider')
logger.setLevel(logging.DEBUG)
coloredlogs.install(logger=logger, level=logging.DEBUG)
@ -27,5 +27,5 @@ class HFSearchSpider(scrapy.Spider):
for key in keys:
found_keys.add(''.join(key))
if len(found_keys):
self.logger.error(f"{found_keys}")
self.logger.debug(f"{found_keys}")
yield FoundItem(url=response.url, matches=found_keys)