diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml new file mode 100644 index 0000000..c50660e --- /dev/null +++ b/.github/workflows/workflow.yml @@ -0,0 +1,51 @@ + + +name: Python package + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.8", "3.9", "3.10","3.11"] + + steps: + - uses: actions/checkout@v3 + - name: Setup Chrome + uses: browser-actions/setup-chrome@v1.2.0 + with: + chrome-version: stable + - name: set chrome in path + run: | + echo "/opt/hostedtoolcache/chromium/stable/x64" >> $GITHUB_PATH + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + - name: Install package + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; else pip install -U . ; fi + - name: run example + run: | + python example/test_workflow.py + - name: Upload a Build Artifact + uses: actions/upload-artifact@v3.1.2 + with: + # Artifact name + name: screenshots + # A file, directory or wildcard pattern that describes what to upload + path: /home/runner/work/_temp/*p* + + + + + diff --git a/README.md b/README.md index d405c9c..61b17de 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,22 @@ Optimized Selenium Chromedriver patch which does not trigger anti-bot services l Automatically downloads the driver binary and patches it. * Tested until current chrome beta versions -* Works also on Brave Browser and many other Chromium based browsers, some tweaking +* Works also on Brave Browser and many other Chromium based browsers, but you need to know what you're doing and needs some tweaking. 
* Python 3.6++** + +## Installation ## + +``` +pip install undetected-chromedriver +``` +or, if you're feeling adventurous, install directly from GitHub + +``` +pip install git+https://www.github.com/ultrafunkamsterdam/undetected-chromedriver@master # replace @master with @branchname for other branches +``` + + - - - ## Message for all ## I will be putting limits on the issue tracker. It has beeen abused too long. @@ -17,6 +30,33 @@ any good news? Yes, i've opened [Undetected-Discussions](https://github.com/ultrafunkamsterdam/undetected-chromedriver/discussions) which i think will help us better in the long run. - - - +What this is not +--- +**THIS PACKAGE DOES NOT, and I repeat DOES NOT hide your IP address, so when running from a datacenter (even smaller ones), chances are high you will not pass! Also, if your IP reputation at home is low, you won't pass!** + +Running the following code from home, and from a datacenter. +```python +import undetected_chromedriver as uc +driver = uc.Chrome(headless=True,use_subprocess=False) +driver.get('https://nowsecure.nl') +driver.save_screenshot('nowsecure.png') +``` +
+ + +
+ + + + + +## 3.5.0 ## +- selenium 4.10 caused some issues. 3.5.0 is compatible and has selenium 4.9 or above pinned. I can't support <4.9 any longer. +- Removed some kwargs from constructor: service_args, service_creationflags, service_log_path. +- added find_elements_recursive generator function. which is more of a convenience funtion as lots of websites seem to serve different content from different frames, making it hard + to use find_elements + + ## 3.4.5 ## - What a week. Had the recent advancedments in Automation-Detection algorithms pwned (so i thought) with 3.4.0, but apparently, for some OS-es this caused an error when interacting with elements. Had to revert back using a different method, fix bugs, and now eventually was still able to stick to the initial idea (+ fixing bugs) - Update to chrome 110 caused another surprise, this time for HEADLESS users. @@ -143,11 +183,7 @@ the solution is simple: **newsflash: https://github.com/ultrafunkamsterdam/undetected-chromedriver/pull/255** -## Installation ## -``` -pip install undetected-chromedriver -``` ## Usage ## diff --git a/example/example.py b/example/example.py index 622975a..48fde5e 100644 --- a/example/example.py +++ b/example/example.py @@ -1,10 +1,13 @@ import time +import logging +logging.basicConfig(level=10) from selenium.common.exceptions import WebDriverException from selenium.webdriver.remote.webdriver import By import selenium.webdriver.support.expected_conditions as EC # noqa from selenium.webdriver.support.wait import WebDriverWait + import undetected_chromedriver as uc @@ -164,7 +167,8 @@ def main(args=None): print("lets go to UC project page") driver.get("https://www.github.com/ultrafunkamsterdam/undetected-chromedriver") - input("press a key if you have RTFM") + + sleep(2) driver.quit() diff --git a/example/test_workflow.py b/example/test_workflow.py new file mode 100644 index 0000000..e4d3914 --- /dev/null +++ b/example/test_workflow.py @@ -0,0 +1,121 @@ +# coding: utf-8 + +import time 
+import logging +import os +from selenium.webdriver.support.wait import WebDriverWait +import selenium.webdriver.support.expected_conditions as EC +from selenium.common.exceptions import TimeoutException +import undetected_chromedriver as uc +from pathlib import Path + + +logging.basicConfig(level=10) +logger = logging.getLogger('test') + +def main(): + + #### + # this block is a dirty helper since + # in the action runner devices serveral chrome versions exists + # and i need to ensure it takes the one which is installed + # by the task. + #### + + for k,v in os.environ.items(): + logger.info("%s = %s" % (k,v)) + logger.info('==== END ENV ==== ') + tmp = Path('/tmp').resolve() + + for item in tmp.rglob('**'): + logger.info('found %s ' % item) + + if item.is_dir(): + if 'chrome-' in item.name: + + logger.info('adding %s to PATH' % str(item)) + logger.info('current PATH: %s' % str(os.environ.get('PATH'))) + path_list = os.environ['PATH'].split(os.pathsep) + path_list.insert(0, str(item)) + os.environ['PATH'] = os.pathsep.join(path_list) + logger.info('new PATH %s:' % str(os.environ.get('PATH'))) + browser_executable_path = str(item / 'chrome') + break + + #### + # test really starts here + #3## + + + driver = uc.Chrome(headless=True, browser_executable_path=browser_executable_path) + logging.getLogger().setLevel(10) + + driver.get('chrome://version') + + driver.save_screenshot('/home/runner/work/_temp/versioninfo.png') + + driver.get('chrome://settings/help') + driver.save_screenshot('/home/runner/work/_temp/helpinfo.png') + + driver.get('https://www.google.com') + driver.save_screenshot('/home/runner/work/_temp/google.com.png') + + driver.get('https://bot.incolumitas.com/#botChallenge') + + pdfdata = driver.execute_cdp_cmd('Page.printToPDF', {}) + if pdfdata: + if 'data' in pdfdata: + data = pdfdata['data'] + import base64 + buffer = base64.b64decode(data) + with open('/home/runner/work/_temp/report.pdf', 'w+b') as f: + f.write(buffer) + + 
driver.get('https://www.nowsecure.nl') + + logger.info('current url %s' % driver.current_url) + + try: + WebDriverWait(driver,15).until(EC.title_contains('moment')) + except TimeoutException: + pass + + logger.info('current page source:\n%s' % driver.page_source) + + logger.info('current url %s' % driver.current_url) + + try: + WebDriverWait(driver,15).until(EC.title_contains('nowSecure')) + logger.info('PASSED CLOUDFLARE!') + + except TimeoutException: + logger.info('timeout') + print(driver.current_url) + + logger.info('current page source:\n%s\n' % driver.page_source) + + #logger.info('trying to save a screenshot via imgur') + + driver.save_screenshot('/home/runner/work/_temp/nowsecure.png') + + #driver.get('https://imgur.com/upload') + + #driver.find_element('css selector', 'input').send_keys('/home/runner/work/_temp/nowsecure.png') + + #time.sleep(1) + #logger.info('current url %s' % driver.current_url) + #time.sleep(1) + #logger.info(f'A SCREENSHOT IS SAVED ON {driver.current_url} <<< if this ends onlywith /upload than it failed. 
after all we are running from a datacenter no human being would ever surf the internet from ') + #time.sleep(5) + + driver.quit() + + + + + + + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index 12b1d87..aaba3a7 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ setup( version=version, packages=["undetected_chromedriver"], install_requires=[ - "selenium>=4.0.0", + "selenium>=4.9.0", "requests", "websockets", ], diff --git a/undetected_chromedriver/__init__.py b/undetected_chromedriver/__init__.py index ca28b43..2af11bc 100644 --- a/undetected_chromedriver/__init__.py +++ b/undetected_chromedriver/__init__.py @@ -17,7 +17,7 @@ by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam) from __future__ import annotations -__version__ = "3.4.6" +__version__ = "3.5.0" import json import logging @@ -33,7 +33,7 @@ from weakref import finalize import selenium.webdriver.chrome.service import selenium.webdriver.chrome.webdriver from selenium.webdriver.common.by import By -import selenium.webdriver.common.service +import selenium.webdriver.chromium.service import selenium.webdriver.remote.command import selenium.webdriver.remote.webdriver @@ -109,11 +109,11 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): browser_executable_path=None, port=0, enable_cdp_events=False, - service_args=None, - service_creationflags=None, + # service_args=None, + # service_creationflags=None, desired_capabilities=None, advanced_elements=False, - service_log_path=None, + # service_log_path=None, keep_alive=True, log_level=0, headless=False, @@ -123,6 +123,7 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): use_subprocess=True, debug=False, no_sandbox=True, + user_multi_procs: bool = False, **kw, ): """ @@ -234,6 +235,14 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): uses the --no-sandbox option, and additionally does suppress the "unsecure option" status bar this option has a default of True since many people seem to 
run this as root (....) , and chrome does not start when running as root without using --no-sandbox flag. + + user_multi_procs: + set to true when you are using multithreads/multiprocessing + ensures not all processes are trying to modify a binary which is in use by another. + for this to work. YOU MUST HAVE AT LEAST 1 UNDETECTED_CHROMEDRIVER BINARY IN YOUR ROAMING DATA FOLDER. + this requirement can be easily satisfied, by just running this program "normal" and close/kill it. + + """ finalize(self, self._ensure_close, self) @@ -242,8 +251,11 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): executable_path=driver_executable_path, force=patcher_force_close, version_main=version_main, + user_multi_procs=user_multi_procs, ) + # self.patcher.auto(user_multiprocess = user_multi_num_procs) self.patcher.auto() + # self.patcher = patcher if not options: options = ChromeOptions() @@ -371,9 +383,15 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): options.arguments.extend(["--no-sandbox", "--test-type"]) if headless or options.headless: - if self.patcher.version_main < 108: - options.add_argument("--headless=chrome") - elif self.patcher.version_main >= 108: + #workaround until a better checking is found + try: + if self.patcher.version_main < 108: + options.add_argument("--headless=chrome") + elif self.patcher.version_main >= 108: + options.add_argument("--headless=new") + except: + logger.warning("could not detect version_main." 
+ "therefore, we are assuming it is chrome 108 or higher") options.add_argument("--headless=new") options.add_argument("--window-size=1920,1080") @@ -427,28 +445,13 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): ) self.browser_pid = browser.pid - if keep_alive: - service_args.append('--keep_alive') if service_args else service_args = ['--keep_alive'] - - if service_creationflags: - service = selenium.webdriver.common.service.Service( - self.patcher.executable_path, port, service_args, service_log_path - ) - for attr_name in ("creationflags", "creation_flags"): - if hasattr(service, attr_name): - setattr(service, attr_name, service_creationflags) - break - else: - service = selenium.webdriver.chrome.service.Service( - self.patcher.executable_path - ) + service = selenium.webdriver.chromium.service.ChromiumService( + self.patcher.executable_path + ) super(Chrome, self).__init__( - port=port, - options=options, - service_args=service_args, - service_log_path=service_log_path, - service=service, # needed or the service will be re-created + service=service, + options=options ) self.reactor = None @@ -704,10 +707,45 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): if not capabilities: capabilities = self.options.to_capabilities() super(selenium.webdriver.chrome.webdriver.WebDriver, self).start_session( - capabilities, browser_profile + capabilities ) # super(Chrome, self).start_session(capabilities, browser_profile) + def find_elements_recursive(self, by, value): + """ + find elements in all frames + this is a generator function, which is needed + since if it would return a list of elements, they + will be stale on arrival. 
+ using generator, when the element is returned we are in the correct frame + to use it directly + Args: + by: By + value: str + Returns: Generator[webelement.WebElement] + """ + def search_frame(f=None): + if not f: + # ensure we are on main content frame + self.switch_to.default_content() + else: + self.switch_to.frame(f) + for elem in self.find_elements(by, value): + yield elem + # switch back to main content, otherwise we will get StaleElementReferenceException + self.switch_to.default_content() + + # search root frame + for elem in search_frame(): + yield elem + # get iframes + frames = self.find_elements('css selector', 'iframe') + + # search per frame + for f in frames: + for elem in search_frame(f): + yield elem + def quit(self): try: self.service.process.kill() @@ -723,7 +761,7 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver): os.kill(self.browser_pid, 15) logger.debug("gracefully closed browser") except Exception as e: # noqa - logger.debug(e, exc_info=True) + pass if ( hasattr(self, "keep_user_data_dir") and hasattr(self, "user_data_dir") @@ -842,5 +880,7 @@ def find_chrome_executable(): ): candidates.add(os.sep.join((item, subitem, "chrome.exe"))) for candidate in candidates: + logger.debug('checking if %s exists and is executable' % candidate) if os.path.exists(candidate) and os.access(candidate, os.X_OK): + logger.debug('found! 
using %s' % candidate) return os.path.normpath(candidate) diff --git a/undetected_chromedriver/patcher.py b/undetected_chromedriver/patcher.py index 24da802..d083dc3 100644 --- a/undetected_chromedriver/patcher.py +++ b/undetected_chromedriver/patcher.py @@ -5,15 +5,17 @@ from distutils.version import LooseVersion import io import logging import os +import pathlib import random import re +import shutil import string import sys import time from urllib.request import urlopen from urllib.request import urlretrieve import zipfile - +from multiprocessing import Lock logger = logging.getLogger(__name__) @@ -21,6 +23,7 @@ IS_POSIX = sys.platform.startswith(("darwin", "cygwin", "linux", "linux2")) class Patcher(object): + lock = Lock() url_repo = "https://chromedriver.storage.googleapis.com" zip_name = "chromedriver_%s.zip" exe_name = "chromedriver%s" @@ -48,7 +51,13 @@ class Patcher(object): d = "~/.undetected_chromedriver" data_path = os.path.abspath(os.path.expanduser(d)) - def __init__(self, executable_path=None, force=False, version_main: int = 0): + def __init__( + self, + executable_path=None, + force=False, + version_main: int = 0, + user_multi_procs=False, + ): """ Args: executable_path: None = automatic @@ -61,6 +70,7 @@ class Patcher(object): self.force = force self._custom_exe_path = False prefix = "undetected" + self.user_multi_procs = user_multi_procs if not os.path.exists(self.data_path): os.makedirs(self.data_path, exist_ok=True) @@ -78,17 +88,41 @@ class Patcher(object): self.zip_path = os.path.join(self.data_path, prefix) if not executable_path: - self.executable_path = os.path.abspath( - os.path.join(".", self.executable_path) - ) + if not self.user_multi_procs: + self.executable_path = os.path.abspath( + os.path.join(".", self.executable_path) + ) if executable_path: self._custom_exe_path = True self.executable_path = executable_path + self.version_main = version_main self.version_full = None - def auto(self, executable_path=None, force=False, 
version_main=None): + def auto(self, executable_path=None, force=False, version_main=None, _=None): + """ + + Args: + executable_path: + force: + version_main: + + Returns: + + """ + # if self.user_multi_procs and \ + # self.user_multi_procs != -1: + # # -1 being a skip value used later in this block + # + p = pathlib.Path(self.data_path) + with Lock(): + files = list(p.rglob("*chromedriver*?")) + for file in files: + if self.is_binary_patched(file): + self.executable_path = str(file) + return True + if executable_path: self.executable_path = executable_path self._custom_exe_path = True @@ -127,6 +161,49 @@ class Patcher(object): self.unzip_package(self.fetch_package()) return self.patch() + def driver_binary_in_use(self, path: str = None) -> bool: + """ + naive test to check if a found chromedriver binary is + currently in use + + Args: + path: a string or PathLike object to the binary to check. + if not specified, we check use this object's executable_path + """ + if not path: + path = self.executable_path + p = pathlib.Path(path) + + if not p.exists(): + raise OSError("file does not exist: %s" % p) + try: + with open(p, mode="a+b") as fs: + exc = [] + try: + + fs.seek(0, 0) + except PermissionError as e: + exc.append(e) # since some systems apprently allow seeking + # we conduct another test + try: + fs.readline() + except PermissionError as e: + exc.append(e) + + if exc: + + return True + return False + # ok safe to assume this is in use + except Exception as e: + # logger.exception("whoops ", e) + pass + + def cleanup_unused_files(self): + p = pathlib.Path(self.data_path) + items = list(p.glob("*undetected*")) + print(items) + def patch(self): self.patch_exe() return self.is_binary_patched() @@ -255,21 +332,17 @@ class Patcher(object): else: timeout = 3 # stop trying after this many seconds t = time.monotonic() - while True: - now = time.monotonic() - if now - t > timeout: - # we don't want to wait until the end of time - logger.debug( - "could not unlink %s 
in time (%d seconds)" - % (self.executable_path, timeout) - ) - break + now = lambda: time.monotonic() + while now() - t > timeout: + # we don't want to wait until the end of time try: + if self.user_multi_procs: + break os.unlink(self.executable_path) logger.debug("successfully unlinked %s" % self.executable_path) break except (OSError, RuntimeError, PermissionError): - time.sleep(0.1) + time.sleep(0.01) continue except FileNotFoundError: break