From cf059a638cc9139f6fda5da23072488d06577071 Mon Sep 17 00:00:00 2001 From: ultrafunkamsterdam Date: Thu, 29 Apr 2021 12:54:49 +0200 Subject: [PATCH] - support for specifying and reusing the user profile folder. if a user-data-dir is specified, that folder will NOT be deleted on exit. example: options.add_argument('--user-data-dir=c:\\temp') - uses a platform specific app data folder to store driver instead of the current workdir. - impoved headless mode. fixed detection by notification perms. - eliminates the "restore tabs" notification at startup - added methods find_elements_by_text and find_element_by_text - updated docs (partly) -known issues: - extensions not running. this is due to the inner workings of chromedriver. still working on this. - driver window is not always closing along with a program exit. - MacOS: startup nag notifications. might be solved by re(using) a profile directory. - known stuff: - some specific use cases, network conditions or behaviour can cause being detected. --- undetected_chromedriver/__init__.py | 2 +- .../tests/test_undetected_chromedriver.py | 21 +- undetected_chromedriver/v2.py | 343 ++++++++++++------ 3 files changed, 240 insertions(+), 126 deletions(-) diff --git a/undetected_chromedriver/__init__.py b/undetected_chromedriver/__init__.py index 6d21854..877e551 100644 --- a/undetected_chromedriver/__init__.py +++ b/undetected_chromedriver/__init__.py @@ -31,7 +31,7 @@ from selenium.webdriver import Chrome as _Chrome from selenium.webdriver import ChromeOptions as _ChromeOptions logger = logging.getLogger(__name__) -__version__ = '2.2.6' +__version__ = "2.2.6" TARGET_VERSION = 0 diff --git a/undetected_chromedriver/tests/test_undetected_chromedriver.py b/undetected_chromedriver/tests/test_undetected_chromedriver.py index 24984ca..1669644 100644 --- a/undetected_chromedriver/tests/test_undetected_chromedriver.py +++ b/undetected_chromedriver/tests/test_undetected_chromedriver.py @@ -7,21 +7,31 @@ from ..v2 import * logging.basicConfig(level=10) -logger = logging.getLogger('TEST') +logger = logging.getLogger("TEST") logger.setLevel(20) +JS_SERIALIZE_FUNCTION = """ +decycle=function(n,e){"use strict";var t=new WeakMap;return function n(o,r){var c,i;return void 0!==e&&(o=e(o)),"object"!=typeof o||null===o||o instanceof Boolean||o instanceof Date||o instanceof Number||o instanceof RegExp||o instanceof String?o:void 0!==(c=t.get(o))?{$ref:c}:(t.set(o,r),Array.isArray(o)?(i=[],o.forEach(function(e,t){i[t]=n(e,r+"["+t+"]")})):(i={},Object.keys(o).forEach(function(e){i[e]=n(o[e],r+"["+JSON.stringify(e)+"]")})),i)}(n,"$")}; +function replacer(t){try{if(Array.prototype.splice.call(t).length<100){let e={};for(let r in t)e[r]=t[r];return e}}catch(t){}} +return decycle(window) +""" + + def test_quick(): import undetected_chromedriver.v2 as uc - print('uc module: ', uc) + print("uc module: ", uc) # options = selenium.webdriver.ChromeOptions() options = uc.ChromeOptions() - options.add_argument('--user-data-dir=c:\\temp') + options.add_argument("--user-data-dir=c:\\temp") options.binary_location = uc.find_chrome_executable() - driver = uc.Chrome(executable_path='./chromedriver.exe', options=options, - service_log_path='c:\\temp\\service.log.txt') + driver = uc.Chrome( + executable_path="./chromedriver.exe", + options=options, + service_log_path="c:\\temp\\service.log.txt", + ) while True: sys.stdin.read() @@ -51,5 +61,6 @@ def test_undetected_chromedriver(): time.sleep(4) # sleep only used for timing of screenshot driver.save_screenshot("datadome.co.png") + # test_quick() # #test_undetected_chromedriver() diff --git a/undetected_chromedriver/v2.py b/undetected_chromedriver/v2.py index df094f3..6093919 100644 --- a/undetected_chromedriver/v2.py +++ b/undetected_chromedriver/v2.py @@ -38,13 +38,13 @@ import logging import os import random import re +import shutil import string import subprocess import sys import tempfile import time import zipfile -import shutil from distutils.version import LooseVersion from urllib.request import urlopen, urlretrieve @@ -64,11 +64,13 @@ logger.setLevel(logging.getLogger().getEffectiveLevel()) def find_chrome_executable(): """ - returns the full path to the chrome _browser binary - may not work if chrome is in a custom folder. + Finds the chrome, chrome beta, chrome canary, chromium executable + + Returns + ------- + executable_path : str + the full file path to found executable - :return: path to chrome executable - :rtype: str """ candidates = set() if IS_POSIX: @@ -95,45 +97,122 @@ def find_chrome_executable(): class Chrome(object): - __doc__ = ( - """\ - -------------------------------------------------------------------------- - NOTE: + """ + Controls the ChromeDriver and allows you to drive the browser. + + The webdriver file will be downloaded by this module automatically, + you do not need to specify this. however, you may if you wish. + + + Attributes + ---------- + + + Methods + ------- + + reconnect() + + this can be useful in case of heavy detection methods + -stops the chromedriver service which runs in the background + -starts the chromedriver service which runs in the background + -recreate session + + + start_session(capabilities=None, browser_profile=None) + + differentiates from the regular method in that it does not + require a capabilities argument. The capabilities are automatically + recreated from the options at creation time. + + + -------------------------------------------------------------------------- + NOTE: Chrome has everything included to work out of the box. it does not `need` customizations. any customizations MAY lead to trigger bot migitation systems. - - -------------------------------------------------------------------------- - """ - + selenium.webdriver.remote.webdriver.WebDriver.__doc__ - ) + + -------------------------------------------------------------------------- + """ _instances = set() def __init__( self, - executable_path="./chromedriver", + executable_path=None, port=0, options=None, service_args=None, desired_capabilities=None, service_log_path=None, keep_alive=True, - keep_user_data_dir=False, log_level=0, + headless=False, emulate_touch=False, + delay=5, ): + """ + Creates a new instance of the chrome driver. - p = Patcher.auto(executable_path=executable_path) - # p.auto(False) + Starts the service and then creates new instance of chrome driver. - self._patcher = p - self.port = port - self.process = None - self.browser_args = None - self._rcount = 0 - self._rdiff = 10 - self.keep_user_data_dir = keep_user_data_dir + # Parameters + # ----------- + # - executable_path - path to the executable. If the default is used it assumes the executable is in the $PATH + # - port - port you would like the service to run, if left as 0, a free port will be found. + # - options - this takes an instance of ChromeOptions + # - service_args - List of args to pass to the driver service + # - desired_capabilities - Dictionary object with non-browser specific + # capabilities only, such as "proxy" or "loggingPref". + # - service_log_path - Where to log information from the driver. + # - chrome_options - Deprecated argument for options + # - keep_alive - Whether to configure ChromeRemoteConnection to use HTTP keep-alive. + + Parameters + ---------- + executable_path: str, optional, default: None - use find_chrome_executable + Path to the executable. If the default is used it assumes the executable is in the $PATH + + port: int, optional, default: 0 + port you would like the service to run, if left as 0, a free port will be found. + + options: ChromeOptions, optional, default: None - automatic useful defaults + this takes an instance of ChromeOptions, mainly to customize browser behavior. + anything other dan the default, for example extensions or startup options + are not supported in case of failure, and can probably lowers your undetectability. + + service_args: list of str, optional, default: None + arguments to pass to the driver service + + desired_capabilities: dict, optional, default: None - auto from config + Dictionary object with non-browser specific capabilities only, such as "proxy" or "loggingPref". + + service_log_path: str, optional, default: None + path to log information from the driver. + + keep_alive: bool, optional, default: True + Whether to configure ChromeRemoteConnection to use HTTP keep-alive. + + log_level: int, optional, default: adapts to python global log level + + headless: bool, optional, default: False + can also be specified in the options instance. + Specify whether you want to use the browser in headless mode. + warning: this lowers undetectability and not fully supported. + + emulate_touch: bool, optional, default: False + if set to True, patches window.maxTouchPoints to always return non-zero + + delay: int, optional, default: 5 + delay in seconds to wait before giving back control. + this is used only when using the context manager + (`with` statement) to bypass, for example CloudFlare. + 5 seconds is a foolproof value. + + """ + + patcher = Patcher(executable_path=executable_path) + patcher.auto() debug_port = selenium.webdriver.common.service.utils.free_port() debug_host = "127.0.0.1" @@ -141,17 +220,7 @@ class Chrome(object): if not options: options = selenium.webdriver.chrome.webdriver.Options() - if not options.debugger_address: - options.debugger_address = "%s:%d" % (debug_host, debug_port) - - if not options.binary_location: - options.binary_location = find_chrome_executable() - - if not desired_capabilities: - desired_capabilities = options.to_capabilities() - - user_data_dir = None - + # see if a custom user profile is specified for arg in options.arguments: if "user-data-dir" in arg: m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg) @@ -160,7 +229,7 @@ class Chrome(object): logger.debug( "user-data-dir found in user argument %s => %s" % (arg, m[1]) ) - self.keep_user_data_dir = True + keep_user_data_dir = True break except IndexError: logger.debug( @@ -169,42 +238,69 @@ class Chrome(object): ) else: user_data_dir = os.path.normpath(tempfile.mkdtemp()) - self.keep_user_data_dir = False - arg = "--user-data-dir=%s" % user_data_dir - options.add_argument(arg) + keep_user_data_dir = False + options.add_argument("--user-data-dir=%s" % user_data_dir) logger.debug( "created a temporary folder in which the user-data (profile) will be stored during this\n" "session, and added it to chrome startup arguments: %s" % arg ) + + if not options.debugger_address: + options.debugger_address = "%s:%d" % (debug_host, debug_port) + + if not options.binary_location: + options.binary_location = find_chrome_executable() + + self._delay = delay + self.user_data_dir = user_data_dir + self.keep_user_data_dir = keep_user_data_dir + + if headless or options.headless: + options.headless = True + options.add_argument("--window-size=1920,1080") + options.add_argument("--start-maximized") + + options.add_argument("--remote-debugging-host=%s " % debug_host) + options.add_argument("--remote-debugging-port=%s" % debug_port) + options.add_argument( + "--log-level=%d" % log_level + or divmod(logging.getLogger().getEffectiveLevel(), 10)[0] + ) self.options = options - extra_args = options.arguments + # fix exit_type flag to prevent tab-restore nag + try: + with open( + os.path.join(user_data_dir, "Default/Preferences"), + encoding="latin1", + mode="r+", + ) as fs: + import json - if options.headless: - extra_args.append("--headless") - extra_args.append("--window-size=1920,1080") - - self.browser_args = [ - options.binary_location, - "--remote-debugging-host=%s" % debug_host, - "--remote-debugging-port=%s" % debug_port, - "--log-level=%d" % log_level - or divmod(logging.getLogger().getEffectiveLevel(), 10)[0], - *extra_args, - ] + config = json.load(fs) + if config["profile"]["exit_type"] is not None: + # fixing the restore-tabs-nag + config["profile"]["exit_type"] = None + fs.seek(0, 0) + fs.write(json.dumps(config, indent=4)) + logger.debug("fixed exit_type flag") + except Exception as e: + logger.debug("did not find a bad exit_type flag ") self.browser = subprocess.Popen( - self.browser_args, - # close_fds="win32" in sys.platform, + [options.binary_location, *options.arguments], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) + if not desired_capabilities: + desired_capabilities = options.to_capabilities() + self.webdriver = selenium.webdriver.chrome.webdriver.WebDriver( - # executable_path=p.executable_path, + executable_path=patcher.executable_path, port=port, options=options, service_args=service_args, @@ -214,6 +310,16 @@ class Chrome(object): ) if options.headless: + if emulate_touch: + self.execute_cdp_cmd( + "Page.addScriptToEvaluateOnNewDocument", + { + "source": """ + Object.defineProperty(navigator, 'maxTouchPoints', { + get: () => 1 + })""" + }, + ) orig_get = self.webdriver.get @@ -237,29 +343,29 @@ class Chrome(object): : target[key] }) }); - - Object.defineProperty(Notification, "permission", { - configurable: true, - enumerable: true, - get: () => { - return "unknown" - }, - }); """ }, ) - logger.info("removing headless from user-agent string") + logger.info("removing headless from user-agent string") - self.execute_cdp_cmd( - "Network.setUserAgentOverride", - { - "userAgent": self.execute_script( - "return navigator.userAgent" - ).replace("Headless", "") - }, - ) - logger.info("fixing notifications permission in headless browsers") + self.execute_cdp_cmd( + "Network.setUserAgentOverride", + { + "userAgent": self.execute_script( + "return navigator.userAgent" + ).replace("Headless", "") + }, + ) + self.execute_cdp_cmd( + "Page.addScriptToEvaluateOnNewDocument", + { + "source": """ + // fix Notification permission in headless mode + Object.defineProperty(Notification, 'permission', { get: () => "default"}); + """ + }, + ) if emulate_touch: self.execute_cdp_cmd( @@ -287,50 +393,27 @@ class Chrome(object): def __dir__(self): return object.__dir__(self) + object.__dir__(self.webdriver) + def reconnect(self): + try: + self.service.stop() + except Exception as e: + logger.debug(e) + + try: + self.service.start() + except Exception as e: + logger.debug(e) + + try: + self.start_session() + except Exception as e: + logger.debug(e) + def start_session(self, capabilities=None, browser_profile=None): if not capabilities: capabilities = self.options.to_capabilities() self.webdriver.start_session(capabilities, browser_profile) - # def get_in(self, url: str, delay=2, factor=1): - # """ - # :param url: str - # :param delay: int - # :param factor: disconnect seconds after .get() - # too low will disconnect before get() fired. - # - # ================================================= - # - # In case you are being detected by some sophisticated - # algorithm, and you are the kind that hates losing, - # this might be your friend. - # - # this currently works for hCaptcha based systems - # (this includes CloudFlare!), and also passes many - # custom setups (eg: ticketmaster.com), - # - # - # Once you are past the first challenge, a cookie is saved - # which (in my tests) also worked for other sites, and lasted - # my entire session! However, to play safe, i'd recommend to just - # call it once for every new site/domain you navigate to. - # - # NOTE: mileage may vary! - # bad behaviour can still be detected, and this program does not - # magically "fix" a flagged ip. - # - # please don't spam issues on github! first look if the issue - # is not already reported. - # """ - # try: - # self.get(url) - # finally: - # self.service.stop() - # # threading.Timer(factor or self.factor, self.close).start() - # time.sleep(delay or self.delay) - # self.service.start() - # self.start_session() - # def quit(self): logger.debug("closing webdriver") try: @@ -353,7 +436,9 @@ class Chrome(object): except FileNotFoundError: pass except PermissionError: - logger.debug("permission error. files are still in use/locked. retying...") + logger.debug( + "permission error. files are still in use/locked. retying..." + ) else: break time.sleep(1) @@ -366,14 +451,29 @@ class Chrome(object): def __exit__(self, exc_type, exc_val, exc_tb): self.service.stop() - # threading.Timer(self.factor, self.service.start).start() - time.sleep(self.delay) + time.sleep(self._delay) self.service.start() self.start_session() def __hash__(self): return hash(self.options.debugger_address) + def find_elements_by_text(self, text: str): + for elem in self.find_elements_by_css_selector("*"): + try: + if text.lower() in elem.text.lower(): + yield elem + except Exception as e: + logger.debug("find_elements_by_text: %s" % e) + + def find_element_by_text(self, text: str): + for elem in self.find_elements_by_css_selector("*"): + try: + if text.lower() in elem.text.lower(): + return elem + except Exception as e: + logger.debug("find_elements_by_text: %s" % e) + class Patcher(object): url_repo = "https://chromedriver.storage.googleapis.com" @@ -430,7 +530,7 @@ class Patcher(object): self.version_full = None @classmethod - def auto(cls, executable_path="./chromedriver", force=False): + def auto(cls, executable_path=None, force=False): """ Args: @@ -514,9 +614,7 @@ class Patcher(object): with zipfile.ZipFile(fp, mode="r") as zf: zf.extract(self.exe_name, os.path.dirname(self.executable_path)) - # os.rename(self.zip_path, self.executable_path) os.remove(fp) - os.chmod(self.executable_path, 0o755) return self.executable_path @@ -575,8 +673,13 @@ class Patcher(object): linect += 1 return linect + def __repr__(self): + return "{0:s}({1:s})".format( + self.__class__.__name__, + self.executable_path, + ) + -# class ChromeOptions(selenium.webdriver.chrome.webdriver.Options): class ChromeOptions(_ChromeOptions): def add_extension_file_crx(self, extension=None): if extension: