- support for specifying and reusing the user profile folder.

if a user-data-dir is specified, that folder will NOT be
    deleted on exit.
    example:
        options.add_argument('--user-data-dir=c:\\temp')

- uses a platform specific app data folder to store driver instead
    of the current workdir.

- improved headless mode. fixed detection via the notification permission check.

- eliminates the "restore tabs" notification at startup

- added methods find_elements_by_text and find_element_by_text

- updated docs (partly)

- known issues:
    - extensions not running. this is due to the inner workings
        of chromedriver. still working on this.
    - driver window is not always closing along with a program exit.
    - MacOS: startup nag notifications. this might be solved by
        (re)using a profile directory.

- known limitations:
    - some specific use cases, network conditions or behaviour
      can still lead to being detected.
This commit is contained in:
ultrafunkamsterdam 2021-04-29 12:54:49 +02:00
parent b40d23c649
commit cf059a638c
3 changed files with 240 additions and 126 deletions

View File

@ -31,7 +31,7 @@ from selenium.webdriver import Chrome as _Chrome
from selenium.webdriver import ChromeOptions as _ChromeOptions from selenium.webdriver import ChromeOptions as _ChromeOptions
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
__version__ = '2.2.6' __version__ = "2.2.6"
TARGET_VERSION = 0 TARGET_VERSION = 0

View File

@ -7,21 +7,31 @@ from ..v2 import *
logging.basicConfig(level=10) logging.basicConfig(level=10)
logger = logging.getLogger('TEST') logger = logging.getLogger("TEST")
logger.setLevel(20) logger.setLevel(20)
JS_SERIALIZE_FUNCTION = """
decycle=function(n,e){"use strict";var t=new WeakMap;return function n(o,r){var c,i;return void 0!==e&&(o=e(o)),"object"!=typeof o||null===o||o instanceof Boolean||o instanceof Date||o instanceof Number||o instanceof RegExp||o instanceof String?o:void 0!==(c=t.get(o))?{$ref:c}:(t.set(o,r),Array.isArray(o)?(i=[],o.forEach(function(e,t){i[t]=n(e,r+"["+t+"]")})):(i={},Object.keys(o).forEach(function(e){i[e]=n(o[e],r+"["+JSON.stringify(e)+"]")})),i)}(n,"$")};
function replacer(t){try{if(Array.prototype.splice.call(t).length<100){let e={};for(let r in t)e[r]=t[r];return e}}catch(t){}}
return decycle(window)
"""
def test_quick(): def test_quick():
import undetected_chromedriver.v2 as uc import undetected_chromedriver.v2 as uc
print('uc module: ', uc) print("uc module: ", uc)
# options = selenium.webdriver.ChromeOptions() # options = selenium.webdriver.ChromeOptions()
options = uc.ChromeOptions() options = uc.ChromeOptions()
options.add_argument('--user-data-dir=c:\\temp') options.add_argument("--user-data-dir=c:\\temp")
options.binary_location = uc.find_chrome_executable() options.binary_location = uc.find_chrome_executable()
driver = uc.Chrome(executable_path='./chromedriver.exe', options=options, driver = uc.Chrome(
service_log_path='c:\\temp\\service.log.txt') executable_path="./chromedriver.exe",
options=options,
service_log_path="c:\\temp\\service.log.txt",
)
while True: while True:
sys.stdin.read() sys.stdin.read()
@ -51,5 +61,6 @@ def test_undetected_chromedriver():
time.sleep(4) # sleep only used for timing of screenshot time.sleep(4) # sleep only used for timing of screenshot
driver.save_screenshot("datadome.co.png") driver.save_screenshot("datadome.co.png")
# test_quick() # test_quick()
# #test_undetected_chromedriver() # #test_undetected_chromedriver()

View File

@ -38,13 +38,13 @@ import logging
import os import os
import random import random
import re import re
import shutil
import string import string
import subprocess import subprocess
import sys import sys
import tempfile import tempfile
import time import time
import zipfile import zipfile
import shutil
from distutils.version import LooseVersion from distutils.version import LooseVersion
from urllib.request import urlopen, urlretrieve from urllib.request import urlopen, urlretrieve
@ -64,11 +64,13 @@ logger.setLevel(logging.getLogger().getEffectiveLevel())
def find_chrome_executable(): def find_chrome_executable():
""" """
returns the full path to the chrome _browser binary Finds the chrome, chrome beta, chrome canary, chromium executable
may not work if chrome is in a custom folder.
Returns
-------
executable_path : str
the full file path to found executable
:return: path to chrome executable
:rtype: str
""" """
candidates = set() candidates = set()
if IS_POSIX: if IS_POSIX:
@ -95,8 +97,35 @@ def find_chrome_executable():
class Chrome(object): class Chrome(object):
__doc__ = ( """
"""\ Controls the ChromeDriver and allows you to drive the browser.
The webdriver file will be downloaded by this module automatically,
you do not need to specify this. however, you may if you wish.
Attributes
----------
Methods
-------
reconnect()
this can be useful in case of heavy detection methods
-stops the chromedriver service which runs in the background
-starts the chromedriver service which runs in the background
-recreate session
start_session(capabilities=None, browser_profile=None)
differentiates from the regular method in that it does not
require a capabilities argument. The capabilities are automatically
recreated from the options at creation time.
-------------------------------------------------------------------------- --------------------------------------------------------------------------
NOTE: NOTE:
Chrome has everything included to work out of the box. Chrome has everything included to work out of the box.
@ -105,35 +134,85 @@ class Chrome(object):
-------------------------------------------------------------------------- --------------------------------------------------------------------------
""" """
+ selenium.webdriver.remote.webdriver.WebDriver.__doc__
)
_instances = set() _instances = set()
def __init__( def __init__(
self, self,
executable_path="./chromedriver", executable_path=None,
port=0, port=0,
options=None, options=None,
service_args=None, service_args=None,
desired_capabilities=None, desired_capabilities=None,
service_log_path=None, service_log_path=None,
keep_alive=True, keep_alive=True,
keep_user_data_dir=False,
log_level=0, log_level=0,
headless=False,
emulate_touch=False, emulate_touch=False,
delay=5,
): ):
"""
Creates a new instance of the chrome driver.
p = Patcher.auto(executable_path=executable_path) Starts the service and then creates new instance of chrome driver.
# p.auto(False)
self._patcher = p # Parameters
self.port = port # -----------
self.process = None # - executable_path - path to the executable. If the default is used it assumes the executable is in the $PATH
self.browser_args = None # - port - port you would like the service to run, if left as 0, a free port will be found.
self._rcount = 0 # - options - this takes an instance of ChromeOptions
self._rdiff = 10 # - service_args - List of args to pass to the driver service
self.keep_user_data_dir = keep_user_data_dir # - desired_capabilities - Dictionary object with non-browser specific
# capabilities only, such as "proxy" or "loggingPref".
# - service_log_path - Where to log information from the driver.
# - chrome_options - Deprecated argument for options
# - keep_alive - Whether to configure ChromeRemoteConnection to use HTTP keep-alive.
Parameters
----------
executable_path: str, optional, default: None - use find_chrome_executable
Path to the executable. If the default is used it assumes the executable is in the $PATH
port: int, optional, default: 0
port you would like the service to run, if left as 0, a free port will be found.
options: ChromeOptions, optional, default: None - automatic useful defaults
this takes an instance of ChromeOptions, mainly to customize browser behavior.
anything other dan the default, for example extensions or startup options
are not supported in case of failure, and can probably lowers your undetectability.
service_args: list of str, optional, default: None
arguments to pass to the driver service
desired_capabilities: dict, optional, default: None - auto from config
Dictionary object with non-browser specific capabilities only, such as "proxy" or "loggingPref".
service_log_path: str, optional, default: None
path to log information from the driver.
keep_alive: bool, optional, default: True
Whether to configure ChromeRemoteConnection to use HTTP keep-alive.
log_level: int, optional, default: adapts to python global log level
headless: bool, optional, default: False
can also be specified in the options instance.
Specify whether you want to use the browser in headless mode.
warning: this lowers undetectability and not fully supported.
emulate_touch: bool, optional, default: False
if set to True, patches window.maxTouchPoints to always return non-zero
delay: int, optional, default: 5
delay in seconds to wait before giving back control.
this is used only when using the context manager
(`with` statement) to bypass, for example CloudFlare.
5 seconds is a foolproof value.
"""
patcher = Patcher(executable_path=executable_path)
patcher.auto()
debug_port = selenium.webdriver.common.service.utils.free_port() debug_port = selenium.webdriver.common.service.utils.free_port()
debug_host = "127.0.0.1" debug_host = "127.0.0.1"
@ -141,17 +220,7 @@ class Chrome(object):
if not options: if not options:
options = selenium.webdriver.chrome.webdriver.Options() options = selenium.webdriver.chrome.webdriver.Options()
if not options.debugger_address: # see if a custom user profile is specified
options.debugger_address = "%s:%d" % (debug_host, debug_port)
if not options.binary_location:
options.binary_location = find_chrome_executable()
if not desired_capabilities:
desired_capabilities = options.to_capabilities()
user_data_dir = None
for arg in options.arguments: for arg in options.arguments:
if "user-data-dir" in arg: if "user-data-dir" in arg:
m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg) m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg)
@ -160,7 +229,7 @@ class Chrome(object):
logger.debug( logger.debug(
"user-data-dir found in user argument %s => %s" % (arg, m[1]) "user-data-dir found in user argument %s => %s" % (arg, m[1])
) )
self.keep_user_data_dir = True keep_user_data_dir = True
break break
except IndexError: except IndexError:
logger.debug( logger.debug(
@ -169,42 +238,69 @@ class Chrome(object):
) )
else: else:
user_data_dir = os.path.normpath(tempfile.mkdtemp()) user_data_dir = os.path.normpath(tempfile.mkdtemp())
self.keep_user_data_dir = False keep_user_data_dir = False
arg = "--user-data-dir=%s" % user_data_dir options.add_argument("--user-data-dir=%s" % user_data_dir)
options.add_argument(arg)
logger.debug( logger.debug(
"created a temporary folder in which the user-data (profile) will be stored during this\n" "created a temporary folder in which the user-data (profile) will be stored during this\n"
"session, and added it to chrome startup arguments: %s" % arg "session, and added it to chrome startup arguments: %s" % arg
) )
if not options.debugger_address:
options.debugger_address = "%s:%d" % (debug_host, debug_port)
if not options.binary_location:
options.binary_location = find_chrome_executable()
self._delay = delay
self.user_data_dir = user_data_dir self.user_data_dir = user_data_dir
self.keep_user_data_dir = keep_user_data_dir
if headless or options.headless:
options.headless = True
options.add_argument("--window-size=1920,1080")
options.add_argument("--start-maximized")
options.add_argument("--remote-debugging-host=%s " % debug_host)
options.add_argument("--remote-debugging-port=%s" % debug_port)
options.add_argument(
"--log-level=%d" % log_level
or divmod(logging.getLogger().getEffectiveLevel(), 10)[0]
)
self.options = options self.options = options
extra_args = options.arguments # fix exit_type flag to prevent tab-restore nag
try:
with open(
os.path.join(user_data_dir, "Default/Preferences"),
encoding="latin1",
mode="r+",
) as fs:
import json
if options.headless: config = json.load(fs)
extra_args.append("--headless") if config["profile"]["exit_type"] is not None:
extra_args.append("--window-size=1920,1080") # fixing the restore-tabs-nag
config["profile"]["exit_type"] = None
self.browser_args = [ fs.seek(0, 0)
options.binary_location, fs.write(json.dumps(config, indent=4))
"--remote-debugging-host=%s" % debug_host, logger.debug("fixed exit_type flag")
"--remote-debugging-port=%s" % debug_port, except Exception as e:
"--log-level=%d" % log_level logger.debug("did not find a bad exit_type flag ")
or divmod(logging.getLogger().getEffectiveLevel(), 10)[0],
*extra_args,
]
self.browser = subprocess.Popen( self.browser = subprocess.Popen(
self.browser_args, [options.binary_location, *options.arguments],
# close_fds="win32" in sys.platform,
stdin=subprocess.PIPE, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
) )
if not desired_capabilities:
desired_capabilities = options.to_capabilities()
self.webdriver = selenium.webdriver.chrome.webdriver.WebDriver( self.webdriver = selenium.webdriver.chrome.webdriver.WebDriver(
# executable_path=p.executable_path, executable_path=patcher.executable_path,
port=port, port=port,
options=options, options=options,
service_args=service_args, service_args=service_args,
@ -214,6 +310,16 @@ class Chrome(object):
) )
if options.headless: if options.headless:
if emulate_touch:
self.execute_cdp_cmd(
"Page.addScriptToEvaluateOnNewDocument",
{
"source": """
Object.defineProperty(navigator, 'maxTouchPoints', {
get: () => 1
})"""
},
)
orig_get = self.webdriver.get orig_get = self.webdriver.get
@ -237,14 +343,6 @@ class Chrome(object):
: target[key] : target[key]
}) })
}); });
Object.defineProperty(Notification, "permission", {
configurable: true,
enumerable: true,
get: () => {
return "unknown"
},
});
""" """
}, },
) )
@ -259,7 +357,15 @@ class Chrome(object):
).replace("Headless", "") ).replace("Headless", "")
}, },
) )
logger.info("fixing notifications permission in headless browsers") self.execute_cdp_cmd(
"Page.addScriptToEvaluateOnNewDocument",
{
"source": """
// fix Notification permission in headless mode
Object.defineProperty(Notification, 'permission', { get: () => "default"});
"""
},
)
if emulate_touch: if emulate_touch:
self.execute_cdp_cmd( self.execute_cdp_cmd(
@ -287,50 +393,27 @@ class Chrome(object):
def __dir__(self): def __dir__(self):
return object.__dir__(self) + object.__dir__(self.webdriver) return object.__dir__(self) + object.__dir__(self.webdriver)
def reconnect(self):
    """
    Restart the chromedriver service and open a new session.

    Three steps are run in order: stop the service, start the service,
    start a session. Every step is best-effort: an exception raised by
    one step is logged at debug level and the following step still runs.
    """
    # the calls are wrapped in lambdas so that attribute lookups happen
    # inside the try block, exactly like the direct calls would
    steps = (
        lambda: self.service.stop(),
        lambda: self.service.start(),
        lambda: self.start_session(),
    )
    for step in steps:
        try:
            step()
        except Exception as exc:
            logger.debug(exc)
def start_session(self, capabilities=None, browser_profile=None): def start_session(self, capabilities=None, browser_profile=None):
if not capabilities: if not capabilities:
capabilities = self.options.to_capabilities() capabilities = self.options.to_capabilities()
self.webdriver.start_session(capabilities, browser_profile) self.webdriver.start_session(capabilities, browser_profile)
# def get_in(self, url: str, delay=2, factor=1):
# """
# :param url: str
# :param delay: int
# :param factor: disconnect <factor> seconds after .get()
# too low will disconnect before get() fired.
#
# =================================================
#
# In case you are being detected by some sophisticated
# algorithm, and you are the kind that hates losing,
# this might be your friend.
#
# this currently works for hCaptcha based systems
# (this includes CloudFlare!), and also passes many
# custom setups (eg: ticketmaster.com),
#
#
# Once you are past the first challenge, a cookie is saved
# which (in my tests) also worked for other sites, and lasted
# my entire session! However, to play safe, i'd recommend to just
# call it once for every new site/domain you navigate to.
#
# NOTE: mileage may vary!
# bad behaviour can still be detected, and this program does not
# magically "fix" a flagged ip.
#
# please don't spam issues on github! first look if the issue
# is not already reported.
# """
# try:
# self.get(url)
# finally:
# self.service.stop()
# # threading.Timer(factor or self.factor, self.close).start()
# time.sleep(delay or self.delay)
# self.service.start()
# self.start_session()
#
def quit(self): def quit(self):
logger.debug("closing webdriver") logger.debug("closing webdriver")
try: try:
@ -353,7 +436,9 @@ class Chrome(object):
except FileNotFoundError: except FileNotFoundError:
pass pass
except PermissionError: except PermissionError:
logger.debug("permission error. files are still in use/locked. retying...") logger.debug(
"permission error. files are still in use/locked. retying..."
)
else: else:
break break
time.sleep(1) time.sleep(1)
@ -366,14 +451,29 @@ class Chrome(object):
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
self.service.stop() self.service.stop()
# threading.Timer(self.factor, self.service.start).start() time.sleep(self._delay)
time.sleep(self.delay)
self.service.start() self.service.start()
self.start_session() self.start_session()
def __hash__(self): def __hash__(self):
return hash(self.options.debugger_address) return hash(self.options.debugger_address)
def find_elements_by_text(self, text: str):
    """
    Lazily yield every element whose `.text` contains `text`.

    Each element returned by find_elements_by_css_selector("*") is
    checked with a case-insensitive substring match.

    Parameters
    ----------
    text : str
        the text fragment to look for

    Yields
    ------
    element
        each matching element, in the order the selector returned them
    """
    # lower-case the search text once instead of once per element
    needle = text.lower()
    for elem in self.find_elements_by_css_selector("*"):
        try:
            # reading .text may fail for an individual element; skip that one
            elem_text = elem.text.lower()
        except Exception as e:
            logger.debug("find_elements_by_text: %s", e)
            continue
        # yield outside the try block, so an exception thrown into the
        # generator by the consumer is not swallowed by the handler above
        if needle in elem_text:
            yield elem
def find_element_by_text(self, text: str):
    """
    Return the first element whose `.text` contains `text`.

    Each element returned by find_elements_by_css_selector("*") is
    checked, in order, with a case-insensitive substring match.

    Parameters
    ----------
    text : str
        the text fragment to look for

    Returns
    -------
    element or None
        the first matching element, or None when nothing matches
    """
    # lower-case the search text once instead of once per element
    needle = text.lower()
    for elem in self.find_elements_by_css_selector("*"):
        try:
            # reading .text may fail for an individual element; skip that one
            elem_text = elem.text.lower()
        except Exception as e:
            # log under this method's own name (was mislabelled as the plural one)
            logger.debug("find_element_by_text: %s", e)
            continue
        if needle in elem_text:
            return elem
    return None
class Patcher(object): class Patcher(object):
url_repo = "https://chromedriver.storage.googleapis.com" url_repo = "https://chromedriver.storage.googleapis.com"
@ -430,7 +530,7 @@ class Patcher(object):
self.version_full = None self.version_full = None
@classmethod @classmethod
def auto(cls, executable_path="./chromedriver", force=False): def auto(cls, executable_path=None, force=False):
""" """
Args: Args:
@ -514,9 +614,7 @@ class Patcher(object):
with zipfile.ZipFile(fp, mode="r") as zf: with zipfile.ZipFile(fp, mode="r") as zf:
zf.extract(self.exe_name, os.path.dirname(self.executable_path)) zf.extract(self.exe_name, os.path.dirname(self.executable_path))
# os.rename(self.zip_path, self.executable_path)
os.remove(fp) os.remove(fp)
os.chmod(self.executable_path, 0o755) os.chmod(self.executable_path, 0o755)
return self.executable_path return self.executable_path
@ -575,8 +673,13 @@ class Patcher(object):
linect += 1 linect += 1
return linect return linect
def __repr__(self):
    """
    Return ``ClassName(executable_path)`` for debugging output.

    The ``!s`` conversion is used instead of the ``s`` format spec, so an
    executable_path that is not a str (for example None) is rendered
    rather than raising TypeError. Output for str values is unchanged.
    """
    return "{0!s}({1!s})".format(
        self.__class__.__name__,
        self.executable_path,
    )
# class ChromeOptions(selenium.webdriver.chrome.webdriver.Options):
class ChromeOptions(_ChromeOptions): class ChromeOptions(_ChromeOptions):
def add_extension_file_crx(self, extension=None): def add_extension_file_crx(self, extension=None):
if extension: if extension: