undetected-chromedriver/undetected_chromedriver/__init__.py

861 lines
33 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
2021-12-23 10:23:25 -07:00
"""
888 888 d8b
888 888 Y8P
888 888
.d8888b 88888b. 888d888 .d88b. 88888b.d88b. .d88b. .d88888 888d888 888 888 888 .d88b. 888d888
d88P" 888 "88b 888P" d88""88b 888 "888 "88b d8P Y8b d88" 888 888P" 888 888 888 d8P Y8b 888P"
888 888 888 888 888 888 888 888 888 88888888 888 888 888 888 Y88 88P 88888888 888
Y88b. 888 888 888 Y88..88P 888 888 888 Y8b. Y88b 888 888 888 Y8bd8P Y8b. 888
"Y8888P 888 888 888 "Y88P" 888 888 888 "Y8888 "Y88888 888 888 Y88P "Y8888 888 88888888
by UltrafunkAmsterdam (https://github.com/ultrafunkamsterdam)
"""
2022-11-29 10:26:11 -07:00
from __future__ import annotations
__version__ = "3.4.7"
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import tempfile
2022-04-04 05:20:25 -06:00
import time
from weakref import finalize
import selenium.webdriver.chrome.service
import selenium.webdriver.chrome.webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.common.service
2022-07-17 03:18:24 -06:00
import selenium.webdriver.remote.command
import selenium.webdriver.remote.webdriver
from .cdp import CDP
2022-04-04 05:20:25 -06:00
from .dprocess import start_detached
from .options import ChromeOptions
from .patcher import IS_POSIX
from .patcher import Patcher
from .reactor import Reactor
from .webelement import UCWebElement
from .webelement import WebElement
__all__ = (
2023-02-07 17:27:50 -07:00
"Chrome",
"ChromeOptions",
"Patcher",
"Reactor",
"CDP",
"find_chrome_executable",
)
2023-02-07 17:27:50 -07:00
logger = logging.getLogger("uc")
logger.setLevel(logging.getLogger().getEffectiveLevel())
2023-02-07 17:27:50 -07:00
class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
"""
Controls the ChromeDriver and allows you to drive the browser.
The webdriver file will be downloaded by this module automatically,
you do not need to specify this. however, you may if you wish.
Attributes
----------
Methods
-------
reconnect()
this can be useful in case of heavy detection methods
-stops the chromedriver service which runs in the background
-starts the chromedriver service which runs in the background
-recreate session
start_session(capabilities=None, browser_profile=None)
differentiates from the regular method in that it does not
require a capabilities argument. The capabilities are automatically
recreated from the options at creation time.
--------------------------------------------------------------------------
NOTE:
Chrome has everything included to work out of the box.
it does not `need` customizations.
any customizations MAY lead to trigger bot migitation systems.
--------------------------------------------------------------------------
"""
2023-02-07 17:27:50 -07:00
_instances = set()
session_id = None
2021-12-22 07:07:27 -07:00
debug = False
2023-02-07 17:27:50 -07:00
def __init__(
2023-02-07 17:27:50 -07:00
self,
options=None,
user_data_dir=None,
driver_executable_path=None,
browser_executable_path=None,
port=0,
enable_cdp_events=False,
service_args=None,
service_creationflags=None,
desired_capabilities=None,
advanced_elements=False,
service_log_path=None,
keep_alive=True,
log_level=0,
headless=False,
version_main=None,
patcher_force_close=False,
suppress_welcome=True,
use_subprocess=True,
debug=False,
no_sandbox=True,
user_multi_procs: bool = False,
2023-02-07 17:27:50 -07:00
**kw,
):
"""
Creates a new instance of the chrome driver.
Starts the service and then creates new instance of chrome driver.
Parameters
----------
2021-12-22 07:07:27 -07:00
Patcher: changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:05:22 -06:00
options: ChromeOptions, optional, default: None - automatic useful defaults
this takes an instance of ChromeOptions, mainly to customize browser behavior.
anything other dan the default, for example extensions or startup options
are not supported in case of failure, and can probably lowers your undetectability.
user_data_dir: str , optional, default: None (creates temp profile)
if user_data_dir is a path to a valid chrome profile directory, use it,
and turn off automatic removal mechanism at exit.
2021-12-22 07:07:27 -07:00
Patcher: changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:05:22 -06:00
driver_executable_path: str, optional, default: None(=downloads and patches new binary)
browser_executable_path: str, optional, default: None - use find_chrome_executable
2021-12-22 07:07:27 -07:00
Path to the browser executable.
If not specified, make sure the executable's folder is in $PATH
port: int, optional, default: 0
port to be used by the chromedriver executable, this is NOT the debugger port.
leave it at 0 unless you know what you are doing.
the default value of 0 automatically picks an available port.
2022-11-29 10:26:11 -07:00
enable_cdp_events: bool, default: False
:: currently for chrome only
this enables the handling of wire messages
when enabled, you can subscribe to CDP events by using:
driver.add_cdp_listener("Network.dataReceived", yourcallback)
# yourcallback is an callable which accepts exactly 1 dict as parameter
3.15 changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:42:41 -06:00
service_args: list of str, optional, default: None
arguments to pass to the driver service
desired_capabilities: dict, optional, default: None - auto from config
Dictionary object with non-browser specific capabilities only, such as "item" or "loggingPref".
3.1.5r2 changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:49:02 -06:00
advanced_elements: bool, optional, default: False
3.15 changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:42:41 -06:00
makes it easier to recognize elements like you know them from html/browser inspection, especially when working
in an interactive environment
default webelement repr:
<selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
advanced webelement repr
<WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)>
note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and print them, it does take a little more time.
service_log_path: str, optional, default: None
path to log information from the driver.
keep_alive: bool, optional, default: True
Whether to configure ChromeRemoteConnection to use HTTP keep-alive.
log_level: int, optional, default: adapts to python global log level
headless: bool, optional, default: False
can also be specified in the options instance.
Specify whether you want to use the browser in headless mode.
warning: this lowers undetectability and not fully supported.
version_main: int, optional, default: None (=auto)
if you, for god knows whatever reason, use
an older version of Chrome. You can specify it's full rounded version number
here. Example: 87 for all versions of 87
patcher_force_close: bool, optional, default: False
instructs the patcher to do whatever it can to access the chromedriver binary
if the file is locked, it will force shutdown all instances.
setting it is not recommended, unless you know the implications and think
you might need it.
2021-12-22 07:07:27 -07:00
suppress_welcome: bool, optional , default: True
a "welcome" alert might show up on *nix-like systems asking whether you want to set
chrome as your default browser, and if you want to send even more data to google.
now, in case you are nag-fetishist, or a diagnostics data feeder to google, you can set this to False.
Note: if you don't handle the nag screen in time, the browser loses it's connection and throws an Exception.
use_subprocess: bool, optional , default: True,
2021-12-24 07:31:51 -07:00
False (the default) makes sure Chrome will get it's own process (so no subprocess of chromedriver.exe or python
This fixes a LOT of issues, like multithreaded run, but mst importantly. shutting corectly after
program exits or using .quit()
you should be knowing what you're doing, and know how python works.
2021-12-24 07:31:51 -07:00
unfortunately, there is always an edge case in which one would like to write an single script with the only contents being:
--start script--
import undetected_chromedriver as uc
d = uc.Chrome()
d.get('https://somesite/')
---end script --
and will be greeted with an error, since the program exists before chrome has a change to launch.
in that case you can set this to `True`. The browser will start via subprocess, and will keep running most of times.
! setting it to True comes with NO support when being detected. !
2022-11-20 14:05:04 -07:00
no_sandbox: bool, optional, default=True
uses the --no-sandbox option, and additionally does suppress the "unsecure option" status bar
this option has a default of True since many people seem to run this as root (....) , and chrome does not start
when running as root without using --no-sandbox flag.
user_multi_procs:
set to true when you are using multithreads/multiprocessing
ensures not all processes are trying to modify a binary which is in use by another.
for this to work. YOU MUST HAVE AT LEAST 1 UNDETECTED_CHROMEDRIVER BINARY IN YOUR ROAMING DATA FOLDER.
this requirement can be easily satisfied, by just running this program "normal" and close/kill it.
2021-12-22 07:07:27 -07:00
"""
2023-02-07 17:27:50 -07:00
finalize(self, self._ensure_close, self)
2021-12-22 07:07:27 -07:00
self.debug = debug
2023-02-05 07:36:31 -07:00
self.patcher = Patcher(
2023-02-07 17:27:50 -07:00
executable_path=driver_executable_path,
force=patcher_force_close,
version_main=version_main,
user_multi_procs=user_multi_procs,
2023-02-07 17:27:50 -07:00
)
# self.patcher.auto(user_multiprocess = user_multi_num_procs)
2023-02-05 07:36:31 -07:00
self.patcher.auto()
2023-02-05 07:36:31 -07:00
# self.patcher = patcher
if not options:
options = ChromeOptions()
2023-02-07 17:27:50 -07:00
try:
2023-02-07 17:27:50 -07:00
if hasattr(options, "_session") and options._session is not None:
# prevent reuse of options,
# as it just appends arguments, not replace them
# you'll get conflicts starting chrome
2023-02-07 17:27:50 -07:00
raise RuntimeError("you cannot reuse the ChromeOptions object")
except AttributeError:
pass
2023-02-07 17:27:50 -07:00
options._session = self
2023-02-07 17:27:50 -07:00
if not options.debugger_address:
2022-08-30 05:46:41 -06:00
debug_port = (
port
if port != 0
else selenium.webdriver.common.service.utils.free_port()
)
2022-06-29 04:07:25 -06:00
debug_host = "127.0.0.1"
2023-02-07 17:27:50 -07:00
options.debugger_address = "%s:%d" % (debug_host, debug_port)
2022-06-29 04:07:25 -06:00
else:
2023-02-07 17:27:50 -07:00
debug_host, debug_port = options.debugger_address.split(":")
debug_port = int(debug_port)
if enable_cdp_events:
2022-08-30 05:46:41 -06:00
options.set_capability(
2023-02-07 17:27:50 -07:00
"goog:loggingPrefs", {"performance": "ALL", "browser": "ALL"}
)
options.add_argument("--remote-debugging-host=%s" % debug_host)
options.add_argument("--remote-debugging-port=%s" % debug_port)
2022-03-16 15:47:48 -06:00
if user_data_dir:
2023-02-07 17:27:50 -07:00
options.add_argument("--user-data-dir=%s" % user_data_dir)
language, keep_user_data_dir = None, bool(user_data_dir)
# see if a custom user profile is specified in options
for arg in options.arguments:
if any([_ in arg for _ in ("--headless", "headless")]):
options.arguments.remove(arg)
options.headless = True
if "lang" in arg:
2023-02-07 17:27:50 -07:00
m = re.search("(?:--)?lang(?:[ =])?(.*)", arg)
try:
2023-02-07 17:27:50 -07:00
language = m[1]
except IndexError:
2023-02-07 17:27:50 -07:00
logger.debug("will set the language to en-US,en;q=0.9")
language = "en-US,en;q=0.9"
2023-02-07 17:27:50 -07:00
if "user-data-dir" in arg:
2023-02-07 17:27:50 -07:00
m = re.search("(?:--)?user-data-dir(?:[ =])?(.*)", arg)
try:
2023-02-07 17:27:50 -07:00
user_data_dir = m[1]
2022-08-30 05:46:41 -06:00
logger.debug(
2023-02-07 17:27:50 -07:00
"user-data-dir found in user argument %s => %s" % (arg, m[1])
)
keep_user_data_dir = True
2023-02-07 17:27:50 -07:00
except IndexError:
logger.debug(
2022-08-30 05:46:41 -06:00
"no user data dir could be extracted from supplied argument %s "
% arg
2023-02-07 17:27:50 -07:00
)
if not user_data_dir:
2021-12-22 07:07:27 -07:00
# backward compatiblity
# check if an old uc.ChromeOptions is used, and extract the user data dir
2023-02-07 17:27:50 -07:00
if hasattr(options, "user_data_dir") and getattr(
options, "user_data_dir", None
):
2021-12-22 07:07:27 -07:00
import warnings
2023-02-07 17:27:50 -07:00
2021-12-22 07:07:27 -07:00
warnings.warn(
"using ChromeOptions.user_data_dir might stop working in future versions."
"use uc.Chrome(user_data_dir='/xyz/some/data') in case you need existing profile folder"
2023-02-07 17:27:50 -07:00
)
options.add_argument("--user-data-dir=%s" % options.user_data_dir)
keep_user_data_dir = True
2022-08-30 05:46:41 -06:00
logger.debug(
"user_data_dir property found in options object: %s" % user_data_dir
2023-02-07 17:27:50 -07:00
)
else:
2023-02-07 17:27:50 -07:00
user_data_dir = os.path.normpath(tempfile.mkdtemp())
keep_user_data_dir = False
arg = "--user-data-dir=%s" % user_data_dir
2023-02-07 17:27:50 -07:00
options.add_argument(arg)
logger.debug(
"created a temporary folder in which the user-data (profile) will be stored during this\n"
"session, and added it to chrome startup arguments: %s" % arg
2023-02-07 17:27:50 -07:00
)
if not language:
try:
import locale
2023-02-07 17:27:50 -07:00
language = locale.getdefaultlocale()[0].replace("_", "-")
except Exception:
pass
if not language:
language = "en-US"
2023-02-07 17:27:50 -07:00
options.add_argument("--lang=%s" % language)
if not options.binary_location:
2022-08-30 05:46:41 -06:00
options.binary_location = (
2023-02-07 17:27:50 -07:00
browser_executable_path or find_chrome_executable()
2022-08-30 05:46:41 -06:00
)
2021-12-22 07:07:27 -07:00
self._delay = 3
self.user_data_dir = user_data_dir
self.keep_user_data_dir = keep_user_data_dir
2023-02-07 17:27:50 -07:00
2021-12-22 07:07:27 -07:00
if suppress_welcome:
2023-02-07 17:27:50 -07:00
options.arguments.extend(["--no-default-browser-check", "--no-first-run"])
if no_sandbox:
2023-02-07 17:27:50 -07:00
options.arguments.extend(["--no-sandbox", "--test-type"])
if headless or options.headless:
2023-06-02 15:04:09 -06:00
#workaround until a better checking is found
options.add_argument("--headless=new")
#if self.patcher.version_main < 108:
# options.add_argument("--headless=chrome")
#elif self.patcher.version_main >= 108:
options.add_argument("--window-size=1920,1080")
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
# fixes "could not connect to chrome" error when running
# on linux using privileged user like root (which i don't recommend)
options.add_argument(
2022-08-30 05:46:41 -06:00
"--log-level=%d" % log_level
2023-02-07 17:27:50 -07:00
or divmod(logging.getLogger().getEffectiveLevel(), 10)[0]
)
if hasattr(options, "handle_prefs"):
options.handle_prefs(user_data_dir)
# fix exit_type flag to prevent tab-restore nag
try:
with open(
2023-02-07 17:27:50 -07:00
os.path.join(user_data_dir, "Default/Preferences"),
encoding="latin1",
mode="r+",
) as fs:
config = json.load(fs)
if config["profile"]["exit_type"] is not None:
# fixing the restore-tabs-nag
2023-02-07 17:27:50 -07:00
config["profile"]["exit_type"] = None
fs.seek(0, 0)
json.dump(config, fs)
2022-04-04 05:22:28 -06:00
fs.truncate() # the file might be shorter
2023-02-07 17:27:50 -07:00
logger.debug("fixed exit_type flag")
except Exception as e:
2023-02-07 17:27:50 -07:00
logger.debug("did not find a bad exit_type flag ")
self.options = options
2023-02-07 17:27:50 -07:00
if not desired_capabilities:
desired_capabilities = options.to_capabilities()
2023-02-07 17:27:50 -07:00
2021-12-24 07:31:51 -07:00
if not use_subprocess:
2022-08-30 05:46:41 -06:00
self.browser_pid = start_detached(
2023-02-07 17:27:50 -07:00
options.binary_location, *options.arguments
)
2021-12-24 07:31:51 -07:00
else:
2021-12-23 10:23:25 -07:00
browser = subprocess.Popen(
2023-02-07 17:27:50 -07:00
[options.binary_location, *options.arguments],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
close_fds=IS_POSIX,
)
2021-12-23 10:23:25 -07:00
self.browser_pid = browser.pid
2023-02-07 17:27:50 -07:00
2022-08-30 05:46:41 -06:00
if service_creationflags:
service = selenium.webdriver.common.service.Service(
2023-02-07 17:27:50 -07:00
self.patcher.executable_path, port, service_args, service_log_path
)
for attr_name in ("creationflags", "creation_flags"):
if hasattr(service, attr_name):
setattr(service, attr_name, service_creationflags)
2022-11-20 14:05:04 -07:00
break
2022-08-30 05:46:41 -06:00
else:
service = None
2023-02-07 17:27:50 -07:00
super(Chrome, self).__init__(
executable_path=self.patcher.executable_path,
port=port,
options=options,
service_args=service_args,
desired_capabilities=desired_capabilities,
service_log_path=service_log_path,
keep_alive=keep_alive,
service=service, # needed or the service will be re-created
)
self.reactor = None
2023-02-07 17:27:50 -07:00
2021-12-22 07:07:27 -07:00
if enable_cdp_events:
if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
2022-08-30 05:46:41 -06:00
logging.getLogger(
"selenium.webdriver.remote.remote_connection"
2023-02-07 17:27:50 -07:00
).setLevel(20)
reactor = Reactor(self)
reactor.start()
self.reactor = reactor
2023-02-07 17:27:50 -07:00
3.15 changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:42:41 -06:00
if advanced_elements:
self._web_element_cls = UCWebElement
else:
3.15 changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:42:41 -06:00
self._web_element_cls = WebElement
2023-02-07 17:27:50 -07:00
if options.headless:
self._configure_headless()
2023-02-07 17:27:50 -07:00
def _configure_headless(self):
orig_get = self.get
2023-02-07 17:27:50 -07:00
logger.info("setting properties for headless")
def get_wrapped(*args, **kwargs):
if self.execute_script("return navigator.webdriver"):
logger.info("patch navigator.webdriver")
self.execute_cdp_cmd(
2023-02-07 17:27:50 -07:00
"Page.addScriptToEvaluateOnNewDocument",
{
"source": """
Object.defineProperty(window, "navigator", {
Object.defineProperty(window, "navigator", {
value: new Proxy(navigator, {
has: (target, key) => (key === "webdriver" ? false : key in target),
get: (target, key) =>
key === "webdriver"
? false
: typeof target[key] === "function"
? target[key].bind(target)
: target[key],
}),
});
"""
2023-02-07 17:27:50 -07:00
},
)
logger.info("patch user-agent string")
self.execute_cdp_cmd(
2023-02-07 17:27:50 -07:00
"Network.setUserAgentOverride",
{
2022-08-30 05:46:41 -06:00
"userAgent": self.execute_script(
"return navigator.userAgent"
2023-02-07 17:27:50 -07:00
).replace("Headless", "")
},
)
self.execute_cdp_cmd(
2023-02-07 17:27:50 -07:00
"Page.addScriptToEvaluateOnNewDocument",
{
"source": """
2022-04-04 05:20:25 -06:00
Object.defineProperty(navigator, 'maxTouchPoints', {get: () => 1});
Object.defineProperty(navigator.connection, 'rtt', {get: () => 100});
// https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/chrome-runtime.js
window.chrome = {
app: {
isInstalled: false,
InstallState: {
DISABLED: 'disabled',
INSTALLED: 'installed',
NOT_INSTALLED: 'not_installed'
},
RunningState: {
CANNOT_RUN: 'cannot_run',
READY_TO_RUN: 'ready_to_run',
RUNNING: 'running'
}
},
runtime: {
OnInstalledReason: {
CHROME_UPDATE: 'chrome_update',
INSTALL: 'install',
SHARED_MODULE_UPDATE: 'shared_module_update',
UPDATE: 'update'
},
OnRestartRequiredReason: {
APP_UPDATE: 'app_update',
OS_UPDATE: 'os_update',
PERIODIC: 'periodic'
},
PlatformArch: {
ARM: 'arm',
ARM64: 'arm64',
MIPS: 'mips',
MIPS64: 'mips64',
X86_32: 'x86-32',
X86_64: 'x86-64'
},
PlatformNaclArch: {
ARM: 'arm',
MIPS: 'mips',
MIPS64: 'mips64',
X86_32: 'x86-32',
X86_64: 'x86-64'
},
PlatformOs: {
ANDROID: 'android',
CROS: 'cros',
LINUX: 'linux',
MAC: 'mac',
OPENBSD: 'openbsd',
WIN: 'win'
},
RequestUpdateCheckStatus: {
NO_UPDATE: 'no_update',
THROTTLED: 'throttled',
UPDATE_AVAILABLE: 'update_available'
}
}
}
// https://github.com/microlinkhq/browserless/blob/master/packages/goto/src/evasions/navigator-permissions.js
if (!window.Notification) {
window.Notification = {
permission: 'denied'
}
}
const originalQuery = window.navigator.permissions.query
window.navigator.permissions.__proto__.query = parameters =>
parameters.name === 'notifications'
? Promise.resolve({ state: window.Notification.permission })
: originalQuery(parameters)
const oldCall = Function.prototype.call
function call() {
return oldCall.apply(this, arguments)
}
Function.prototype.call = call
const nativeToStringFunctionString = Error.toString().replace(/Error/g, 'toString')
const oldToString = Function.prototype.toString
function functionToString() {
if (this === window.navigator.permissions.query) {
return 'function query() { [native code] }'
}
if (this === functionToString) {
return nativeToStringFunctionString
}
return oldCall.call(oldToString, this)
}
// eslint-disable-next-line
Function.prototype.toString = functionToString
"""
2023-02-07 17:27:50 -07:00
},
)
return orig_get(*args, **kwargs)
self.get = get_wrapped
2023-02-07 17:27:50 -07:00
# def _get_cdc_props(self):
# return self.execute_script(
# """
# let objectToInspect = window,
# result = [];
# while(objectToInspect !== null)
# { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
# objectToInspect = Object.getPrototypeOf(objectToInspect); }
#
# return result.filter(i => i.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig))
# """
# )
#
# def _hook_remove_cdc_props(self):
# self.execute_cdp_cmd(
# "Page.addScriptToEvaluateOnNewDocument",
# {
# "source": """
# let objectToInspect = window,
# result = [];
# while(objectToInspect !== null)
# { result = result.concat(Object.getOwnPropertyNames(objectToInspect));
# objectToInspect = Object.getPrototypeOf(objectToInspect); }
# result.forEach(p => p.match(/^([a-zA-Z]){27}(Array|Promise|Symbol)$/ig)
# &&delete window[p]&&console.log('removed',p))
# """
# },
# )
2023-02-07 17:27:50 -07:00
def get(self, url):
# if self._get_cdc_props():
# self._hook_remove_cdc_props()
2023-02-07 17:27:50 -07:00
return super().get(url)
def add_cdp_listener(self, event_name, callback):
2022-08-30 05:46:41 -06:00
if (
2023-02-07 17:27:50 -07:00
self.reactor
and self.reactor is not None
and isinstance(self.reactor, Reactor)
2022-08-30 05:46:41 -06:00
):
2023-02-07 17:27:50 -07:00
self.reactor.add_event_handler(event_name, callback)
return self.reactor.handlers
return False
2023-02-07 17:27:50 -07:00
def clear_cdp_listeners(self):
if self.reactor and isinstance(self.reactor, Reactor):
self.reactor.handlers.clear()
2023-02-07 17:27:50 -07:00
def window_new(self):
2022-07-17 03:18:24 -06:00
self.execute(
2023-02-07 17:27:50 -07:00
selenium.webdriver.remote.command.Command.NEW_WINDOW, {"type": "window"}
)
def tab_new(self, url: str):
"""
this opens a url in a new tab.
apparently, that passes all tests directly!
Parameters
----------
url
Returns
-------
"""
2023-02-07 17:27:50 -07:00
if not hasattr(self, "cdp"):
from .cdp import CDP
2023-02-07 17:27:50 -07:00
cdp = CDP(self.options)
cdp.tab_new(url)
def reconnect(self, timeout=0.1):
try:
self.service.stop()
except Exception as e:
2023-02-07 17:27:50 -07:00
logger.debug(e)
time.sleep(timeout)
try:
self.service.start()
except Exception as e:
2023-02-07 17:27:50 -07:00
logger.debug(e)
try:
self.start_session()
except Exception as e:
2023-02-07 17:27:50 -07:00
logger.debug(e)
def start_session(self, capabilities=None, browser_profile=None):
if not capabilities:
capabilities = self.options.to_capabilities()
2023-02-07 17:27:50 -07:00
super(selenium.webdriver.chrome.webdriver.WebDriver, self).start_session(
capabilities, browser_profile
)
2021-12-22 07:07:27 -07:00
# super(Chrome, self).start_session(capabilities, browser_profile)
2023-02-07 17:27:50 -07:00
def quit(self):
try:
2021-12-23 10:23:25 -07:00
self.service.process.kill()
2023-02-07 17:27:50 -07:00
logger.debug("webdriver process ended")
except (AttributeError, RuntimeError, OSError):
pass
try:
self.reactor.event.set()
2023-02-07 17:27:50 -07:00
logger.debug("shutting down reactor")
except AttributeError:
pass
try:
2023-02-07 17:27:50 -07:00
os.kill(self.browser_pid, 15)
logger.debug("gracefully closed browser")
except Exception as e: # noqa
2023-06-02 14:34:33 -06:00
pass
if (
2023-02-07 17:27:50 -07:00
hasattr(self, "keep_user_data_dir")
and hasattr(self, "user_data_dir")
and not self.keep_user_data_dir
):
2023-02-07 17:27:50 -07:00
for _ in range(5):
try:
2023-02-07 17:27:50 -07:00
shutil.rmtree(self.user_data_dir, ignore_errors=False)
except FileNotFoundError:
pass
2023-02-07 17:27:50 -07:00
except (RuntimeError, OSError, PermissionError) as e:
logger.debug(
2021-12-22 07:07:27 -07:00
"When removing the temp profile, a %s occured: %s\nretrying..."
2023-02-07 17:27:50 -07:00
% (e.__class__.__name__, e)
)
else:
2023-02-07 17:27:50 -07:00
logger.debug("successfully removed %s" % self.user_data_dir)
break
2023-02-07 17:27:50 -07:00
time.sleep(0.1)
Patcher: changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:05:22 -06:00
# dereference patcher, so patcher can start cleaning up as well.
# this must come last, otherwise it will throw 'in use' errors
self.patcher = None
2023-02-07 17:27:50 -07:00
def __getattribute__(self, item):
if not super().__getattribute__("debug"):
return super().__getattribute__(item)
2022-11-29 10:26:11 -07:00
else:
import inspect
2023-02-07 17:27:50 -07:00
original = super().__getattribute__(item)
if inspect.ismethod(original) and not inspect.isclass(original):
def newfunc(*args, **kwargs):
2022-11-29 10:26:11 -07:00
logger.debug(
"calling %s with args %s and kwargs %s\n"
2023-02-07 17:27:50 -07:00
% (original.__qualname__, args, kwargs)
)
return original(*args, **kwargs)
2022-11-29 10:26:11 -07:00
return newfunc
return original
2023-02-07 17:27:50 -07:00
def __enter__(self):
return self
2023-02-07 17:27:50 -07:00
def __exit__(self, exc_type, exc_val, exc_tb):
self.service.stop()
2023-02-07 17:27:50 -07:00
time.sleep(self._delay)
self.service.start()
self.start_session()
2023-02-07 17:27:50 -07:00
def __hash__(self):
return hash(self.options.debugger_address)
def __dir__(self):
return object.__dir__(self)
def __del__(self):
2022-11-29 10:26:11 -07:00
try:
self.service.process.kill()
except: # noqa
pass
self.quit()
2023-02-07 17:27:50 -07:00
@classmethod
2023-02-07 17:27:50 -07:00
def _ensure_close(cls, self):
# needs to be a classmethod so finalize can find the reference
2023-02-07 17:27:50 -07:00
logger.info("ensuring close")
if (
2023-02-07 17:27:50 -07:00
hasattr(self, "service")
and hasattr(self.service, "process")
and hasattr(self.service.process, "kill")
):
self.service.process.kill()
def find_chrome_executable():
"""
Finds the chrome, chrome beta, chrome canary, chromium executable
Returns
-------
executable_path : str
the full file path to found executable
"""
candidates = set()
if IS_POSIX:
2023-02-07 17:27:50 -07:00
for item in os.environ.get("PATH").split(os.pathsep):
3.15 changed the way how patcher works (for those using multiple sessions/processes). when not specifying a executable_path (the default, and recommended!), the filename gets randomized to <somehex>_chromedriver[.exe]. this should fix the issue for multiprocessing (although Chrome/driver itself has restrictions in this as well, see it using processhacker). As i told before, webdriver is a purely io-based operation which only sends and pulls data. multiprocessing/threading isn't going to help much. You'd better use asyncio.) find_chrome_executable: added google-chrome-stable to the list, as some distro's have this name. advanced_webelements: bool, optional, default: False makes it easier to recognize elements like you know them from html/browser inspection, especially when working in an interactive environment default webelement repr: <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")> advanced webelement repr <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> note: when retrieving large amounts of elements ( example: find_elements_by_tag("*") ) and **print** them, it does take a little more time for all the repr's to fetch Chrome() parameters driver_executable_path=None ( = executable_path ) if you really need to specify your own chromedriver binary. (don't log issues when you are not using the default. the downloading per session happens for a reason. remember this is a detection-focussed fork) browser_executable_path=None ( = browser binary path ) to specify your browser in case you use exotic locations instead of the more default install folders advanced_elements=False if set to True, webelements get a nicer REPR showing. this is very convenient when working interactively (like ipython for example). <WebElement(<a class="mobile-show-inline-block mc-update-infos init-ok" href="#" id="main-cat-switcher-mobile">)> instead of <selenium.webdriver.remote.webelement.WebElement (session="85ff0f671512fa535630e71ee951b1f2", element="6357cb55-92c3-4c0f-9416-b174f9c1b8c4")>
2022-03-13 16:42:41 -06:00
for subitem in (
2023-02-07 17:27:50 -07:00
"google-chrome",
"chromium",
"chromium-browser",
"chrome",
"google-chrome-stable",
):
candidates.add(os.sep.join((item, subitem)))
if "darwin" in sys.platform:
candidates.update(
2022-03-13 17:37:12 -06:00
[
2023-02-07 17:27:50 -07:00
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Chromium.app/Contents/MacOS/Chromium",
]
)
else:
2022-08-30 05:46:41 -06:00
for item in map(
2023-02-07 17:27:50 -07:00
os.environ.get,
("PROGRAMFILES", "PROGRAMFILES(X86)", "LOCALAPPDATA", "PROGRAMW6432"),
):
if item is not None:
for subitem in (
2023-02-07 17:27:50 -07:00
"Google/Chrome/Application",
"Google/Chrome Beta/Application",
"Google/Chrome Canary/Application",
):
candidates.add(os.sep.join((item, subitem, "chrome.exe")))
for candidate in candidates:
2023-06-02 14:44:05 -06:00
logger.debug('checking if %s exists and is executable' % candidate)
2023-02-07 17:27:50 -07:00
if os.path.exists(candidate) and os.access(candidate, os.X_OK):
2023-06-02 14:42:43 -06:00
logger.debug('found! using %s' % candidate)
2023-02-07 17:27:50 -07:00
return os.path.normpath(candidate)