diff --git a/.gitignore b/.gitignore
index f8b73e7..4ec4639 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+venv/
+.idea/
+geckodriver
+VitalSource/
+
 # ---> Python
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index d87ecbe..4cb34bd 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,4 @@
 # vitalsource2pdf
 
+If Chromium is installed as a snap, unset the snap environment variables before running:
+`unset SNAP_NAME; unset SNAP_INSTANCE_NAME`
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ef9f54b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+selenium
+selenium-wire
+webdriver-manager
+requests
+tqdm
\ No newline at end of file
diff --git a/vitalsource2pdf.py b/vitalsource2pdf.py
new file mode 100644
index 0000000..df2c8ce
--- /dev/null
+++ b/vitalsource2pdf.py
@@ -0,0 +1,106 @@
+import argparse
+import time
+from pathlib import Path
+
+from selenium.common.exceptions import JavascriptException
+from selenium.webdriver import ActionChains, Keys
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+# from webdriver_manager.firefox import GeckoDriverManager
+# from selenium.webdriver.firefox.service import Service as FirefoxService
+from seleniumwire import webdriver
+from tqdm import tqdm
+from webdriver_manager.chrome import ChromeDriverManager
+
+from vitalsource_scraper.file import download_file
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--output', default='./VitalSource/', help='Directory to save the page images to.')
+parser.add_argument('--isbn', required=True)
+parser.add_argument('--delay', default=8, type=int, help='Delay between pages to let them load.')
+parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')
+args = parser.parse_args()
+
+args.output = Path(args.output)
+args.output.mkdir(parents=True, exist_ok=True)
+
+# Firefox alternative:
+# driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
+
+driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
+
+driver.get('https://bookshelf.vitalsource.com')
+input('Press ENTER once logged in...')
+
+driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/0')
+
+# The page counter ("x / y") uses generated CSS class names, so these selectors
+# may break whenever VitalSource redeploys their frontend.
+while True:
+    try:
+        num_pages = args.pages if args.pages else int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').replace(' ', '').split('/')[1])
+        break
+    except JavascriptException:
+        time.sleep(1)
+print('Total number of pages:', num_pages)
+
+
+def load_page(page_id):
+    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
+    # Wait for the page counter to exist, then for the loading spinner to disappear.
+    while True:
+        try:
+            driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
+            break
+        except JavascriptException:
+            time.sleep(1)
+    # By.CLASS_NAME does not accept compound class names, so match via CSS selector.
+    while len(driver.find_elements(By.CSS_SELECTOR, '.sc-AjmGg.dDNaMw')):
+        time.sleep(1)
+
+
+def form_header(input_headers):
+    # Flatten the intercepted request's headers into a plain dict for requests.
+    return {name: value for name, value in input_headers.items()}
+
+
+# The final path segment of a jigsaw image URL selects the image width;
+# 2000 appears to be the largest variant served.
+img_sizes = [2000, 1600, 800]
+
+page_urls = set()  # base URLs already downloaded, so stale captures are skipped
+for page_num in tqdm(range(num_pages)):
+    time.sleep(args.delay)
+    for _ in range(3):  # up to three attempts per page
+        base_url = None
+        headers = {}
+        for _ in range(60):
+            for request in driver.requests:
+                if request.response and request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
+                    url = request.url.rsplit('/', 1)[0]  # strip the size segment
+                    if url in page_urls:
+                        continue  # leftover capture from an earlier page
+                    base_url = url
+                    page_urls.add(base_url)
+                    headers = form_header(request.headers)
+                    break
+            if base_url:
+                break
+            time.sleep(1)
+
+        if base_url:
+            del driver.requests  # clear seleniumwire's capture buffer before the next page
+            download_file(f'{base_url}/{img_sizes[0]}', args.output / f'{page_num}.jpg', headers=headers)
+            tqdm.write(base_url)
+            break
+        else:
+            tqdm.write(f'Failed to find image on page {page_num}, reloading.')
+            load_page(page_num)
+            time.sleep(20)
+
+    # Advance to the next page.
+    actions = ActionChains(driver)
+    actions.send_keys(Keys.RIGHT)
+    actions.perform()
+
+driver.quit()
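For reference, a typical invocation under the defaults above might look like the following; the ISBN is an example value (the one from the original jigsaw test URL) standing in for a real book:

    python3 vitalsource2pdf.py --isbn 9781524976422 --output ./VitalSource/ --delay 8

Each page is saved as <page number>.jpg inside --output; --pages caps how many pages are fetched when the auto-detected total is wrong.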
diff --git a/vitalsource_scraper/__init__.py b/vitalsource_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/vitalsource_scraper/file.py b/vitalsource_scraper/file.py
new file mode 100644
index 0000000..28b6f19
--- /dev/null
+++ b/vitalsource_scraper/file.py
@@ -0,0 +1,11 @@
+import requests
+
+
+def download_file(url, full_output_path, headers=None):
+    # Stream the response to disk so large page images are never held in memory.
+    with requests.get(url, stream=True, headers=headers) as r:
+        r.raise_for_status()
+        with open(full_output_path, 'wb') as f:
+            for chunk in r.iter_content(chunk_size=8192):
+                f.write(chunk)
+    return full_output_path
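As a minimal standalone sketch of the download helper (the URL follows the jigsaw pattern intercepted above, but the image id, headers, and output path are placeholder values; a real call needs the auth headers captured from the reader session):

    from vitalsource_scraper.file import download_file

    # Placeholder values for illustration only; real headers come from
    # form_header() on a request intercepted by seleniumwire.
    download_file(
        'https://jigsaw.vitalsource.com/books/9781524976422/images/<image-id>/encrypted/2000',
        './VitalSource/0.jpg',
        headers={'User-Agent': 'Mozilla/5.0'},
    )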