From 67618e443f6861d9b9894954fce7c47167507bc6 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Mon, 13 Mar 2023 16:39:47 -0600 Subject: [PATCH] working pretty well --- README.md | 2 + requirements.txt | 7 +- vitalsource2pdf.py | 194 ++++++++++++++++++++++++++---------- vitalsource_scraper/file.py | 68 ++++++++++--- 4 files changed, 202 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 4cb34bd..3d7ea2c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ # vitalsource2pdf +sudo apt-get install python3-tk python3-dev + unset SNAP_NAME; unset SNAP_INSTANCE_NAME diff --git a/requirements.txt b/requirements.txt index ef9f54b..a32ee2b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ selenium webdriver-manager -requests -pyshadow -tqdm \ No newline at end of file +tqdm +pillow +pyautogui +selenium-wire \ No newline at end of file diff --git a/vitalsource2pdf.py b/vitalsource2pdf.py index df2c8ce..5627f6c 100644 --- a/vitalsource2pdf.py +++ b/vitalsource2pdf.py @@ -2,6 +2,8 @@ import argparse import time from pathlib import Path +from PIL import Image +import pyautogui import selenium from selenium.webdriver import ActionChains, Keys from selenium.webdriver.chrome.service import Service @@ -12,59 +14,90 @@ from seleniumwire import webdriver from tqdm import tqdm from webdriver_manager.chrome import ChromeDriverManager -from vitalsource_scraper.file import download_file - parser = argparse.ArgumentParser() parser.add_argument('--output', default='./VitalSource/') parser.add_argument('--isbn', required=True) -parser.add_argument('--delay', default=8, type=int, help='Delay between pages to let them load.') +parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load.') parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.') +parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.") args = parser.parse_args() args.output = Path(args.output) +args.output.mkdir(exist_ok=True, parents=True) +ebook_output = args.output / f'{args.isbn}.pdf' +ebook_files = args.output / args.isbn +ebook_files.mkdir(exist_ok=True, parents=True) +saved_cookies = args.output / 'cookies.pkl' # driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install())) # version = read_version_from_cmd('/usr/bin/chromium-browser --version', PATTERN[ChromeType.CHROMIUM]) # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager(version='111.0.5563.64', chrome_type=ChromeType.CHROMIUM).install())) -driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) -driver.get(f'https://bookshelf.vitalsource.com') -input('Press ENTER once logged in...') +options = webdriver.ChromeOptions() +options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)}) +if args.disable_web_security: + options.add_argument('--disable-web-security') + print('DISABLED WEB SECURITY!') +options.add_argument('--disable-http2') # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data. 
+driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options) -driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/0') -while True: - try: - num_pages = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').replace(' ', '').split('/')[1]) if not args.pages else args.pages - break - except selenium.common.exceptions.JavascriptException: - time.sleep(1) -print('Total number of pages:', num_pages) +if not saved_cookies.is_file(): + driver.get(f'https://bookshelf.vitalsource.com') + input('Press ENTER once logged in...') -def load_page(page_id): +# else: +# driver.get(f'https://bookshelf.vitalsource.com') +# driver.execute_script('window.stop()') +# cookies = pickle.load(open(saved_cookies, 'rb')) +# for cookie in cookies: +# print(cookie) +# driver.add_cookie(cookie) + +# Save cookies now that we're in +# pickle.dump(driver.get_cookies(), open(saved_cookies, "wb")) + +def get_num_pages(): + while True: + try: + total_pages = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip()) + try: + # this element may be empty so just set it to 0 + page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value') + if page == '' or not page: + page = 0 + except selenium.common.exceptions.JavascriptException: + page = 0 + return page, total_pages + except selenium.common.exceptions.JavascriptException: + time.sleep(1) + + +def load_book_page(page_id): driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}') # Wait for the page to load - while True: - try: - driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML') - break - except selenium.common.exceptions.JavascriptException: - time.sleep(1) + get_num_pages() while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")): - time.sleep(1) + time.sleep( + 1) # driver.execute_script( # https://github.com/eligrey/FileSaver.js/ # '(function(a,b){if("function"==typeof define&&define.amd)define([],b);else if("undefined"!=typeof exports)b();else{b(),a.FileSaver={exports:{}}.exports}})(this,function(){"use strict";function b(a,b){return"undefined"==typeof b?b={autoBom:!1}:"object"!=typeof b&&(console.warn("Deprecated: Expected third argument to be a object"),b={autoBom:!b}),b.autoBom&&/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(a.type)?new Blob(["\uFEFF",a],{type:a.type}):a}function c(a,b,c){var d=new XMLHttpRequest;d.open("GET",a),d.responseType="blob",d.onload=function(){g(d.response,b,c)},d.onerror=function(){console.error("could not download file")},d.send()}function d(a){var b=new XMLHttpRequest;b.open("HEAD",a,!1);try{b.send()}catch(a){}return 200<=b.status&&299>=b.status}function e(a){try{a.dispatchEvent(new MouseEvent("click"))}catch(c){var b=document.createEvent("MouseEvents");b.initMouseEvent("click",!0,!0,window,0,0,0,80,20,!1,!1,!1,!1,0,null),a.dispatchEvent(b)}}var f="object"==typeof window&&window.window===window?window:"object"==typeof self&&self.self===self?self:"object"==typeof global&&global.global===global?global:void 0,a=/Macintosh/.test(navigator.userAgent)&&/AppleWebKit/.test(navigator.userAgent)&&!/Safari/.test(navigator.userAgent),g=f.saveAs||("object"!=typeof window||window!==f?function(){}:"download"in 
HTMLAnchorElement.prototype&&!a?function(b,g,h){var i=f.URL||f.webkitURL,j=document.createElement("a");g=g||b.name||"download",j.download=g,j.rel="noopener","string"==typeof b?(j.href=b,j.origin===location.origin?e(j):d(j.href)?c(b,g,h):e(j,j.target="_blank")):(j.href=i.createObjectURL(b),setTimeout(function(){i.revokeObjectURL(j.href)},4E4),setTimeout(function(){e(j)},0))}:"msSaveOrOpenBlob"in navigator?function(f,g,h){if(g=g||f.name||"download","string"!=typeof f)navigator.msSaveOrOpenBlob(b(f,h),g);else if(d(f))c(f,g,h);else{var i=document.createElement("a");i.href=f,i.target="_blank",setTimeout(function(){e(i)})}}:function(b,d,e,g){if(g=g||open("","_blank"),g&&(g.document.title=g.document.body.innerText="downloading..."),"string"==typeof b)return c(b,d,e);var h="application/octet-stream"===b.type,i=/constructor/i.test(f.HTMLElement)||f.safari,j=/CriOS\/[\d]+/.test(navigator.userAgent);if((j||h&&i||a)&&"undefined"!=typeof FileReader){var k=new FileReader;k.onloadend=function(){var a=k.result;a=j?a:a.replace(/^data:[^;]*;/,"data:attachment/file;"),g?g.location.href=a:location=a,g=null},k.readAsDataURL(b)}else{var l=f.URL||f.webkitURL,m=l.createObjectURL(b);g?g.location=m:location.href=m,g=null,setTimeout(function(){l.revokeObjectURL(m)},4E4)}});f.saveAs=g.saveAs=g,"undefined"!=typeof module&&(module.exports=g)});') -auth_headers = {} +load_book_page(0) + +_, total_pages = get_num_pages() +print('Total number of pages:', total_pages) + +# auth_headers = {} -def form_header(input_headers): - output = {} - for item in input_headers: - output[item[0]] = item[1] - # if output.get) - return output +# def form_header(input_headers): +# output = {} +# for item in input_headers: +# output[item[0]] = item[1] +# # if output.get) +# return output # cookies = driver.get_cookies() @@ -74,41 +107,94 @@ def form_header(input_headers): # # s.cookies.set(cookie['name'], cookie['value']) # download_file('https://jigsaw.vitalsource.com/books/9781524976422/images/553246736447566b5831395573413731646d5371495171745037726a2f49564a6d574c424e424e793154513d0a/encrypted/2000', '/home/dpanzer/test.jpg', cookies=r_cookies) -img_sizes = [2000, 1600, 800] +# img_sizes = [2000, 1600, 800] -page_urls = {None} -all_images = [] -for page_num in tqdm(iterable=range(num_pages)): +# driver.maximize_window() +# screen_w, screen_h = pyautogui.size() + +page_urls = set() +page_num = 0 +bar = tqdm(total=total_pages) +while page_num < total_pages + 1: + img_data = None time.sleep(args.delay) - for i in range(3): - base_url = None - headers = {} - for i in range(60): + retry_delay = 1 + base_url = None + for page_retry in range(5): # retry the page max 5 times + largest_size = 0 + for find_img_retry in range(3): for request in driver.requests: - # print(request.headers) - if request.response and request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'): + if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'): # request.response and base_url = request.url.split('/') - del base_url[-1] - base_url = '/'.join(base_url) - if base_url in page_urls: - break - page_urls.add(base_url) - headers = form_header(request.headers) - print(headers) - time.sleep(1) + img_size = int(base_url[-1]) + if img_size > largest_size: + # del base_url[-1] + base_url = '/'.join(base_url) + # Wait for the image to load + wait = 0 + while (not request.response or not request.response.body) and wait < 30: + time.sleep(1) + wait += 1 + print(wait) + if not request.response or not request.response.body: + 
bar.write(f'Image failed to load! Increase your delay. {request.url}') + break + + img_data = request.response.body + page_urls.add(request.url) + if img_size == 2000: + break if base_url: - del driver.requests - download_file(base_url, '/home/dpanzer/test.jpg', headers=headers) - tqdm.write(base_url) break - else: - tqdm.write(f'Failed to find image on page {page_num}, reloading.') - load_page(page_num) - time.sleep(20) + bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}...') + time.sleep(retry_delay) + retry_delay += 1 + + if base_url: + del driver.requests + page, total_pages = get_num_pages() + + # If this isn't a numbered page we will need to increment the page count + try: + int(page) + except ValueError: + total_pages = total_pages + 1 + bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}') + bar.total = total_pages + bar.refresh() + + dl_file = ebook_files / f'{page}.jpg' + with open(dl_file, 'wb') as file: + file.write(img_data) + + # Re-save the image to make sure it's in the correct format + img = Image.open(dl_file) + img.save(dl_file, format='JPEG', subsampling=0, quality=100) + del img + + # Other download method. + # driver.execute_script(f'window.open("{base_url}","_blank");') + # pyautogui.moveTo(screen_h / 2, screen_w / 2) + # pyautogui.hotkey('ctrl', 's') + # time.sleep(10) + # pyautogui.write(f'{page}.jpg') + # time.sleep(10) + # pyautogui.press('enter') + # time.sleep(10) + # pyautogui.hotkey('ctrl', 'w') + + bar.write(base_url) + else: + tqdm.write(f'Failed to find image on page {page_num}, wait 20s...') + load_book_page(page_num) + time.sleep(20) actions = ActionChains(driver) actions.send_keys(Keys.RIGHT) actions.perform() + bar.update() + page_num += 1 driver.close() +bar.close() diff --git a/vitalsource_scraper/file.py b/vitalsource_scraper/file.py index 28b6f19..a761bd0 100644 --- a/vitalsource_scraper/file.py +++ b/vitalsource_scraper/file.py @@ -1,14 +1,58 @@ -import requests +from urllib.parse import urlparse +import os +import os +from mimetypes import guess_extension +from urllib.parse import urlparse -def download_file(url, full_output_path, headers): - # NOTE the stream=True parameter below - with requests.get(url, stream=True, headers=headers) as r: - r.raise_for_status() - with open(full_output_path, 'wb') as f: - for chunk in r.iter_content(chunk_size=8192): - # If you have chunk encoded response uncomment if - # and set chunk_size parameter to None. 
-                # if chunk:
-                f.write(chunk)
-    return full_output_path
+def download_assets(requests,
+                    asset_dir="temp",
+                    default_fname="unnamed",
+                    skip_domains=["facebook", "google", "yahoo", "agkn", "2mdn"],
+                    exts=[".png", ".jpeg", ".jpg", ".svg", ".gif", ".pdf", ".bmp", ".webp", ".ico"],
+                    append_ext=False):
+    asset_list = {}
+    for req_idx, request in enumerate(requests):
+        # request.headers
+        # request.response.body is the raw response body in bytes
+        if request is None or request.response is None or request.response.headers is None or 'Content-Type' not in request.response.headers:
+            continue
+
+        ext = guess_extension(request.response.headers['Content-Type'].split(';')[0].strip())
+        if ext is None or ext == "" or ext not in exts:
+            # Don't know the file extension, or not in the whitelist
+            continue
+        parsed_url = urlparse(request.url)
+
+        skip = False
+        for d in skip_domains:
+            if d in parsed_url.netloc:
+                skip = True
+                break
+        if skip:
+            continue
+
+        frelpath = parsed_url.path.strip()
+        if frelpath == "":
+            timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
+            frelpath = f"{default_fname}_{req_idx}_{timestamp}{ext}"
+        elif frelpath.endswith("\\") or frelpath.endswith("/"):
+            timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
+            frelpath = frelpath + f"{default_fname}_{req_idx}_{timestamp}{ext}"
+        elif append_ext and not frelpath.endswith(ext):
+            frelpath = frelpath + f"_{default_fname}{ext}"  # Missing file extension but may not be a problem
+        if frelpath.startswith("\\") or frelpath.startswith("/"):
+            frelpath = frelpath[1:]
+
+        fpath = os.path.join(asset_dir, parsed_url.netloc, frelpath)
+        if os.path.isfile(fpath):
+            continue
+        os.makedirs(os.path.dirname(fpath), exist_ok=True)
+        print(f"Downloading {request.url} to {fpath}")
+        asset_list[fpath] = request.url
+        try:
+            with open(fpath, "wb") as file:
+                file.write(request.response.body)
+        except Exception:
+            print(f"Cannot download {request.url} to {fpath}")
+    return asset_list
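
The commented-out pickle code near the top of vitalsource2pdf.py sketches how a manual login could be reused between runs via saved_cookies. A minimal version of that idea, written as hypothetical helpers that are not part of the patch; it assumes Selenium's add_cookie() is called while the target domain is already open:

import pickle

def save_cookies(driver, saved_cookies):
    # Persist the session right after the manual login so later runs can skip it.
    with open(saved_cookies, 'wb') as f:
        pickle.dump(driver.get_cookies(), f)

def load_cookies(driver, saved_cookies):
    # Cookies can only be attached to the domain that is currently loaded,
    # so open the site first, inject the saved cookies, then reload.
    driver.get('https://bookshelf.vitalsource.com')
    with open(saved_cookies, 'rb') as f:
        for cookie in pickle.load(f):
            driver.add_cookie(cookie)
    driver.refresh()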
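
The patch defines ebook_output ('<isbn>.pdf') but stops after saving each page as a JPEG inside ebook_files. A minimal sketch of the missing assembly step, assuming Pillow (already in requirements.txt) and the '{page}.jpg' naming used by the scraping loop; the sort key for non-numeric page names (cover, roman-numeral front matter) is an assumption:

from pathlib import Path
from PIL import Image

def build_pdf(ebook_files: Path, ebook_output: Path):
    # Numeric pages in order; anything non-numeric (cover, front matter) sorts first.
    def page_key(p: Path):
        return (not p.stem.isdigit(), int(p.stem) if p.stem.isdigit() else 0, p.stem)

    pages = [Image.open(p).convert('RGB') for p in sorted(ebook_files.glob('*.jpg'), key=page_key)]
    if not pages:
        raise FileNotFoundError(f'No page images found in {ebook_files}')
    # Pillow writes a multi-page PDF when save_all is set and the remaining pages
    # are passed via append_images.
    pages[0].save(ebook_output, format='PDF', save_all=True, append_images=pages[1:])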
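
download_assets() is added to vitalsource_scraper/file.py but is not yet called from the main script. A usage sketch, assuming the selenium-wire driver and ebook_files path from vitalsource2pdf.py; note that the function references datetime, which is not among the imports shown in the hunk, so file.py would also need 'import datetime':

from vitalsource_scraper.file import download_assets

# Dump every whitelisted asset captured by the selenium-wire proxy so far
# into the book's asset folder.
saved = download_assets(driver.requests, asset_dir=str(ebook_files))
for fpath, url in saved.items():
    print(f'{url} -> {fpath}')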