working pretty well

Cyberes 2023-03-13 16:39:47 -06:00
parent 9139497871
commit 67618e443f
4 changed files with 202 additions and 69 deletions

README.md

@@ -1,3 +1,5 @@
 # vitalsource2pdf
+sudo apt-get install python3-tk python3-dev
 unset SNAP_NAME; unset SNAP_INSTANCE_NAME
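The new README lines cover pyautogui's Linux prerequisites (python3-tk, python3-dev) and unsetting the snap environment variables before running the scraper. If you'd rather not depend on the shell doing that, the same scrubbing can be done in-process before the driver starts; a minimal sketch (only the variable names come from the README, the rest is an assumption):

    import os

    # Snap-packaged Chromium exports these, which can interfere with
    # locating the browser; drop them if present before starting Selenium.
    for var in ('SNAP_NAME', 'SNAP_INSTANCE_NAME'):
        os.environ.pop(var, None)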

requirements.txt

@@ -1,5 +1,6 @@
 selenium
 webdriver-manager
-requests
-pyshadow
 tqdm
+pillow
+pyautogui
+selenium-wire

View File

@@ -2,6 +2,8 @@ import argparse
 import time
 from pathlib import Path
+from PIL import Image
+import pyautogui
 import selenium
 from selenium.webdriver import ActionChains, Keys
 from selenium.webdriver.chrome.service import Service
@@ -12,59 +14,90 @@ from seleniumwire import webdriver
 from tqdm import tqdm
 from webdriver_manager.chrome import ChromeDriverManager
-from vitalsource_scraper.file import download_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument('--output', default='./VitalSource/')
 parser.add_argument('--isbn', required=True)
-parser.add_argument('--delay', default=8, type=int, help='Delay between pages to let them load.')
+parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load.')
 parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')
+parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
 args = parser.parse_args()
 
 args.output = Path(args.output)
+args.output.mkdir(exist_ok=True, parents=True)
+ebook_output = args.output / f'{args.isbn}.pdf'
+ebook_files = args.output / args.isbn
+ebook_files.mkdir(exist_ok=True, parents=True)
+saved_cookies = args.output / 'cookies.pkl'
 
 # driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
 # version = read_version_from_cmd('/usr/bin/chromium-browser --version', PATTERN[ChromeType.CHROMIUM])
 # driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager(version='111.0.5563.64', chrome_type=ChromeType.CHROMIUM).install()))
-driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
-driver.get(f'https://bookshelf.vitalsource.com')
-input('Press ENTER once logged in...')
-
-driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/0')
-while True:
-    try:
-        num_pages = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').replace(' ', '').split('/')[1]) if not args.pages else args.pages
-        break
-    except selenium.common.exceptions.JavascriptException:
-        time.sleep(1)
-print('Total number of pages:', num_pages)
+options = webdriver.ChromeOptions()
+options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)})
+if args.disable_web_security:
+    options.add_argument('--disable-web-security')
+    print('DISABLED WEB SECURITY!')
+options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
+driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options)
 
+if not saved_cookies.is_file():
+    driver.get(f'https://bookshelf.vitalsource.com')
+    input('Press ENTER once logged in...')
+# else:
+#     driver.get(f'https://bookshelf.vitalsource.com')
+#     driver.execute_script('window.stop()')
+#     cookies = pickle.load(open(saved_cookies, 'rb'))
+#     for cookie in cookies:
+#         print(cookie)
+#         driver.add_cookie(cookie)
+
+# Save cookies now that we're in
+# pickle.dump(driver.get_cookies(), open(saved_cookies, "wb"))
+
+
+def get_num_pages():
+    while True:
+        try:
+            total_pages = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
+            try:
+                # this element may be empty so just set it to 0
+                page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
+                if page == '' or not page:
+                    page = 0
+            except selenium.common.exceptions.JavascriptException:
+                page = 0
+            return page, total_pages
+        except selenium.common.exceptions.JavascriptException:
+            time.sleep(1)
 
-def load_page(page_id):
+
+def load_book_page(page_id):
     driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
     # Wait for the page to load
-    while True:
-        try:
-            driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
-            break
-        except selenium.common.exceptions.JavascriptException:
-            time.sleep(1)
+    get_num_pages()
     while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
-        time.sleep(1)
+        time.sleep(1)  # (commented out in the source here: a driver.execute_script() call carrying the minified FileSaver.js saveAs() polyfill, https://github.com/eligrey/FileSaver.js/)
 
-auth_headers = {}
 
+load_book_page(0)
+_, total_pages = get_num_pages()
+print('Total number of pages:', total_pages)
 
-def form_header(input_headers):
-    output = {}
-    for item in input_headers:
-        output[item[0]] = item[1]
-        # if output.get)
-    return output
+# auth_headers = {}
+
+# def form_header(input_headers):
+#     output = {}
+#     for item in input_headers:
+#         output[item[0]] = item[1]
+#         # if output.get)
+#     return output
 
 # cookies = driver.get_cookies()
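The commented-out block above sketches cookie persistence: save the session to cookies.pkl after the first login, then restore it on later runs instead of prompting. A minimal version of that idea, assuming the driver is already on bookshelf.vitalsource.com when cookies are restored (Selenium's add_cookie() only accepts cookies for the current domain; the helper names are mine):

    import pickle

    def save_cookies(driver, path):
        # Persist the logged-in session for the next run.
        with open(path, 'wb') as f:
            pickle.dump(driver.get_cookies(), f)

    def restore_cookies(driver, path):
        # The driver must already be on the target domain.
        with open(path, 'rb') as f:
            for cookie in pickle.load(f):
                driver.add_cookie(cookie)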
@@ -74,41 +107,94 @@ def form_header(input_headers):
 #     # s.cookies.set(cookie['name'], cookie['value'])
 # download_file('https://jigsaw.vitalsource.com/books/9781524976422/images/553246736447566b5831395573413731646d5371495171745037726a2f49564a6d574c424e424e793154513d0a/encrypted/2000', '/home/dpanzer/test.jpg', cookies=r_cookies)
 
-img_sizes = [2000, 1600, 800]
-page_urls = {None}
-all_images = []
-for page_num in tqdm(iterable=range(num_pages)):
+# img_sizes = [2000, 1600, 800]
+# driver.maximize_window()
+# screen_w, screen_h = pyautogui.size()
+
+page_urls = set()
+page_num = 0
+bar = tqdm(total=total_pages)
+while page_num < total_pages + 1:
+    img_data = None
     time.sleep(args.delay)
-    for i in range(3):
-        base_url = None
-        headers = {}
-        for i in range(60):
-            for request in driver.requests:
-                # print(request.headers)
-                if request.response and request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
-                    base_url = request.url.split('/')
-                    del base_url[-1]
-                    base_url = '/'.join(base_url)
-                    if base_url in page_urls:
-                        break
-                    page_urls.add(base_url)
-                    headers = form_header(request.headers)
-                    print(headers)
-            time.sleep(1)
-        if base_url:
-            del driver.requests
-            download_file(base_url, '/home/dpanzer/test.jpg', headers=headers)
-            tqdm.write(base_url)
-            break
-        else:
-            tqdm.write(f'Failed to find image on page {page_num}, reloading.')
-            load_page(page_num)
-            time.sleep(20)
+    retry_delay = 1
+    base_url = None
+    for page_retry in range(5):  # retry the page max 5 times
+        largest_size = 0
+        for find_img_retry in range(3):
+            for request in driver.requests:
+                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):  # request.response and
+                    base_url = request.url.split('/')
+                    img_size = int(base_url[-1])
+                    if img_size > largest_size:
+                        largest_size = img_size  # (line inferred: track the largest variant seen so far)
+                        # del base_url[-1]
+                        base_url = '/'.join(base_url)
+                        # Wait for the image to load
+                        wait = 0
+                        while (not request.response or not request.response.body) and wait < 30:
+                            time.sleep(1)
+                            wait += 1
+                            print(wait)
+                        if not request.response or not request.response.body:
+                            bar.write(f'Image failed to load! Increase your delay. {request.url}')
+                            break
+                        img_data = request.response.body
+                        page_urls.add(request.url)
+                        if img_size == 2000:
+                            break
+        if base_url:
+            break
+        bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}...')
+        time.sleep(retry_delay)
+        retry_delay += 1
+    if base_url:
+        del driver.requests
+        page, total_pages = get_num_pages()
+        # If this isn't a numbered page we will need to increment the page count
+        try:
+            int(page)
+        except ValueError:
+            total_pages = total_pages + 1
+            bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
+            bar.total = total_pages
+            bar.refresh()
+        dl_file = ebook_files / f'{page}.jpg'
+        with open(dl_file, 'wb') as file:
+            file.write(img_data)
+        # Re-save the image to make sure it's in the correct format
+        img = Image.open(dl_file)
+        img.save(dl_file, format='JPEG', subsampling=0, quality=100)
+        del img
+        # Other download method.
+        # driver.execute_script(f'window.open("{base_url}","_blank");')
+        # pyautogui.moveTo(screen_h / 2, screen_w / 2)
+        # pyautogui.hotkey('ctrl', 's')
+        # time.sleep(10)
+        # pyautogui.write(f'{page}.jpg')
+        # time.sleep(10)
+        # pyautogui.press('enter')
+        # time.sleep(10)
+        # pyautogui.hotkey('ctrl', 'w')
+        bar.write(base_url)
+    else:
+        tqdm.write(f'Failed to find image on page {page_num}, wait 20s...')
+        load_book_page(page_num)
+        time.sleep(20)
     actions = ActionChains(driver)
     actions.send_keys(Keys.RIGHT)
     actions.perform()
+    bar.update()
+    page_num += 1
 driver.close()
+bar.close()
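The loop above leans on selenium-wire, which records every request the reader page makes in driver.requests; page scans come from jigsaw.vitalsource.com with the pixel size as the last URL segment (e.g. .../encrypted/2000), which is what int(base_url[-1]) parses. The selection logic, condensed into a standalone sketch (the helper name is mine, not the script's):

    def find_page_image(driver, isbn):
        """Return (url, body) for the largest page scan captured so far, or None."""
        prefix = f'https://jigsaw.vitalsource.com/books/{isbn}/images/'
        best, best_size = None, 0
        for request in driver.requests:  # selenium-wire's captured traffic
            if not request.url.startswith(prefix):
                continue
            size_part = request.url.split('/')[-1]  # e.g. '2000' from .../encrypted/2000
            if not size_part.isdigit():
                continue
            if request.response and request.response.body and int(size_part) > best_size:
                best, best_size = request, int(size_part)
        return (best.url, best.response.body) if best else None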

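One loose end: ebook_output is defined near the top of the script but nothing writes the PDF yet; presumably the per-page JPEGs in ebook_files get merged later. For illustration only (this is not part of the commit), Pillow can already do that merge; a real version would sort pages numerically and handle the roman-numeral front matter:

    from pathlib import Path
    from PIL import Image

    def build_pdf(ebook_files: Path, ebook_output: Path):
        # Naive lexicographic sort; '10.jpg' would sort before '2.jpg'.
        pages = [Image.open(p).convert('RGB') for p in sorted(ebook_files.glob('*.jpg'))]
        if pages:
            # Pillow writes multi-page PDFs via save_all + append_images.
            pages[0].save(ebook_output, save_all=True, append_images=pages[1:])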
vitalsource_scraper/file.py

@@ -1,14 +1,58 @@
-import requests
+import datetime  # (import inferred: needed for the timestamp fallbacks below)
+import os
+from mimetypes import guess_extension
+from urllib.parse import urlparse
 
 
-def download_file(url, full_output_path, headers):
-    # NOTE the stream=True parameter below
-    with requests.get(url, stream=True, headers=headers) as r:
-        r.raise_for_status()
-        with open(full_output_path, 'wb') as f:
-            for chunk in r.iter_content(chunk_size=8192):
-                # If you have chunk encoded response uncomment if
-                # and set chunk_size parameter to None.
-                # if chunk:
-                f.write(chunk)
-    return full_output_path
+def download_assets(requests,
+                    asset_dir="temp",
+                    default_fname="unnamed",
+                    skip_domains=["facebook", "google", "yahoo", "agkn", "2mdn"],
+                    exts=[".png", ".jpeg", ".jpg", ".svg", ".gif", ".pdf", ".bmp", ".webp", ".ico"],
+                    append_ext=False):
+    asset_list = {}
+    for req_idx, request in enumerate(requests):
+        # request.headers
+        # request.response.body is the raw response body in bytes
+        if request is None or request.response is None or request.response.headers is None or 'Content-Type' not in request.response.headers:
+            continue
+        ext = guess_extension(request.response.headers['Content-Type'].split(';')[0].strip())
+        if ext is None or ext == "" or ext not in exts:
+            # Don't know the file extension, or it's not in the whitelist
+            continue
+        parsed_url = urlparse(request.url)
+        skip = False
+        for d in skip_domains:
+            if d in parsed_url.netloc:
+                skip = True
+                break
+        if skip:
+            continue
+        frelpath = parsed_url.path.strip()
+        if frelpath == "":
+            timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
+            frelpath = f"{default_fname}_{req_idx}_{timestamp}{ext}"
+        elif frelpath.endswith("\\") or frelpath.endswith("/"):
+            timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
+            frelpath = frelpath + f"{default_fname}_{req_idx}_{timestamp}{ext}"
+        elif append_ext and not frelpath.endswith(ext):
+            frelpath = frelpath + f"_{default_fname}{ext}"  # Missing file extension but may not be a problem
+        if frelpath.startswith("\\") or frelpath.startswith("/"):
+            frelpath = frelpath[1:]
+        fpath = os.path.join(asset_dir, parsed_url.netloc, frelpath)
+        if os.path.isfile(fpath):
+            continue
+        os.makedirs(os.path.dirname(fpath), exist_ok=True)
+        print(f"Downloading {request.url} to {fpath}")
+        asset_list[fpath] = request.url
+        try:
+            with open(fpath, "wb") as file:
+                file.write(request.response.body)
+        except OSError:  # (was a bare except in the source)
+            print(f"Cannot download {request.url} to {fpath}")
+    return asset_list
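download_assets iterates over selenium-wire request objects, so the natural call site passes driver.requests straight in. A hypothetical usage sketch (the URL and arguments are examples, not from the repo):

    from seleniumwire import webdriver

    driver = webdriver.Chrome()
    driver.get('https://example.com')
    # Save every captured .jpg/.png response under ./assets/<domain>/<path>.
    saved = download_assets(driver.requests, asset_dir='./assets', exts=['.jpg', '.png'])
    for fpath, url in saved.items():
        print(f'{url} -> {fpath}')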