working pretty well

Cyberes 2023-03-13 16:39:47 -06:00
parent 9139497871
commit 67618e443f
4 changed files with 202 additions and 69 deletions

README.md

@@ -1,3 +1,5 @@
# vitalsource2pdf
sudo apt-get install python3-tk python3-dev
unset SNAP_NAME; unset SNAP_INSTANCE_NAME
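The two unset lines are presumably there because webdriver-manager and chromedriver can misdetect a snap-packaged Chromium through those environment variables (an assumption; the commit doesn't explain them). A minimal sketch of the same cleanup done from Python instead of the shell:

import os

# Drop snap-related variables before webdriver-manager runs
# (assumed rationale for the README's unset lines).
os.environ.pop('SNAP_NAME', None)
os.environ.pop('SNAP_INSTANCE_NAME', None)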

requirements.txt

@@ -1,5 +1,6 @@
selenium
webdriver-manager
requests
pyshadow
tqdm
pillow
pyautogui
selenium-wire

vitalsource2pdf.py

@@ -2,6 +2,8 @@ import argparse
import time
from pathlib import Path
from PIL import Image
import pyautogui
import selenium
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
@@ -12,59 +14,90 @@ from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from vitalsource_scraper.file import download_file
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
args = parser.parse_args()
args.output = Path(args.output)
args.output.mkdir(exist_ok=True, parents=True)
ebook_output = args.output / f'{args.isbn}.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)
saved_cookies = args.output / 'cookies.pkl'
# driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
# version = read_version_from_cmd('/usr/bin/chromium-browser --version', PATTERN[ChromeType.CHROMIUM])
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager(version='111.0.5563.64', chrome_type=ChromeType.CHROMIUM).install()))
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)})
if args.disable_web_security:
    options.add_argument('--disable-web-security')
    print('DISABLED WEB SECURITY!')
options.add_argument('--disable-http2') # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options)
if not saved_cookies.is_file():
    driver.get(f'https://bookshelf.vitalsource.com')
    input('Press ENTER once logged in...')
# else:
# driver.get(f'https://bookshelf.vitalsource.com')
# driver.execute_script('window.stop()')
# cookies = pickle.load(open(saved_cookies, 'rb'))
# for cookie in cookies:
# print(cookie)
# driver.add_cookie(cookie)
# Save cookies now that we're in
# pickle.dump(driver.get_cookies(), open(saved_cookies, "wb"))
def get_num_pages():
    while True:
        try:
            total_pages = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
            try:
                # This element may be empty so just set it to 0.
                page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
                if page == '' or not page:
                    page = 0
            except selenium.common.exceptions.JavascriptException:
                page = 0
            return page, total_pages
        except selenium.common.exceptions.JavascriptException:
            time.sleep(1)
def load_book_page(page_id):
    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
    # Wait for the page to load
    while True:
        try:
            driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
            break
        except selenium.common.exceptions.JavascriptException:
            time.sleep(1)
    get_num_pages()
    # Wait for the loading overlay to disappear.
    while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
        time.sleep(1)
    time.sleep(1)
    # A commented-out alternative injected the minified FileSaver.js library
    # (https://github.com/eligrey/FileSaver.js/) via driver.execute_script()
    # to save files from the browser side.
load_book_page(0)
_, total_pages = get_num_pages()
print('Total number of pages:', total_pages)
# auth_headers = {}
# def form_header(input_headers):
#     output = {}
#     for item in input_headers:
#         output[item[0]] = item[1]
#     # if output.get)
#     return output
# cookies = driver.get_cookies()
@@ -74,41 +107,94 @@ def form_header(input_headers):
# # s.cookies.set(cookie['name'], cookie['value'])
# download_file('https://jigsaw.vitalsource.com/books/9781524976422/images/553246736447566b5831395573413731646d5371495171745037726a2f49564a6d574c424e424e793154513d0a/encrypted/2000', '/home/dpanzer/test.jpg', cookies=r_cookies)
# img_sizes = [2000, 1600, 800]
# driver.maximize_window()
# screen_w, screen_h = pyautogui.size()
page_urls = set()
page_num = 0
bar = tqdm(total=total_pages)
while page_num < total_pages + 1:
    img_data = None
    time.sleep(args.delay)
    retry_delay = 1
    base_url = None
    for page_retry in range(5):  # retry the page max 5 times
        largest_size = 0
        for find_img_retry in range(3):
            for request in driver.requests:
                # print(request.headers)
                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):  # request.response and
                    base_url = request.url.split('/')
                    img_size = int(base_url[-1])
                    if img_size > largest_size:
                        largest_size = img_size  # remember the largest size seen so far
                        # del base_url[-1]
                        base_url = '/'.join(base_url)
                        # Wait for the image to load
                        wait = 0
                        while (not request.response or not request.response.body) and wait < 30:
                            time.sleep(1)
                            wait += 1
                            print(wait)
                        if not request.response or not request.response.body:
                            bar.write(f'Image failed to load! Increase your delay. {request.url}')
                            break
                        img_data = request.response.body
                        page_urls.add(request.url)
                        if img_size == 2000:
                            break
        if img_data:
            # Stop retrying once image bytes have actually been captured.
            break
        bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}...')
        time.sleep(retry_delay)
        retry_delay += 1
    if base_url and img_data:
        del driver.requests
        page, total_pages = get_num_pages()
        # If this isn't a numbered page we will need to increment the page count.
        try:
            int(page)
        except ValueError:
            total_pages = total_pages + 1
            bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
            bar.total = total_pages
            bar.refresh()
        dl_file = ebook_files / f'{page}.jpg'
        with open(dl_file, 'wb') as file:
            file.write(img_data)
        # Re-save the image to make sure it's in the correct format.
        img = Image.open(dl_file)
        img.save(dl_file, format='JPEG', subsampling=0, quality=100)
        del img
        # Other download method:
        # driver.execute_script(f'window.open("{base_url}","_blank");')
        # pyautogui.moveTo(screen_h / 2, screen_w / 2)
        # pyautogui.hotkey('ctrl', 's')
        # time.sleep(10)
        # pyautogui.write(f'{page}.jpg')
        # time.sleep(10)
        # pyautogui.press('enter')
        # time.sleep(10)
        # pyautogui.hotkey('ctrl', 'w')
        bar.write(base_url)
    else:
        tqdm.write(f'Failed to find image on page {page_num}, waiting 20s...')
        load_book_page(page_num)
        time.sleep(20)
    actions = ActionChains(driver)
    actions.send_keys(Keys.RIGHT)
    actions.perform()
    bar.update()
    page_num += 1
driver.close()
bar.close()
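The loop above saves each page as {page}.jpg under ebook_files, and ebook_output is defined as {isbn}.pdf near the top, but nothing in this commit assembles the PDF yet. Below is a minimal sketch of how the saved pages could be merged with Pillow (already in requirements.txt); build_pdf, the numeric sort, and the commented call are assumptions, not part of the commit:

def build_pdf(page_dir, pdf_path):
    # Sort numerically where possible so page 10 lands after page 2.
    def page_key(p):
        try:
            return (0, int(p.stem))
        except ValueError:
            return (1, p.stem)
    pages = sorted(page_dir.glob('*.jpg'), key=page_key)
    images = [Image.open(p).convert('RGB') for p in pages]
    if not images:
        raise RuntimeError(f'No page images found in {page_dir}')
    # Pillow writes a multi-page PDF when save_all is set.
    images[0].save(pdf_path, format='PDF', save_all=True, append_images=images[1:])

# build_pdf(ebook_files, ebook_output)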

vitalsource_scraper/file.py

@@ -1,14 +1,58 @@
import datetime
import os
from mimetypes import guess_extension
from urllib.parse import urlparse

import requests
def download_file(url, full_output_path, headers):
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True, headers=headers) as r:
        r.raise_for_status()
        with open(full_output_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If you have a chunk-encoded response, uncomment the if
                # below and set chunk_size to None.
                # if chunk:
                f.write(chunk)
    return full_output_path
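# Usage sketch (hypothetical URL and headers for illustration; in the scraper
# the real values come from an intercepted selenium-wire request):
# download_file('https://jigsaw.vitalsource.com/books/<isbn>/images/<page-id>/encrypted/2000',
#               '/tmp/page.jpg', headers={'User-Agent': 'Mozilla/5.0'})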
def download_assets(requests,
                    asset_dir="temp",
                    default_fname="unnamed",
                    skip_domains=["facebook", "google", "yahoo", "agkn", "2mdn"],
                    exts=[".png", ".jpeg", ".jpg", ".svg", ".gif", ".pdf", ".bmp", ".webp", ".ico"],
                    append_ext=False):
    asset_list = {}
    for req_idx, request in enumerate(requests):
        # request.response.body is the raw response body in bytes
        if request is None or request.response is None or request.response.headers is None or 'Content-Type' not in request.response.headers:
            continue
        ext = guess_extension(request.response.headers['Content-Type'].split(';')[0].strip())
        if ext is None or ext == "" or ext not in exts:
            # Unknown file extension, or not in the whitelist.
            continue
        parsed_url = urlparse(request.url)
        skip = False
        for d in skip_domains:
            if d in parsed_url.netloc:
                skip = True
                break
        if skip:
            continue
        frelpath = parsed_url.path.strip()
        if frelpath == "":
            timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
            frelpath = f"{default_fname}_{req_idx}_{timestamp}{ext}"
        elif frelpath.endswith("\\") or frelpath.endswith("/"):
            timestamp = str(datetime.datetime.now().replace(microsecond=0).isoformat())
            frelpath = frelpath + f"{default_fname}_{req_idx}_{timestamp}{ext}"
        elif append_ext and not frelpath.endswith(ext):
            frelpath = frelpath + f"_{default_fname}{ext}"  # Missing file extension, but that may not be a problem.
        if frelpath.startswith("\\") or frelpath.startswith("/"):
            frelpath = frelpath[1:]
        fpath = os.path.join(asset_dir, parsed_url.netloc, frelpath)
        if os.path.isfile(fpath):
            continue
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        print(f"Downloading {request.url} to {fpath}")
        asset_list[fpath] = request.url
        try:
            with open(fpath, "wb") as file:
                file.write(request.response.body)
        except OSError:
            print(f"Cannot download {request.url} to {fpath}")
    return asset_list
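# Usage sketch: dump every whitelisted asset the browser has fetched so far.
# Assumes `driver` is the selenium-wire Chrome instance from the main script;
# download_assets only reads its captured request list.
# assets = download_assets(driver.requests, asset_dir='./assets', append_ext=True)
# for fpath, url in assets.items():
#     print(fpath, '<-', url)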