working pretty well
This commit is contained in:
parent
9139497871
commit
67618e443f
|
@ -1,3 +1,5 @@
|
|||
# vitalsource2pdf
|
||||
|
||||
sudo apt-get install python3-tk python3-dev
|
||||
|
||||
unset SNAP_NAME; unset SNAP_INSTANCE_NAME
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
selenium
|
||||
webdriver-manager
|
||||
requests
|
||||
pyshadow
|
||||
tqdm
|
||||
tqdm
|
||||
pillow
|
||||
pyautogui
|
||||
selenium-wire
|
|
@ -2,6 +2,8 @@ import argparse
|
|||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from PIL import Image
|
||||
import pyautogui
|
||||
import selenium
|
||||
from selenium.webdriver import ActionChains, Keys
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
|
@ -12,59 +14,90 @@ from seleniumwire import webdriver
|
|||
from tqdm import tqdm
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
from vitalsource_scraper.file import download_file
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--output', default='./VitalSource/')
|
||||
parser.add_argument('--isbn', required=True)
|
||||
parser.add_argument('--delay', default=8, type=int, help='Delay between pages to let them load.')
|
||||
parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load.')
|
||||
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')
|
||||
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
|
||||
args = parser.parse_args()
|
||||
|
||||
args.output = Path(args.output)
|
||||
args.output.mkdir(exist_ok=True, parents=True)
|
||||
ebook_output = args.output / f'{args.isbn}.pdf'
|
||||
ebook_files = args.output / args.isbn
|
||||
ebook_files.mkdir(exist_ok=True, parents=True)
|
||||
saved_cookies = args.output / 'cookies.pkl'
|
||||
|
||||
# driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
|
||||
# version = read_version_from_cmd('/usr/bin/chromium-browser --version', PATTERN[ChromeType.CHROMIUM])
|
||||
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager(version='111.0.5563.64', chrome_type=ChromeType.CHROMIUM).install()))
|
||||
|
||||
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
|
||||
|
||||
driver.get(f'https://bookshelf.vitalsource.com')
|
||||
input('Press ENTER once logged in...')
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)})
|
||||
if args.disable_web_security:
|
||||
options.add_argument('--disable-web-security')
|
||||
print('DISABLED WEB SECURITY!')
|
||||
options.add_argument('--disable-http2') # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
|
||||
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options)
|
||||
|
||||
driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/0')
|
||||
while True:
|
||||
try:
|
||||
num_pages = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').replace(' ', '').split('/')[1]) if not args.pages else args.pages
|
||||
break
|
||||
except selenium.common.exceptions.JavascriptException:
|
||||
time.sleep(1)
|
||||
print('Total number of pages:', num_pages)
|
||||
if not saved_cookies.is_file():
|
||||
driver.get(f'https://bookshelf.vitalsource.com')
|
||||
input('Press ENTER once logged in...')
|
||||
|
||||
|
||||
def load_page(page_id):
|
||||
# else:
|
||||
# driver.get(f'https://bookshelf.vitalsource.com')
|
||||
# driver.execute_script('window.stop()')
|
||||
# cookies = pickle.load(open(saved_cookies, 'rb'))
|
||||
# for cookie in cookies:
|
||||
# print(cookie)
|
||||
# driver.add_cookie(cookie)
|
||||
|
||||
# Save cookies now that we're in
|
||||
# pickle.dump(driver.get_cookies(), open(saved_cookies, "wb"))
|
||||
|
||||
def get_num_pages():
    """Read the reader's page counter and return ``(page, total_pages)``.

    ``total_pages`` is the integer found after the last ``/`` in the
    innerHTML of the page-counter element.  ``page`` is the raw value of the
    page-number input box, or ``0`` when that value is empty/falsy or the
    input element cannot be read.

    The call blocks until the counter element can be read: every time the
    counter script raises a ``JavascriptException`` the function sleeps for
    one second and tries again.
    """
    while True:
        try:
            counter_text = driver.execute_script(
                'return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML'
            )
            total_pages = int(counter_text.strip().split('/')[-1].strip())
        except selenium.common.exceptions.JavascriptException:
            # Counter element is not available yet; wait and retry.
            time.sleep(1)
            continue

        try:
            # The input box may be empty, in which case the page falls back to 0.
            page = driver.execute_script(
                'return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value'
            ) or 0
        except selenium.common.exceptions.JavascriptException:
            page = 0

        return page, total_pages
|
||||
|
||||
|
||||
def load_book_page(page_id):
|
||||
driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
|
||||
|
||||
# Wait for the page to load
|
||||
while True:
|
||||
try:
|
||||
driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
|
||||
break
|
||||
except selenium.common.exceptions.JavascriptException:
|
||||
time.sleep(1)
|
||||
get_num_pages()
|
||||
while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
|
||||
time.sleep(1)
|
||||
time.sleep(
|
||||
1) # driver.execute_script( # https://github.com/eligrey/FileSaver.js/ # '(function(a,b){if("function"==typeof define&&define.amd)define([],b);else if("undefined"!=typeof exports)b();else{b(),a.FileSaver={exports:{}}.exports}})(this,function(){"use strict";function b(a,b){return"undefined"==typeof b?b={autoBom:!1}:"object"!=typeof b&&(console.warn("Deprecated: Expected third argument to be a object"),b={autoBom:!b}),b.autoBom&&/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(a.type)?new Blob(["\uFEFF",a],{type:a.type}):a}function c(a,b,c){var d=new XMLHttpRequest;d.open("GET",a),d.responseType="blob",d.onload=function(){g(d.response,b,c)},d.onerror=function(){console.error("could not download file")},d.send()}function d(a){var b=new XMLHttpRequest;b.open("HEAD",a,!1);try{b.send()}catch(a){}return 200<=b.status&&299>=b.status}function e(a){try{a.dispatchEvent(new MouseEvent("click"))}catch(c){var b=document.createEvent("MouseEvents");b.initMouseEvent("click",!0,!0,window,0,0,0,80,20,!1,!1,!1,!1,0,null),a.dispatchEvent(b)}}var f="object"==typeof window&&window.window===window?window:"object"==typeof self&&self.self===self?self:"object"==typeof global&&global.global===global?global:void 0,a=/Macintosh/.test(navigator.userAgent)&&/AppleWebKit/.test(navigator.userAgent)&&!/Safari/.test(navigator.userAgent),g=f.saveAs||("object"!=typeof window||window!==f?function(){}:"download"in HTMLAnchorElement.prototype&&!a?function(b,g,h){var i=f.URL||f.webkitURL,j=document.createElement("a");g=g||b.name||"download",j.download=g,j.rel="noopener","string"==typeof b?(j.href=b,j.origin===location.origin?e(j):d(j.href)?c(b,g,h):e(j,j.target="_blank")):(j.href=i.createObjectURL(b),setTimeout(function(){i.revokeObjectURL(j.href)},4E4),setTimeout(function(){e(j)},0))}:"msSaveOrOpenBlob"in navigator?function(f,g,h){if(g=g||f.name||"download","string"!=typeof f)navigator.msSaveOrOpenBlob(b(f,h),g);else if(d(f))c(f,g,h);else{var 
i=document.createElement("a");i.href=f,i.target="_blank",setTimeout(function(){e(i)})}}:function(b,d,e,g){if(g=g||open("","_blank"),g&&(g.document.title=g.document.body.innerText="downloading..."),"string"==typeof b)return c(b,d,e);var h="application/octet-stream"===b.type,i=/constructor/i.test(f.HTMLElement)||f.safari,j=/CriOS\/[\d]+/.test(navigator.userAgent);if((j||h&&i||a)&&"undefined"!=typeof FileReader){var k=new FileReader;k.onloadend=function(){var a=k.result;a=j?a:a.replace(/^data:[^;]*;/,"data:attachment/file;"),g?g.location.href=a:location=a,g=null},k.readAsDataURL(b)}else{var l=f.URL||f.webkitURL,m=l.createObjectURL(b);g?g.location=m:location.href=m,g=null,setTimeout(function(){l.revokeObjectURL(m)},4E4)}});f.saveAs=g.saveAs=g,"undefined"!=typeof module&&(module.exports=g)});')
|
||||
|
||||
|
||||
auth_headers = {}
|
||||
load_book_page(0)
|
||||
|
||||
_, total_pages = get_num_pages()
|
||||
print('Total number of pages:', total_pages)
|
||||
|
||||
# auth_headers = {}
|
||||
|
||||
|
||||
def form_header(input_headers):
    """Turn an iterable of indexable pairs into a plain ``dict``.

    For every item, ``item[0]`` becomes the key and ``item[1]`` the value.
    When a key occurs more than once, the last occurrence wins.
    """
    return {pair[0]: pair[1] for pair in input_headers}
|
||||
# def form_header(input_headers):
|
||||
# output = {}
|
||||
# for item in input_headers:
|
||||
# output[item[0]] = item[1]
|
||||
# # if output.get)
|
||||
# return output
|
||||
|
||||
|
||||
# cookies = driver.get_cookies()
|
||||
|
@ -74,41 +107,94 @@ def form_header(input_headers):
|
|||
# # s.cookies.set(cookie['name'], cookie['value'])
|
||||
# download_file('https://jigsaw.vitalsource.com/books/9781524976422/images/553246736447566b5831395573413731646d5371495171745037726a2f49564a6d574c424e424e793154513d0a/encrypted/2000', '/home/dpanzer/test.jpg', cookies=r_cookies)
|
||||
|
||||
img_sizes = [2000, 1600, 800]
|
||||
# img_sizes = [2000, 1600, 800]
|
||||
|
||||
page_urls = {None}
|
||||
all_images = []
|
||||
for page_num in tqdm(iterable=range(num_pages)):
|
||||
# driver.maximize_window()
|
||||
# screen_w, screen_h = pyautogui.size()
|
||||
|
||||
page_urls = set()
|
||||
page_num = 0
|
||||
bar = tqdm(total=total_pages)
|
||||
while page_num < total_pages + 1:
|
||||
img_data = None
|
||||
time.sleep(args.delay)
|
||||
for i in range(3):
|
||||
base_url = None
|
||||
headers = {}
|
||||
for i in range(60):
|
||||
retry_delay = 1
|
||||
base_url = None
|
||||
for page_retry in range(5): # retry the page max 5 times
|
||||
largest_size = 0
|
||||
for find_img_retry in range(3):
|
||||
for request in driver.requests:
|
||||
# print(request.headers)
|
||||
if request.response and request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
|
||||
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'): # request.response and
|
||||
base_url = request.url.split('/')
|
||||
del base_url[-1]
|
||||
base_url = '/'.join(base_url)
|
||||
if base_url in page_urls:
|
||||
break
|
||||
page_urls.add(base_url)
|
||||
headers = form_header(request.headers)
|
||||
print(headers)
|
||||
time.sleep(1)
|
||||
img_size = int(base_url[-1])
|
||||
if img_size > largest_size:
|
||||
# del base_url[-1]
|
||||
base_url = '/'.join(base_url)
|
||||
|
||||
# Wait for the image to load
|
||||
wait = 0
|
||||
while (not request.response or not request.response.body) and wait < 30:
|
||||
time.sleep(1)
|
||||
wait += 1
|
||||
print(wait)
|
||||
if not request.response or not request.response.body:
|
||||
bar.write(f'Image failed to load! Increase your delay. {request.url}')
|
||||
break
|
||||
|
||||
img_data = request.response.body
|
||||
page_urls.add(request.url)
|
||||
if img_size == 2000:
|
||||
break
|
||||
if base_url:
|
||||
del driver.requests
|
||||
download_file(base_url, '/home/dpanzer/test.jpg', headers=headers)
|
||||
tqdm.write(base_url)
|
||||
break
|
||||
else:
|
||||
tqdm.write(f'Failed to find image on page {page_num}, reloading.')
|
||||
load_page(page_num)
|
||||
time.sleep(20)
|
||||
bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}...')
|
||||
time.sleep(retry_delay)
|
||||
retry_delay += 1
|
||||
|
||||
if base_url:
|
||||
del driver.requests
|
||||
page, total_pages = get_num_pages()
|
||||
|
||||
# If this isn't a numbered page we will need to increment the page count
|
||||
try:
|
||||
int(page)
|
||||
except ValueError:
|
||||
total_pages = total_pages + 1
|
||||
bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
|
||||
bar.total = total_pages
|
||||
bar.refresh()
|
||||
|
||||
dl_file = ebook_files / f'{page}.jpg'
|
||||
with open(dl_file, 'wb') as file:
|
||||
file.write(img_data)
|
||||
|
||||
# Re-save the image to make sure it's in the correct format
|
||||
img = Image.open(dl_file)
|
||||
img.save(dl_file, format='JPEG', subsampling=0, quality=100)
|
||||
del img
|
||||
|
||||
# Other download method.
|
||||
# driver.execute_script(f'window.open("{base_url}","_blank");')
|
||||
# pyautogui.moveTo(screen_h / 2, screen_w / 2)
|
||||
# pyautogui.hotkey('ctrl', 's')
|
||||
# time.sleep(10)
|
||||
# pyautogui.write(f'{page}.jpg')
|
||||
# time.sleep(10)
|
||||
# pyautogui.press('enter')
|
||||
# time.sleep(10)
|
||||
# pyautogui.hotkey('ctrl', 'w')
|
||||
|
||||
bar.write(base_url)
|
||||
else:
|
||||
tqdm.write(f'Failed to find image on page {page_num}, wait 20s...')
|
||||
load_book_page(page_num)
|
||||
time.sleep(20)
|
||||
|
||||
actions = ActionChains(driver)
|
||||
actions.send_keys(Keys.RIGHT)
|
||||
actions.perform()
|
||||
bar.update()
|
||||
page_num += 1
|
||||
|
||||
driver.close()
|
||||
bar.close()
|
||||
|
|
|
@ -1,14 +1,58 @@
|
|||
import requests
|
||||
from urllib.parse import urlparse
|
||||
import os
|
||||
import os
|
||||
from mimetypes import guess_extension
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def download_file(url, full_output_path, headers):
    """Download ``url`` to ``full_output_path`` and return that path.

    ``headers`` is passed through to ``requests.get``.  The request is made
    with ``stream=True`` and the body is written in 8192-byte chunks.
    ``raise_for_status()`` runs before the output file is opened, so an HTTP
    error status raises before anything is written.
    """
    response = requests.get(url, stream=True, headers=headers)
    with response:
        response.raise_for_status()
        with open(full_output_path, 'wb') as out_file:
            for piece in response.iter_content(chunk_size=8192):
                out_file.write(piece)
    return full_output_path
|
||||
def download_assets(requests,
                    asset_dir="temp",
                    default_fname="unnamed",
                    skip_domains=("facebook", "google", "yahoo", "agkn", "2mdn"),
                    exts=(".png", ".jpeg", ".jpg", ".svg", ".gif", ".pdf", ".bmp", ".webp", ".ico"),
                    append_ext=False):
    """Write the response bodies of captured requests to disk.

    Args:
        requests: Iterable of captured request objects (presumably
            selenium-wire requests -- confirm against the caller).  Each
            entry must expose ``url`` and ``response``; the response must
            expose ``headers`` (supporting ``in`` and ``[]``) and ``body``
            (bytes).  ``None`` entries are skipped.  Note that this
            parameter shadows the ``requests`` module inside the function.
        asset_dir: Root directory the files are written under.
        default_fname: Name stem used when the URL path has no file name.
        skip_domains: Substrings; a request whose host contains any of them
            is ignored.
        exts: Allowed file extensions (as returned by
            ``mimetypes.guess_extension`` for the response Content-Type).
        append_ext: When true, append ``_{default_fname}{ext}`` to paths that
            do not already end with the guessed extension.

    Returns:
        A dict mapping each target file path to the URL it was taken from.
        An entry is recorded for every attempted write, including writes
        that failed.  Files that already exist are skipped and not listed.
    """
    # Local import: the function needs datetime for generated file names and
    # must not rely on the surrounding module having imported it.
    import datetime

    asset_list = {}
    for req_idx, request in enumerate(requests):
        # request.response.body is the raw response body in bytes
        if (request is None
                or request.response is None
                or request.response.headers is None
                or 'Content-Type' not in request.response.headers):
            continue

        content_type = request.response.headers['Content-Type'].split(';')[0].strip()
        ext = guess_extension(content_type)
        if not ext or ext not in exts:
            # Don't know the file extension, or not in the whitelist
            continue

        parsed_url = urlparse(request.url)
        if any(domain in parsed_url.netloc for domain in skip_domains):
            continue

        frelpath = parsed_url.path.strip()
        if frelpath == "" or frelpath.endswith(("\\", "/")):
            # No file name in the URL: synthesize one from index + timestamp.
            timestamp = datetime.datetime.now().replace(microsecond=0).isoformat()
            frelpath = frelpath + f"{default_fname}_{req_idx}_{timestamp}{ext}"
        elif append_ext and not frelpath.endswith(ext):
            frelpath = frelpath + f"_{default_fname}{ext}"  # Missing file extension but may not be a problem

        # Strip *all* leading separators.  A path that still starts with a
        # separator is absolute, and os.path.join would then discard
        # asset_dir and the host directory entirely.
        frelpath = frelpath.lstrip("\\/")

        fpath = os.path.join(asset_dir, parsed_url.netloc, frelpath)
        if os.path.isfile(fpath):
            continue
        os.makedirs(os.path.dirname(fpath), exist_ok=True)
        print(f"Downloading {request.url} to {fpath}")
        asset_list[fpath] = request.url
        try:
            with open(fpath, "wb") as file:
                file.write(request.response.body)
        except Exception:
            # Best effort: report and move on, but let KeyboardInterrupt /
            # SystemExit propagate (a bare except would swallow them).
            print(f"Cannot download {request.url} to {fpath}")
    return asset_list
|
||||
|
|
Loading…
Reference in New Issue