vitalsource2pdf/vitalsource2pdf.py

173 lines
6.6 KiB
Python
Executable File

import argparse
import time
from pathlib import Path
import selenium
from PIL import Image
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# from webdriver_manager.firefox import GeckoDriverManager
# from selenium.webdriver.firefox.service import Service as FirefoxService
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load in seconds.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.') # TODO
parser.add_argument('--start-page', default=0, type=int, help='Start on this page. Pages start at zero and include any non-numbered pages.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
args = parser.parse_args()
args.output = Path(args.output)
args.output.mkdir(exist_ok=True, parents=True)
ebook_output = args.output / f'{args.isbn}.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)})
if args.disable_web_security:
options.add_argument('--disable-web-security')
print('DISABLED WEB SECURITY!')
options.add_argument('--disable-http2') # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options)
driver.get(f'https://bookshelf.vitalsource.com')
input('Press ENTER once logged in...')
def get_num_pages():
while True:
try:
total = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
try:
# This element may be empty so just set it to 0
current_page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
if current_page == '' or not current_page:
current_page = 0
except selenium.common.exceptions.JavascriptException:
current_page = 0
return current_page, total
except selenium.common.exceptions.JavascriptException:
time.sleep(1)
def load_book_page(page_id):
driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
get_num_pages() # Wait for the page to load
while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
time.sleep(1)
page_num = args.start_page
load_book_page(page_num)
_, total_pages = get_num_pages()
print('Total number of pages:', total_pages)
page_urls = set()
failed_pages = set()
small_pages_redo = set()
bar = tqdm(total=total_pages)
bar.update(page_num)
while page_num < total_pages + 1:
time.sleep(args.delay)
img_data = None
retry_delay = 5
base_url = None
for page_retry in range(3): # retry the page max this many times
largest_size = 0
for find_img_retry in range(3):
for request in driver.requests:
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
# Wait for the image to load
wait = 0
while (not request.response or not request.response.body) and wait < 60:
time.sleep(1)
wait += 1
if not request.response or not request.response.body:
bar.write(f'Page {page_num} failed to load, will retry later. {request.url}')
failed_pages.add(page_num)
break
base_url = request.url.split('/')
try:
img_size = int(base_url[-1])
except ValueError:
bar.write(f'Failed to parse URL for page {page_num}, retrying later: {request.url}')
failed_pages.add(page_num)
break
if img_size > largest_size:
base_url = '/'.join(base_url)
img_data = request.response.body
page_urls.add(request.url)
# 2000 is the max size I've seen so we can just exit if it's that.
if img_size == 2000:
break
time.sleep(1)
if base_url:
break
bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
time.sleep(retry_delay)
retry_delay += 5
if not img_data:
bar.write(f'Failed to download image for page {page_num}, retrying later.')
failed_pages.add(page_num)
elif not base_url:
bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
failed_pages.add(page_num)
else:
page, _ = get_num_pages()
# If this isn't a numbered page we will need to increment the page count
try:
int(page)
except ValueError:
total_pages += 1
bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
bar.total = total_pages
bar.refresh()
dl_file = ebook_files / f'{page}.jpg'
with open(dl_file, 'wb') as file:
file.write(img_data)
# Re-save the image to make sure it's in the correct format
img = Image.open(dl_file)
if img.width != 2000:
bar.write(f'Page {page_num} is only {img.width}px wide, will search for a larger image later.')
small_pages_redo.add(page_num)
img.save(dl_file, format='JPEG', subsampling=0, quality=100)
del img
bar.write(base_url)
# Move to the next page
del driver.requests
actions = ActionChains(driver)
actions.send_keys(Keys.RIGHT)
actions.perform()
bar.update()
page_num += 1
# TODO: redo failed pages in failed_pages
# TODO:
driver.close()
bar.close()
# TODO: maybe scrape book title to name the PDF file?
# TODO: also maybe embed the title in the PDF file?
# TODO: make PDF
# TODO: scrape table of contents and insert
# TODO: https://stackoverflow.com/questions/29657237/tesseract-ocr-pdf-as-input