#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

import img2pdf
import selenium
from PIL import Image
from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from pagelabels import PageLabelScheme, PageLabels
from pdfrw import PdfReader as pdfrw_reader
from pdfrw import PdfWriter as pdfrw_writer
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

from fucts.roman import move_romans_to_front, roman_sort_with_ints, try_convert_int

parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages, in seconds, to let them load.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')  # TODO
parser.add_argument('--start-page', default=0, type=int, help='Start on this page. Pages start at zero and include any non-numbered pages.')
parser.add_argument('--end-page', default=-1, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
parser.add_argument('--skip-scrape', action='store_true', help="Don't scrape anything, just re-build the PDF from existing files.")
parser.add_argument('--only-scrape-metadata', action='store_true', help="Similar to --skip-scrape, but only scrape the metadata.")
parser.add_argument('--skip-ocr', action='store_true', help="Don't do any OCR.")
parser.add_argument('--compress', action='store_true', help="Run compression and optimization. Probably won't do much, as there isn't a lot of extra compression to be had.")
args = parser.parse_args()
args.output = Path(args.output)
args.output.mkdir(exist_ok=True, parents=True)
# ebook_output = args.output / f'{args.isbn}.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)

book_info = {}
non_number_pages = 0
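

# Helpers for driving the VitalSource web reader. Both poll the reader's DOM
# through driver.execute_script() because the page renders asynchronously and
# the elements may not exist yet when we first ask for them.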
def get_num_pages():
    while True:
        try:
            total = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
            try:
                # Get the value of the page number textbox
                current_page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
                if current_page == '' or not current_page:
                    # This element may be empty so just set it to 0
                    current_page = 0
            except selenium.common.exceptions.JavascriptException:
                current_page = 0
            return current_page, total
        except selenium.common.exceptions.JavascriptException:
            time.sleep(1)


def load_book_page(page_id):
    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
    get_num_pages()  # Wait for the page to load
    # Wait for the page loader animation to disappear
    while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
        time.sleep(1)

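
# The whole scrape runs through selenium-wire: driver.requests exposes the
# reader's own network traffic, so page images and metadata are read straight
# out of the responses the reader requested. Logging in is done manually.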
if not args.skip_scrape or args.only_scrape_metadata:
    chrome_options = webdriver.ChromeOptions()
    if args.disable_web_security:
        chrome_options.add_argument('--disable-web-security')
        print('DISABLED WEB SECURITY!')
    chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
    if args.chrome_exe:
        chrome_options.binary_location = args.chrome_exe  # '/usr/bin/google-chrome'
    seleniumwire_options = {
        'disable_encoding': True  # Ask the server not to compress the response
    }
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options, seleniumwire_options=seleniumwire_options)

    driver.get('https://bookshelf.vitalsource.com')
    input('Press ENTER once logged in...')

    driver.maximize_window()
    page_num = args.start_page
    load_book_page(page_num)

    # Get book info
    print('Scraping metadata...')
    time.sleep(args.delay * 2)
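
    # The reader fetches its page list, book info, and TOC from
    # jigsaw.vitalsource.com. Read those responses out of the intercepted
    # requests, retrying a few times if any of them are missing.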
    failed = True
    for i in range(5):
        for request in driver.requests:
            if request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/pages':
                wait = 0
                while not request.response and wait < 30:
                    time.sleep(1)
                    wait += 1
                if not request.response or not request.response.body:
                    print('Failed to get pages information.')
                else:
                    book_info['pages'] = json.loads(request.response.body.decode())
            elif request.url == f'https://jigsaw.vitalsource.com/info/books.json?isbns={args.isbn}':
                wait = 0
                while not request.response and wait < 30:
                    time.sleep(1)
                    wait += 1
                if not request.response or not request.response.body:
                    print('Failed to get book information.')
                else:
                    book_info['book'] = json.loads(request.response.body.decode())
            elif request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/toc':
                wait = 0
                while not request.response and wait < 30:
                    time.sleep(1)
                    wait += 1
                if not request.response or not request.response.body:
                    print('Failed to get TOC information, only got:', list(book_info.keys()))
                else:
                    book_info['toc'] = json.loads(request.response.body.decode())
        if 'pages' not in book_info.keys() or 'book' not in book_info.keys() or 'toc' not in book_info.keys():
            print('Missing some book data, only got:', list(book_info.keys()))
        else:
            failed = False
        if not failed:
            break
        print('Retrying metadata scrape in 10s...')
        load_book_page(page_num)
        time.sleep(10)

    if args.only_scrape_metadata:
        driver.close()
        del driver

    if not args.only_scrape_metadata:
        _, total_pages = get_num_pages()

        if args.start_page > 0:
            print('You specified a start page so ignore the very large page count.')
        total_pages = 99999999999999999 if args.start_page > 0 else total_pages
        print('Total number of pages:', total_pages)
        print('Scraping pages...')
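
        # Walk the book one page at a time: record the base URL of the page
        # image the reader requested from jigsaw.vitalsource.com, then send a
        # right-arrow keypress to advance. Non-numbered pages (cover, front
        # matter) grow the expected page count as they are found.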
        page_urls = set()
        failed_pages = set()
        small_pages_redo = set()
        bar = tqdm(total=total_pages)
        bar.update(page_num)
        while page_num < total_pages + 1:
            time.sleep(args.delay)
            retry_delay = 5
            base_url = None
            for page_retry in range(3):  # retry the page max this many times
                largest_size = 0
                for find_img_retry in range(3):
                    for request in driver.requests:
                        if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                            base_url = request.url.split('/')
                            del base_url[-1]
                            base_url = '/'.join(base_url)
                    time.sleep(1)
                if base_url:
                    break
                bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
                time.sleep(retry_delay)
                retry_delay += 5
            page, _ = get_num_pages()
            if not base_url:
                bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
                failed_pages.add(page_num)
            else:
                page_urls.add((page, base_url))
                bar.write(base_url)
                # If this isn't a numbered page we will need to increment the page count
                try:
                    int(page)
                except ValueError:
                    total_pages += 1
                    non_number_pages += 1
                    bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
                    bar.total = total_pages
                    bar.refresh()
            if page_num == args.end_page:
                bar.write(f'Exiting on page {page_num}.')
                break
            # On the first page the back arrow is disabled and will trigger this
            if isinstance(page_num, int) and page_num > 0:
                try:
                    # If a page forward/backwards button is disabled
                    if driver.execute_script('return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'):
                        bar.write('Book completed, exiting.')
                        break
                except selenium.common.exceptions.JavascriptException:
                    pass
            # Move to the next page
            del driver.requests
            actions = ActionChains(driver)
            actions.send_keys(Keys.RIGHT)
            actions.perform()
            bar.update()
            page_num += 1
        bar.close()

        print('Re-doing failed pages...')
        bar = tqdm(total=len(failed_pages))
        for page in failed_pages:
            load_book_page(page)
            time.sleep(args.delay)
            retry_delay = 5
            base_url = None
            for page_retry in range(3):  # retry the page max this many times
                largest_size = 0
                for find_img_retry in range(3):
                    for request in driver.requests:
                        if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                            base_url = request.url.split('/')
                            del base_url[-1]
                            base_url = '/'.join(base_url)
                    time.sleep(1)
                if base_url:
                    break
                bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
                time.sleep(retry_delay)
                retry_delay += 5
            page, _ = get_num_pages()
            if not base_url:
                bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
                failed_pages.add(page_num)
            else:
                page_urls.add((page, base_url))
                bar.write(base_url)
            del driver.requests
            bar.update(1)
        bar.close()

        time.sleep(1)
        print('All pages scraped! Now downloading images...')
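
        # For each recorded base URL, load '<base_url>/2000' to request a
        # 2000px-wide render of the page, save the intercepted response body
        # as a JPEG, and retry via the reader if the image comes back smaller.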
        bar = tqdm(total=len(page_urls))
        for page, base_url in page_urls:
            success = False
            for retry in range(6):
                del driver.requests
                time.sleep(args.delay / 2)
                driver.get(f'{base_url.strip("/")}/2000')
                time.sleep(args.delay / 2)
                retry_delay = 5
                img_data = None
                for page_retry in range(3):  # retry the page max this many times
                    largest_size = 0
                    for find_img_retry in range(3):
                        for request in driver.requests:
                            if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                                img_data = request.response.body
                                break
                dl_file = ebook_files / f'{page}.jpg'
                if img_data:
                    with open(dl_file, 'wb') as file:
                        file.write(img_data)
                    # Re-save the image to make sure it's in the correct format
                    img = Image.open(dl_file)
                    if img.width != 2000:
                        bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
                        driver.get('https://google.com')
                        time.sleep(8)
                        load_book_page(0)
                        time.sleep(8)
                        continue
                    img.save(dl_file, format='JPEG', subsampling=0, quality=100)
                    del img
                    success = True
                if success:
                    break
            if not success:
                bar.write(f'Failed to download image: {base_url}')
            bar.update()
        bar.close()
        driver.close()
        del driver
else:
    print('Page scrape skipped...')

# Sometimes the book skips a page. Add a blank page if that's the case.
print('Checking for blank pages...')
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
if non_number_pages == 0:  # We might not have scraped, so this number needs to be updated.
    for item in existing_page_files:
        if isinstance(try_convert_int(item), str):
            non_number_pages += 1
for page in tqdm(iterable=existing_page_files):
    page_i = try_convert_int(page)
    if isinstance(page_i, int) and page_i > 0:
        page_i += non_number_pages
        last_page_i = try_convert_int(existing_page_files[page_i - 1])
        if isinstance(last_page_i, int):
            last_page_i = last_page_i + non_number_pages
        if last_page_i != page_i - 1:
            img = Image.new('RGB', (2000, 2588), (255, 255, 255))
            img.save(ebook_files / f'{int(page) - 1}.jpg')
            tqdm.write(f'Created blank image for page {int(page) - 1}.')

print('Building PDF...')
raw_pdf_file = args.output / f'{args.isbn} RAW.pdf'
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
page_files = [str(ebook_files / f'{x}.jpg') for x in existing_page_files]
pdf = img2pdf.convert(page_files)
with open(raw_pdf_file, 'wb') as f:
    f.write(pdf)

if 'book' in book_info.keys() and 'books' in book_info['book'].keys() and len(book_info['book']['books']):
    title = book_info['book']['books'][0]['title']
    author = book_info['book']['books'][0]['author']
else:
    title = args.isbn
    author = 'Unknown'

if not args.skip_ocr:
    print('Running OCR...')
    ocr_in = raw_pdf_file
    _, raw_pdf_file = tempfile.mkstemp()
    subprocess.run(f'ocrmypdf -l {args.language} --title "{title}" --jobs $(nproc) --output-type pdfa "{ocr_in}" "{raw_pdf_file}"', shell=True)
else:
    ebook_output_ocr = args.output / f'{args.isbn}.pdf'
    print('Skipping OCR...')

# Add metadata
print('Adding metadata...')
file_in = open(raw_pdf_file, 'rb')
pdf_reader = PdfReader(file_in)
pdf_merger = PdfMerger()
pdf_merger.append(file_in)
pdf_merger.add_metadata({'/Author': author, '/Title': title, '/Creator': f'ISBN: {args.isbn}'})
if 'toc' in book_info.keys():
    print('Creating TOC...')
    for item in book_info['toc']:
        pdf_merger.add_outline_item(item['title'], int(item['cfi'].strip('/')) - 1)
else:
    print('Not creating TOC...')

_, tmpfile = tempfile.mkstemp()
pdf_merger.write(open(tmpfile, 'wb'))
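
# Count the non-numbered (string-named) front-matter pages so PDF page labels
# can be applied: page 0 is labeled "Cover", the front matter gets lowercase
# roman numerals, and the remaining pages restart at arabic 1.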
romans_end = 0
for p in existing_page_files:
    if isinstance(p, str):
        romans_end += 1

if romans_end > 0:
    print('Renumbering pages...')
    reader = pdfrw_reader(tmpfile)
    labels = PageLabels.from_pdf(reader)
    roman_labels = PageLabelScheme(
        startpage=0,
        style='none',
        prefix='Cover',
        firstpagenum=1
    )
    labels.append(roman_labels)
    roman_labels = PageLabelScheme(
        startpage=1,
        style='roman lowercase',
        firstpagenum=1
    )
    labels.append(roman_labels)
    normal_labels = PageLabelScheme(
        startpage=romans_end,
        style='arabic',
        firstpagenum=1
    )
    labels.append(normal_labels)
    labels.write(reader)
    writer = pdfrw_writer()
    writer.trailer = reader
    writer.write(args.output / f'{title}.pdf')
else:
    shutil.move(tmpfile, args.output / f'{title}.pdf')
if os.path.exists(tmpfile):  # shutil.move() already consumed the temp file in the else branch
    os.remove(tmpfile)

if args.compress:
    print('Compressing PDF...')
    # https://pypdf2.readthedocs.io/en/latest/user/file-size.html
    reader = PdfReader(args.output / f'{title}.pdf')
    writer = PdfWriter()
    for page in reader.pages:
        page.compress_content_streams()  # This is CPU intensive!
        writer.add_page(page)
    with open(args.output / f'{title} compressed.pdf', 'wb') as f:
        writer.write(f)