#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

import img2pdf
import selenium
from PIL import Image
from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from pagelabels import PageLabelScheme, PageLabels
from pdfrw import PdfReader as pdfrw_reader
from pdfrw import PdfWriter as pdfrw_writer
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

from fucts.roman import move_romans_to_front, roman_sort_with_ints, try_convert_int

parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages, in seconds, to let them load.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')  # TODO
parser.add_argument('--start-page', default=0, type=int, help='Start on this page. Pages start at zero and include any non-numbered pages.')
parser.add_argument('--end-page', default=-1, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
parser.add_argument('--skip-scrape', action='store_true', help="Don't scrape anything, just re-build the PDF from existing files.")
parser.add_argument('--only-scrape-metadata', action='store_true', help="Similar to --skip-scrape, but only scrape the metadata.")
parser.add_argument('--skip-ocr', action='store_true', help="Don't do any OCR.")
parser.add_argument('--compress', action='store_true', help="Run compression and optimization. Probably won't do much, as there isn't a lot of extra compression to be had.")
args = parser.parse_args()
args.output = Path(args.output)
args.output.mkdir(exist_ok=True, parents=True)
# ebook_output = args.output / f'{args.isbn}.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)

book_info = {}
non_number_pages = 0
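

# Helpers for driving the VitalSource web reader. Both poll the reader's DOM
# through driver.execute_script() because the page renders asynchronously and
# the elements may not exist yet when we first ask for them.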
def get_num_pages():
    while True:
        try:
            total = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
            try:
                # Get the value of the page number textbox
                current_page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
                if current_page == '' or not current_page:
                    # This element may be empty so just set it to 0
                    current_page = 0
            except selenium.common.exceptions.JavascriptException:
                current_page = 0
            return current_page, total
        except selenium.common.exceptions.JavascriptException:
            time.sleep(1)


def load_book_page(page_id):
    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
    get_num_pages()  # Wait for the page to load
    # Wait for the page loader animation to disappear
    while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
        time.sleep(1)

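
# The whole scrape runs through selenium-wire: driver.requests exposes the
# reader's own network traffic, so page images and metadata are read straight
# out of the responses the reader requested. Logging in is done manually.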
if not args.skip_scrape or args.only_scrape_metadata:
    chrome_options = webdriver.ChromeOptions()
    if args.disable_web_security:
        chrome_options.add_argument('--disable-web-security')
        print('DISABLED WEB SECURITY!')
    chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
    if args.chrome_exe:
        chrome_options.binary_location = args.chrome_exe  # '/usr/bin/google-chrome'
    seleniumwire_options = {
        'disable_encoding': True  # Ask the server not to compress the response
    }
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options, seleniumwire_options=seleniumwire_options)

    driver.get('https://bookshelf.vitalsource.com')
    input('Press ENTER once logged in...')

    driver.maximize_window()
    page_num = args.start_page
    load_book_page(page_num)

    # Get book info
    print('Scraping metadata...')
    time.sleep(args.delay * 2)
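
    # The reader fetches its page list, book info, and TOC from
    # jigsaw.vitalsource.com. Read those responses out of the intercepted
    # requests, retrying a few times if any of them are missing.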
    failed = True
    for i in range(5):
        for request in driver.requests:
            if request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/pages':
                wait = 0
                while not request.response and wait < 30:
                    time.sleep(1)
                    wait += 1
                if not request.response or not request.response.body:
                    print('Failed to get pages information.')
                else:
                    book_info['pages'] = json.loads(request.response.body.decode())
            elif request.url == f'https://jigsaw.vitalsource.com/info/books.json?isbns={args.isbn}':
                wait = 0
                while not request.response and wait < 30:
                    time.sleep(1)
                    wait += 1
                if not request.response or not request.response.body:
                    print('Failed to get book information.')
                else:
                    book_info['book'] = json.loads(request.response.body.decode())
            elif request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/toc':
                wait = 0
                while not request.response and wait < 30:
                    time.sleep(1)
                    wait += 1
                if not request.response or not request.response.body:
                    print('Failed to get TOC information, only got:', list(book_info.keys()))
                else:
                    book_info['toc'] = json.loads(request.response.body.decode())
        if 'pages' not in book_info.keys() or 'book' not in book_info.keys() or 'toc' not in book_info.keys():
            print('Missing some book data, only got:', list(book_info.keys()))
        else:
            failed = False
        if not failed:
            break
        print('Retrying metadata scrape in 10s...')
        load_book_page(page_num)
        time.sleep(10)

    if args.only_scrape_metadata:
        driver.close()
        del driver

    if not args.only_scrape_metadata:
        _, total_pages = get_num_pages()

        if args.start_page > 0:
            print('You specified a start page so ignore the very large page count.')
        total_pages = 99999999999999999 if args.start_page > 0 else total_pages
        print('Total number of pages:', total_pages)
        print('Scraping pages...')
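
        # Walk the book one page at a time: record the base URL of the page
        # image the reader requested from jigsaw.vitalsource.com, then send a
        # right-arrow keypress to advance. Non-numbered pages (cover, front
        # matter) grow the expected page count as they are found.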
        page_urls = set()
        failed_pages = set()
        small_pages_redo = set()
        bar = tqdm(total=total_pages)
        bar.update(page_num)
        while page_num < total_pages + 1:
            time.sleep(args.delay)
            retry_delay = 5
            base_url = None
            for page_retry in range(3):  # retry the page max this many times
                largest_size = 0
                for find_img_retry in range(3):
                    for request in driver.requests:
                        if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                            base_url = request.url.split('/')
                            del base_url[-1]
                            base_url = '/'.join(base_url)
                    time.sleep(1)
                if base_url:
                    break
                bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
                time.sleep(retry_delay)
                retry_delay += 5
            page, _ = get_num_pages()
            if not base_url:
                bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
                failed_pages.add(page_num)
            else:
                page_urls.add((page, base_url))
                bar.write(base_url)
                # If this isn't a numbered page we will need to increment the page count
                try:
                    int(page)
                except ValueError:
                    total_pages += 1
                    non_number_pages += 1
                    bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
                    bar.total = total_pages
                    bar.refresh()
            if page_num == args.end_page:
                bar.write(f'Exiting on page {page_num}.')
                break
            # On the first page the back arrow is disabled and will trigger this
            if isinstance(page_num, int) and page_num > 0:
                try:
                    # If a page forward/backwards button is disabled
                    if driver.execute_script('return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'):
                        bar.write('Book completed, exiting.')
                        break
                except selenium.common.exceptions.JavascriptException:
                    pass
            # Move to the next page
            del driver.requests
            actions = ActionChains(driver)
            actions.send_keys(Keys.RIGHT)
            actions.perform()
            bar.update()
            page_num += 1
        bar.close()

        print('Re-doing failed pages...')
        bar = tqdm(total=len(failed_pages))
        for page in failed_pages:
            load_book_page(page)
            time.sleep(args.delay)
            retry_delay = 5
            base_url = None
            for page_retry in range(3):  # retry the page max this many times
                largest_size = 0
                for find_img_retry in range(3):
                    for request in driver.requests:
                        if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                            base_url = request.url.split('/')
                            del base_url[-1]
                            base_url = '/'.join(base_url)
                    time.sleep(1)
                if base_url:
                    break
                bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
                time.sleep(retry_delay)
                retry_delay += 5
            page, _ = get_num_pages()
            if not base_url:
                bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
                failed_pages.add(page_num)
            else:
                page_urls.add((page, base_url))
                bar.write(base_url)
            del driver.requests
            bar.update(1)
        bar.close()

        time.sleep(1)
        print('All pages scraped! Now downloading images...')
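
        # For each recorded base URL, load '<base_url>/2000' to request a
        # 2000px-wide render of the page, save the intercepted response body
        # as a JPEG, and retry via the reader if the image comes back smaller.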
        bar = tqdm(total=len(page_urls))
        for page, base_url in page_urls:
            success = False
            for retry in range(6):
                del driver.requests
                time.sleep(args.delay / 2)
                driver.get(f'{base_url.strip("/")}/2000')
                time.sleep(args.delay / 2)
                retry_delay = 5
                img_data = None
                for page_retry in range(3):  # retry the page max this many times
                    largest_size = 0
                    for find_img_retry in range(3):
                        for request in driver.requests:
                            if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                                img_data = request.response.body
                                break
                dl_file = ebook_files / f'{page}.jpg'
                if img_data:
                    with open(dl_file, 'wb') as file:
                        file.write(img_data)
                    # Re-save the image to make sure it's in the correct format
                    img = Image.open(dl_file)
                    if img.width != 2000:
                        bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
                        driver.get('https://google.com')
                        time.sleep(8)
                        load_book_page(0)
                        time.sleep(8)
                        continue
                    img.save(dl_file, format='JPEG', subsampling=0, quality=100)
                    del img
                    success = True
                if success:
                    break
            if not success:
                bar.write(f'Failed to download image: {base_url}')
            bar.update()
        bar.close()
        driver.close()
        del driver
else:
    print('Page scrape skipped...')

# Sometimes the book skips a page. Add a blank page if that's the case.
print('Checking for blank pages...')
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
if non_number_pages == 0:  # We might not have scraped, so this number needs to be updated.
    for item in existing_page_files:
        if isinstance(try_convert_int(item), str):
            non_number_pages += 1
for page in tqdm(iterable=existing_page_files):
    page_i = try_convert_int(page)
    if isinstance(page_i, int) and page_i > 0:
        page_i += non_number_pages
        last_page_i = try_convert_int(existing_page_files[page_i - 1])
        if isinstance(last_page_i, int):
            last_page_i = last_page_i + non_number_pages
        if last_page_i != page_i - 1:
            img = Image.new('RGB', (2000, 2588), (255, 255, 255))
            img.save(ebook_files / f'{int(page) - 1}.jpg')
            tqdm.write(f'Created blank image for page {int(page) - 1}.')

print('Building PDF...')
raw_pdf_file = args.output / f'{args.isbn} RAW.pdf'
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
page_files = [str(ebook_files / f'{x}.jpg') for x in existing_page_files]
pdf = img2pdf.convert(page_files)
with open(raw_pdf_file, 'wb') as f:
    f.write(pdf)

if 'book' in book_info.keys() and 'books' in book_info['book'].keys() and len(book_info['book']['books']):
    title = book_info['book']['books'][0]['title']
    author = book_info['book']['books'][0]['author']
else:
    title = args.isbn
    author = 'Unknown'

if not args.skip_ocr:
    print('Running OCR...')
    ocr_in = raw_pdf_file
    _, raw_pdf_file = tempfile.mkstemp()
    subprocess.run(f'ocrmypdf -l {args.language} --title "{title}" --jobs $(nproc) --output-type pdfa "{ocr_in}" "{raw_pdf_file}"', shell=True)
else:
    ebook_output_ocr = args.output / f'{args.isbn}.pdf'
    print('Skipping OCR...')

# Add metadata
print('Adding metadata...')
file_in = open(raw_pdf_file, 'rb')
pdf_reader = PdfReader(file_in)
pdf_merger = PdfMerger()
pdf_merger.append(file_in)
pdf_merger.add_metadata({'/Author': author, '/Title': title, '/Creator': f'ISBN: {args.isbn}'})
if 'toc' in book_info.keys():
    print('Creating TOC...')
    for item in book_info['toc']:
        pdf_merger.add_outline_item(item['title'], int(item['cfi'].strip('/')) - 1)
else:
    print('Not creating TOC...')

_, tmpfile = tempfile.mkstemp()
pdf_merger.write(open(tmpfile, 'wb'))
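
# Count the non-numbered (string-named) front-matter pages so PDF page labels
# can be applied: page 0 is labeled "Cover", the front matter gets lowercase
# roman numerals, and the remaining pages restart at arabic 1.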
romans_end = 0
for p in existing_page_files:
    if isinstance(p, str):
        romans_end += 1

if romans_end > 0:
    print('Renumbering pages...')
    reader = pdfrw_reader(tmpfile)
    labels = PageLabels.from_pdf(reader)
    roman_labels = PageLabelScheme(
        startpage=0,
        style='none',
        prefix='Cover',
        firstpagenum=1
    )
    labels.append(roman_labels)
    roman_labels = PageLabelScheme(
        startpage=1,
        style='roman lowercase',
        firstpagenum=1
    )
    labels.append(roman_labels)
    normal_labels = PageLabelScheme(
        startpage=romans_end,
        style='arabic',
        firstpagenum=1
    )
    labels.append(normal_labels)
    labels.write(reader)
    writer = pdfrw_writer()
    writer.trailer = reader
    writer.write(args.output / f'{title}.pdf')
else:
    shutil.move(tmpfile, args.output / f'{title}.pdf')
if os.path.exists(tmpfile):  # shutil.move() already consumed the temp file in the else branch
    os.remove(tmpfile)

if args.compress:
    print('Compressing PDF...')
    # https://pypdf2.readthedocs.io/en/latest/user/file-size.html
    reader = PdfReader(args.output / f'{title}.pdf')
    writer = PdfWriter()
    for page in reader.pages:
        page.compress_content_streams()  # This is CPU intensive!
        writer.add_page(page)
    with open(args.output / f'{title} compressed.pdf', 'wb') as f:
        writer.write(f)