fix blank page issue, fix some bugs

Cyberes 2023-03-14 22:10:07 -06:00
parent aa48f24022
commit edeba3b981
2 changed files with 194 additions and 156 deletions

View File

@@ -6,14 +6,13 @@ This is an automated, all-in-one scraper to convert VitalSource textbooks into P
- Automated download of pages.
- Automated OCR.
- Correct page numbers (including Roman numerals at the beginning). There might be some issues with wierd page numbers at the end
of the book.
- Correct page numbering (including Roman numerals at the beginning).
- Table of contents creation.
- No funny stuff. No wierd endpoints are used and no hacky scraping is preformed.
- No funny stuff. No weird endpoints are used and no hacky scraping is performed.
- Almost completely transparent. All actions are ones that a normal user would do.
The goal of this project is for this to "just work." There are many other VitalSource scrapers out there that are wierd, poorly
designed, or are broken. I designed my scraper to be as simple while producing the highest-quality PDF possible.
The goal of this project is for this to "just work." There are many other VitalSource scrapers out there that are weird, poorly
designed, or broken. I designed my scraper to be simple while producing the highest-quality PDF possible.
## Install
@@ -22,8 +21,6 @@ sudo apt install ocrmypdf jbig2dec
pip install -r requirements.txt
```
[//]: # (You also need the JBIG2 encoder, which can either be [built from source](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html).)
Make sure you have Chrome installed. If you have both Chrome and Chromium, you can use `--chrome-exe` to specify the path to `google-chrome`.
The WebDriver binary will be downloaded automatically.
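
For reference, this is roughly how the scraper builds its browser session internally, shown as a minimal sketch based on the calls that appear later in this commit (selenium-wire plus webdriver-manager); the `binary_location` line is what `--chrome-exe` controls and the path shown is only a placeholder:

```python
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from seleniumwire import webdriver  # selenium-wire records the browser's network requests
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_argument('--disable-http2')  # the VitalSource HTTP/2 endpoint sometimes sends bad data
# chrome_options.binary_location = '/path/to/google-chrome'  # placeholder; this is what --chrome-exe sets

seleniumwire_options = {'disable_encoding': True}  # ask the server not to compress responses

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),
                          chrome_options=chrome_options,
                          seleniumwire_options=seleniumwire_options)
driver.get('https://bookshelf.vitalsource.com')
```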
@@ -42,6 +39,8 @@ If your network is slow, use `--delay` to allow more time for the files to downl
Make sure to leave the window maximized as the content scaling will mess with the scraper.
You may want to run the scraper two or three times to make sure you have downloaded all the pages.
### What This Scraper Doesn't Do
Guide you through step-by-step. You are expected to have the required technical knowledge and understand what
@@ -53,10 +52,4 @@ You will also have to double check the output PDF to make sure everything is as
This scraper uses Selenium to load the ebook viewer webpage. It then navigates through the book page by page and records network
requests. After each page it will analyze the requests and find one matching the format of the page image. It then saves
that request to a `.jpg`.
Once all images are downloaded, a PDF is created.
Then `pytesseract` is used to add text to the page images.
Finally, the table of contents is scraped and added to the PDF.
that request to a `.jpg`.
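
Roughly, the capture step for a single page looks like this minimal sketch (not the full script; `isbn`, `page`, and `ebook_files` stand in for values the real script derives from its command-line arguments):

```python
from pathlib import Path

from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys


def grab_page_image(driver, isbn: str, page: str, ebook_files: Path) -> bool:
    """Search the requests selenium-wire recorded for this page's image and save the body as a JPEG."""
    for request in driver.requests:
        if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{isbn}/images/'):
            if request.response and request.response.body:
                (ebook_files / f'{page}.jpg').write_bytes(request.response.body)
                return True
    return False


def turn_page(driver) -> None:
    """Clear the recorded requests, then press the right-arrow key so the viewer loads the next page."""
    del driver.requests
    ActionChains(driver).send_keys(Keys.RIGHT).perform()
```

Once every page image is on disk, the images are stitched into a raw PDF (the script uses `img2pdf` for this) and OCR is applied on top of it.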

View File

@@ -11,7 +11,7 @@ from pathlib import Path
import img2pdf
import selenium
from PIL import Image
from PyPDF2 import PdfMerger, PdfReader
from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from pagelabels import PageLabelScheme, PageLabels
from pdfrw import PdfReader as pdfrw_reader
from pdfrw import PdfWriter as pdfrw_writer
@@ -37,6 +37,7 @@ parser.add_argument('--language', default='eng', help='OCR language. Default: "e
parser.add_argument('--skip-scrape', action='store_true', help="Don't scrape anything, just re-build the PDF from existing files.")
parser.add_argument('--only-scrape-metadata', action='store_true', help="Similar to --skip-scrape, but only scrape the metadata.")
parser.add_argument('--skip-ocr', action='store_true', help="Don't do any OCR.")
parser.add_argument('--compress', action='store_true', help="Run compression and optimization.")
args = parser.parse_args()
args.output = Path(args.output)
@@ -46,6 +47,7 @@ ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)
book_info = {}
non_number_pages = 0
def get_num_pages():
@@ -53,9 +55,10 @@ def get_num_pages():
try:
total = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
try:
# This element may be empty so just set it to 0
# Get the value of the page number textbox
current_page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
if current_page == '' or not current_page:
# This element may be empty so just set it to 0
current_page = 0
except selenium.common.exceptions.JavascriptException:
current_page = 0
@@ -67,6 +70,7 @@ def get_num_pages():
def load_book_page(page_id):
driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
get_num_pages() # Wait for the page to load
# Wait for the page loader animation to disappear
while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
time.sleep(1)
@@ -79,8 +83,9 @@ if not args.skip_scrape or args.only_scrape_metadata:
chrome_options.add_argument('--disable-http2') # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
if args.chrome_exe:
chrome_options.binary_location = args.chrome_exe # '/usr/bin/google-chrome'
seleniumwire_options = {'disable_encoding': True # Ask the server not to compress the response
}
seleniumwire_options = {
'disable_encoding': True # Ask the server not to compress the response
}
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options, seleniumwire_options=seleniumwire_options)
driver.get(f'https://bookshelf.vitalsource.com')
@@ -92,7 +97,7 @@ if not args.skip_scrape or args.only_scrape_metadata:
# Get book info
print('Scraping metadata...')
failed = False
failed = True
for i in range(5):
for request in driver.requests:
if request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/pages':
@@ -102,7 +107,6 @@ if not args.skip_scrape or args.only_scrape_metadata:
wait += 1
if not request.response or not request.response.body:
print('Failed to get pages information.')
failed = True
else:
book_info['pages'] = json.loads(request.response.body.decode())
elif request.url == f'https://jigsaw.vitalsource.com/info/books.json?isbns={args.isbn}':
@@ -112,7 +116,6 @@ if not args.skip_scrape or args.only_scrape_metadata:
wait += 1
if not request.response or not request.response.body:
print('Failed to get book information.')
failed = True
else:
book_info['book'] = json.loads(request.response.body.decode())
elif request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/toc':
@@ -121,10 +124,13 @@ if not args.skip_scrape or args.only_scrape_metadata:
time.sleep(1)
wait += 1
if not request.response or not request.response.body:
print('Failed to get TOC information.')
failed = True
print('Failed to get TOC information, only got:', list(book_info.keys()))
else:
book_info['toc'] = json.loads(request.response.body.decode())
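# Only treat the metadata scrape as successful once the pages, book, and TOC responses have all been captured.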
if 'pages' not in book_info.keys() or 'book' not in book_info.keys() or 'toc' not in book_info.keys():
print('Missing some book data, only got:', list(book_info.keys()))
else:
failed = False
if not failed:
break
print('Retrying metadata scrape in 10s...')
@@ -135,154 +141,182 @@ if not args.skip_scrape or args.only_scrape_metadata:
driver.close()
del driver
if not args.skip_scrape and not args.only_scrape_metadata:
_, total_pages = get_num_pages()
total_pages = 99999999999999999 if args.start_page > 0 else total_pages
print('Total number of pages:', total_pages)
if not args.only_scrape_metadata:
_, total_pages = get_num_pages()
page_urls = set()
failed_pages = set()
small_pages_redo = set()
bar = tqdm(total=total_pages)
bar.update(page_num)
while page_num < total_pages + 1:
time.sleep(args.delay)
retry_delay = 5
base_url = None
for page_retry in range(3): # retry the page max this many times
largest_size = 0
for find_img_retry in range(3):
for request in driver.requests:
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
base_url = request.url.split('/')
del base_url[-1]
base_url = '/'.join(base_url)
time.sleep(1)
if base_url:
break
bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
time.sleep(retry_delay)
retry_delay += 5
print('You specified a start page so ignore the very large page count.')
total_pages = 99999999999999999 if args.start_page > 0 else total_pages
page, _ = get_num_pages()
if not base_url:
bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
failed_pages.add(page_num)
else:
page_urls.add((page, base_url))
bar.write(base_url)
# If this isn't a numbered page we will need to increment the page count
try:
int(page)
except ValueError:
total_pages += 1
bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
bar.total = total_pages
bar.refresh()
print('Total number of pages:', total_pages)
print('Scraping pages...')
if page_num == args.end_page:
bar.write(f'Exiting on page {page_num}.')
break
# On the first page the back arrow is disabled and will trigger this
if isinstance(page_num, int) and page_num > 0:
try:
if driver.execute_script(f'return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'): # not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
bar.write(f'Book completed, exiting.')
break
except selenium.common.exceptions.JavascriptException:
pass
# Move to the next page
del driver.requests
actions = ActionChains(driver)
actions.send_keys(Keys.RIGHT)
actions.perform()
bar.update()
page_num += 1
bar.close()
print('Re-doing failed pages...')
bar = tqdm(total=len(failed_pages))
for page in failed_pages:
load_book_page(page)
time.sleep(args.delay)
retry_delay = 5
base_url = None
for page_retry in range(3): # retry the page max this many times
largest_size = 0
for find_img_retry in range(3):
for request in driver.requests:
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
base_url = request.url.split('/')
del base_url[-1]
base_url = '/'.join(base_url)
time.sleep(1)
if base_url:
break
bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
time.sleep(retry_delay)
retry_delay += 5
page, _ = get_num_pages()
if not base_url:
bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
failed_pages.add(page_num)
else:
page_urls.add((page, base_url))
bar.write(base_url)
del driver.requests
time.sleep(1)
print('All pages scraped! Now downloading images...')
bar = tqdm(total=len(page_urls))
for page, base_url in page_urls:
success = False
for retry in range(6):
del driver.requests
time.sleep(args.delay)
driver.get(f'{base_url.strip("/")}/2000') # have to load the page first for cookies reasons
page_urls = set()
failed_pages = set()
small_pages_redo = set()
bar = tqdm(total=total_pages)
bar.update(page_num)
while page_num < total_pages + 1:
time.sleep(args.delay)
retry_delay = 5
img_data = None
base_url = None
for page_retry in range(3): # retry the page max this many times
largest_size = 0
for find_img_retry in range(3):
for request in driver.requests:
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
img_data = request.response.body
break
dl_file = ebook_files / f'{page}.jpg'
if img_data:
with open(dl_file, 'wb') as file:
file.write(img_data)
# Re-save the image to make sure it's in the correct format
img = Image.open(dl_file)
if img.width != 2000:
bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
driver.get('https://google.com')
time.sleep(8)
load_book_page(0)
time.sleep(8)
continue
img.save(dl_file, format='JPEG', subsampling=0, quality=100)
del img
success = True
if success:
base_url = request.url.split('/')
del base_url[-1]
base_url = '/'.join(base_url)
time.sleep(1)
if base_url:
break
bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
time.sleep(retry_delay)
retry_delay += 5
page, _ = get_num_pages()
if not base_url:
bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
failed_pages.add(page_num)
else:
page_urls.add((page, base_url))
bar.write(base_url)
# If this isn't a numbered page we will need to increment the page count
try:
int(page)
except ValueError:
total_pages += 1
non_number_pages += 1
bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
bar.total = total_pages
bar.refresh()
if page_num == args.end_page:
bar.write(f'Exiting on page {page_num}.')
break
if not success:
bar.write(f'Failed to download image: {base_url}')
bar.update()
bar.close()
driver.close()
del driver
# On the first page the back arrow is disabled and will trigger this
if isinstance(page_num, int) and page_num > 0:
try:
# If a page forward/backwards button is disabled
if driver.execute_script(f'return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'):
bar.write(f'Book completed, exiting.')
break
except selenium.common.exceptions.JavascriptException:
pass
# Move to the next page
del driver.requests
actions = ActionChains(driver)
actions.send_keys(Keys.RIGHT)
actions.perform()
bar.update()
page_num += 1
bar.close()
print('Re-doing failed pages...')
bar = tqdm(total=len(failed_pages))
for page in failed_pages:
load_book_page(page)
time.sleep(args.delay)
retry_delay = 5
base_url = None
for page_retry in range(3): # retry the page max this many times
largest_size = 0
for find_img_retry in range(3):
for request in driver.requests:
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
base_url = request.url.split('/')
del base_url[-1]
base_url = '/'.join(base_url)
time.sleep(1)
if base_url:
break
bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
time.sleep(retry_delay)
retry_delay += 5
page, _ = get_num_pages()
if not base_url:
bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
failed_pages.add(page_num)
else:
page_urls.add((page, base_url))
bar.write(base_url)
del driver.requests
bar.update(1)
bar.close()
time.sleep(1)
print('All pages scraped! Now downloading images...')
bar = tqdm(total=len(page_urls))
for page, base_url in page_urls:
success = False
for retry in range(6):
del driver.requests
time.sleep(args.delay / 2)
driver.get(f'{base_url.strip("/")}/2000')
time.sleep(args.delay / 2)
retry_delay = 5
img_data = None
for page_retry in range(3): # retry the page max this many times
largest_size = 0
for find_img_retry in range(3):
for request in driver.requests:
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
img_data = request.response.body
break
dl_file = ebook_files / f'{page}.jpg'
if img_data:
with open(dl_file, 'wb') as file:
file.write(img_data)
# Re-save the image to make sure it's in the correct format
img = Image.open(dl_file)
if img.width != 2000:
bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
driver.get('https://google.com')
time.sleep(8)
load_book_page(0)
time.sleep(8)
continue
img.save(dl_file, format='JPEG', subsampling=0, quality=100)
del img
success = True
if success:
break
if not success:
bar.write(f'Failed to download image: {base_url}')
bar.update()
bar.close()
driver.close()
del driver
else:
print('Page scrape skipped...')
# Sometimes the book skips a page. Add a blank page if that's the case.
print('Checking for blank pages...')
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
if non_number_pages == 0: # We might not have scraped, so this count needs to be rebuilt from the files on disk.
for item in existing_page_files:
if isinstance(try_convert_int(item), str):
non_number_pages += 1
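# Walk the sorted page list: if the entry expected just before a numbered page is missing from the
# sequence, that page was skipped during scraping, so write a blank white placeholder image to keep
# the final PDF's page order aligned.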
for page in tqdm(iterable=existing_page_files):
page_i = try_convert_int(page)
if isinstance(page_i, int) and page_i > 0:
page_i += non_number_pages
last_page_i = try_convert_int(existing_page_files[page_i - 1])
if isinstance(last_page_i, int):
last_page_i = last_page_i + non_number_pages
if last_page_i != page_i - 1:
img = Image.new('RGB', (2000, 2588), (255, 255, 255))
img.save(ebook_files / f'{int(page) - 1}.jpg')
tqdm.write(f'Created blank image for page {int(page) - 1}.')
print('Building PDF...')
raw_pdf_file = args.output / f'{args.isbn} RAW.pdf'
pages = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
page_files = [str(ebook_files / f'{x}.jpg') for x in pages]
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
page_files = [str(ebook_files / f'{x}.jpg') for x in existing_page_files]
pdf = img2pdf.convert(page_files)
with open(raw_pdf_file, 'wb') as f:
f.write(pdf)
@@ -323,7 +357,7 @@ _, tmpfile = tempfile.mkstemp()
pdf_merger.write(open(tmpfile, 'wb'))
romans_end = 0
for p in pages:
for p in existing_page_files:
if isinstance(p, str):
romans_end += 1
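# For context: romans_end presumably feeds the PDF page-label step later in the full script, which the
# pagelabels/pdfrw imports above support. A rough, hypothetical sketch of that step (call pattern taken
# from the pagelabels documentation, not from this diff):
#   reader = pdfrw_reader('book.pdf')  # hypothetical path
#   labels = PageLabels.from_pdf(reader)
#   labels.append(PageLabelScheme(startpage=0, style='roman lowercase', firstpagenum=1))
#   labels.append(PageLabelScheme(startpage=romans_end, style='arabic', firstpagenum=1))
#   labels.write(reader)  # then write the PDF back out with pdfrw_writer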
@@ -363,4 +397,15 @@ else:
os.remove(tmpfile)
if args.compress:
print('Compressing PDF...')
# https://pypdf2.readthedocs.io/en/latest/user/file-size.html
reader = PdfReader(args.output / f'{title}.pdf')
writer = PdfWriter()
for page in reader.pages:
page.compress_content_streams() # This is CPU intensive!
writer.add_page(page)
with open(args.output / f'{title} compressed.pdf', 'wb') as f:
writer.write(f)
# TODO: fix blank pages causing duplicated pages