pretty much working, PDFs generated
parent 03fd8219f4
commit 8d87428033
@@ -40,6 +40,8 @@ If your network is slow, use `--delay` to allow more time for the files to download.
 guide you through step-by-step. You are expected to have the required technical knowledge and understand what
 is happening behind the scenes in order to troubleshoot any issues.
 
+You will also have to double check the output PDF to make sure everything is as it should be.
+
 ### How it Works
 
 This scraper uses Selenium to load the ebook viewer webpage. It then navigates through the book page by page and records network
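The mechanism behind that "records network" sentence is selenium-wire: the scraper swaps plain Selenium for a wire-enabled driver that keeps a log of every request the page makes, so page images can be fished back out of `driver.requests`. A minimal sketch of the pattern (not the script itself; the `jigsaw.vitalsource.com` filter mirrors the check used in the main loop further down):

```python
from seleniumwire import webdriver  # selenium-wire, as pinned in requirements.txt

driver = webdriver.Chrome()
driver.get('https://bookshelf.vitalsource.com')
input('Press ENTER once logged in...')

# Every request the reader page makes is recorded on driver.requests;
# the page images are served from jigsaw.vitalsource.com, so filter on that.
for request in driver.requests:
    if request.response and 'jigsaw.vitalsource.com' in request.url:
        print(request.url, len(request.response.body))
```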
fucts/roman.py
@@ -0,0 +1,37 @@
+from typing import List
+
+
+def roman_sort(nums: List[str]) -> List[str]:
+    """
+    Contributed by ChatGPT.
+    """
+    values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
+    sorted_nums = sorted(nums, key=lambda x: sum(values[c.upper()] for c in x))
+    return sorted_nums
+
+
+def roman_sort_with_ints(arr):
+    """
+    Contributed by ChatGPT, who didn't know how to use .upper()
+    """
+    roman_dict = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+
+    def roman_to_int(num):
+        if isinstance(num, str):
+            num = num.upper()
+        result = 0
+        for i in range(len(num)):
+            if i > 0 and roman_dict[num[i]] > roman_dict[num[i - 1]]:
+                result += roman_dict[num[i]] - 2 * roman_dict[num[i - 1]]
+            else:
+                result += roman_dict[num[i]]
+        return result
+
+    def int_or_roman(elem):
+        try:
+            return int(elem)
+        except ValueError:
+            return roman_to_int(elem)
+
+    sorted_arr = sorted(arr, key=int_or_roman)
+    return sorted_arr
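Since saved pages are named by their printed label, front matter like `iv` has to sort ahead of the numeric body pages; that is what `roman_sort_with_ints` is for (the main script imports it below). One caveat: the first helper, `roman_sort`, just sums letter values, so subtractive numerals like `IV` score 6 rather than 4 and can sort out of order; only `roman_sort_with_ints` converts properly. A quick usage sketch with illustrative values:

```python
from fucts.roman import roman_sort_with_ints

# Mixed Roman front-matter labels and Arabic body pages, unsorted:
pages = ['x', '2', 'iv', '1', 'ix']
print(roman_sort_with_ints(pages))  # ['1', '2', 'iv', 'ix', 'x']
```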
requirements.txt
@@ -3,4 +3,6 @@ webdriver-manager
 tqdm
 pillow
 pyautogui
 selenium-wire
+img2pdf
+selenium-requests
@@ -1,24 +1,28 @@
#!/usr/bin/env python
import argparse
import time
from pathlib import Path

import img2pdf
import selenium
from PIL import Image
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# from webdriver_manager.firefox import GeckoDriverManager
# from selenium.webdriver.firefox.service import Service as FirefoxService
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

from fucts.roman import roman_sort_with_ints

parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load in seconds.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')  # TODO
parser.add_argument('--start-page', default=0, type=int, help='Start on this page. Pages start at zero and include any non-numbered pages.')
parser.add_argument('--end-page', default=0, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
args = parser.parse_args()
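For reference, the flags above imply an invocation along the lines of `python main.py --isbn 9781234567890 --delay 5`. The entry-point filename is a guess here (the diff doesn't show it); only `--isbn` is required, and the README advises raising `--delay` on slow connections.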
@@ -28,13 +32,14 @@ ebook_output = args.output / f'{args.isbn}.pdf'
 ebook_files = args.output / args.isbn
 ebook_files.mkdir(exist_ok=True, parents=True)
 
-options = webdriver.ChromeOptions()
-options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)})
+chrome_options = webdriver.ChromeOptions()
 if args.disable_web_security:
-    options.add_argument('--disable-web-security')
+    chrome_options.add_argument('--disable-web-security')
     print('DISABLED WEB SECURITY!')
-options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
-driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options)
+chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
+if args.chrome_exe:
+    chrome_options.binary_location = args.chrome_exe  # '/usr/bin/google-chrome'
+driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
 
 driver.get(f'https://bookshelf.vitalsource.com')
 input('Press ENTER once logged in...')
@@ -76,7 +81,6 @@ bar = tqdm(total=total_pages)
bar.update(page_num)
while page_num < total_pages + 1:
    time.sleep(args.delay)
    img_data = None
    retry_delay = 5
    base_url = None
    for page_retry in range(3):  # retry the page max this many times
@@ -84,30 +88,9 @@ while page_num < total_pages + 1:
        for find_img_retry in range(3):
            for request in driver.requests:
                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    # Wait for the image to load
                    wait = 0
                    while (not request.response or not request.response.body) and wait < 60:
                        time.sleep(1)
                        wait += 1
                    if not request.response or not request.response.body:
                        bar.write(f'Page {page_num} failed to load, will retry later. {request.url}')
                        failed_pages.add(page_num)
                        break

                    base_url = request.url.split('/')
                    try:
                        img_size = int(base_url[-1])
                    except ValueError:
                        bar.write(f'Failed to parse URL for page {page_num}, retrying later: {request.url}')
                        failed_pages.add(page_num)
                        break
                    if img_size > largest_size:
                        base_url = '/'.join(base_url)
                        img_data = request.response.body
                        page_urls.add(request.url)
                        # 2000 is the max size I've seen so we can just exit if it's that.
                        if img_size == 2000:
                            break
                    del base_url[-1]
                    base_url = '/'.join(base_url)
            time.sleep(1)
            if base_url:
                break
@@ -115,14 +98,14 @@ while page_num < total_pages + 1:
        time.sleep(retry_delay)
        retry_delay += 5

    if not img_data:
        bar.write(f'Failed to download image for page {page_num}, retrying later.')
        failed_pages.add(page_num)
    elif not base_url:
        page, _ = get_num_pages()
    if not base_url:
        bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
        failed_pages.add(page_num)
        continue
    else:
        page, _ = get_num_pages()
        page_urls.add((page, base_url))
        bar.write(base_url)
        # If this isn't a numbered page we will need to increment the page count
        try:
            int(page)
@@ -132,40 +115,95 @@ while page_num < total_pages + 1:
        bar.total = total_pages
        bar.refresh()

    dl_file = ebook_files / f'{page}.jpg'
    with open(dl_file, 'wb') as file:
        file.write(img_data)
    # dl_file = ebook_files / f'{page}.jpg'
    # with open(dl_file, 'wb') as file:
    #     file.write(img_data)
    #
    # # Re-save the image to make sure it's in the correct format
    # img = Image.open(dl_file)
    # if img.width != 2000:
    #     bar.write(f'Page {page_num} is only {img.width}px wide, will search for a larger image later.')
    #     small_pages_redo.add(page_num)
    # img.save(dl_file, format='JPEG', subsampling=0, quality=100)
    # del img

    # Re-save the image to make sure it's in the correct format
    img = Image.open(dl_file)
    if img.width != 2000:
        bar.write(f'Page {page_num} is only {img.width}px wide, will search for a larger image later.')
        small_pages_redo.add(page_num)
    img.save(dl_file, format='JPEG', subsampling=0, quality=100)
    del img

    bar.write(base_url)
    if page_num == args.end_page:
        bar.write(f'Exiting on page {page_num}.')
        break
    if page == total_pages:
        bar.write(f'Book completed, exiting.')
        break
    if not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
        bar.write(f'Book completed, exiting.')
        break

    # Move to the next page
    del driver.requests
    actions = ActionChains(driver)
    actions.send_keys(Keys.RIGHT)
    actions.perform()

    bar.update()
    page_num += 1

# TODO: redo failed pages in failed_pages
# TODO:

driver.close()
bar.close()

# TODO: redo failed_pages items

time.sleep(1)

# print('All pages scraped! Now we must change driver modes to download the pages. Please log back in once the new window pops up.')
# from seleniumrequests import Chrome
#
# driver = Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
# driver.get(f'https://bookshelf.vitalsource.com')
# input('Press ENTER once logged in...')
# load_book_page(0)

print('All pages scraped! Now downloading images...')

bar = tqdm(total=len(page_urls))
for page, base_url in page_urls:
    time.sleep(args.delay)
    driver.get(f'{base_url.strip("/")}/2000')  # have to load the page first for cookies reasons
    time.sleep(args.delay)
    retry_delay = 5
    img_data = None
    for page_retry in range(3):  # retry the page max this many times
        largest_size = 0
        for find_img_retry in range(3):
            for request in driver.requests:
                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    img_data = request.response.body
                    break
    # response = driver.request('GET', f'{base_url.strip("/")}/2000')
    # print(response)
    # response.raise_for_status()
    dl_file = ebook_files / f'{page}.jpg'
    # with open(dl_file, 'wb') as f:
    #     response.raw.decode_content = True
    #     shutil.copyfileobj(response.raw, f)
    if img_data:
        with open(dl_file, 'wb') as file:
            file.write(img_data)
        # Re-save the image to make sure it's in the correct format
        img = Image.open(dl_file)
        img.save(dl_file, format='JPEG', subsampling=0, quality=100)
        del img
    else:
        bar.write(f'Failed to download image: {base_url}')
    bar.update()
    del driver.requests
bar.close()
driver.close()
del driver

page_files = [str(ebook_files / f'{x}.jpg') for x in roman_sort_with_ints([str(x.stem) for x in list(ebook_files.iterdir())])]
pdf = img2pdf.convert(page_files)
with open(ebook_output, 'wb') as f:
    f.write(pdf)

# TODO: maybe scrape book title to name the PDF file?
# TODO: also maybe embed the title in the PDF file?

# TODO: make PDF

# TODO: scrape table of contents and insert
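A caveat on the end-of-book check in the hunk above: Selenium's `By.CLASS_NAME` expects a single class name. A space-separated compound like `'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce'` gets translated into a CSS selector where only the first token keeps its leading dot, so the lookup matches nothing and the `[0]` index is liable to raise IndexError. A sketch of the equivalent CSS-selector form, assuming those obfuscated class names (which come from the site's CSS-in-JS build and may rotate between deploys) are still current:

```python
from selenium.webdriver.common.by import By

# All four classes on one element, expressed as a single CSS selector.
buttons = driver.find_elements(By.CSS_SELECTOR, '.IconButton__button-bQttMI.gHMmeA.sc-oXPCX.mwNce')
if not buttons or not buttons[0].is_enabled():
    print('Next-page button missing or disabled; treating this as the end of the book.')
```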
|
||||
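On the title TODOs: img2pdf can embed document metadata at conversion time (its `convert()` accepts PDF metadata keywords such as `title`), so once a title is scraped it could be passed straight through. A sketch under that assumption; `book_title` is hypothetical, since nothing scrapes it yet:

```python
book_title = 'Example Book Title'  # hypothetical: would be scraped from the reader UI

# Pass the metadata alongside the image list when building the PDF.
pdf = img2pdf.convert(page_files, title=book_title)
with open(ebook_output, 'wb') as f:
    f.write(pdf)
```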
|
|
Loading…
Reference in New Issue