skip scrape arg

2023-03-14 12:00:38 -06:00 · 2023-03-14 12:00:38 -06:00 · e8f369a94e
parent 336084d414
commit e8f369a94e
1 changed files with 165 additions and 161 deletions
--- a/vitalsource2pdf.py
+++ b/vitalsource2pdf.py
@@ -26,6 +26,7 @@ parser.add_argument('--end-page', default=-1, type=int, help='End on this page.'
 parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
 parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
 parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
+parser.add_argument('--skip-scrape', action='store_true', help="Don't scrape anything, just re-build the PDF from existing files.")
 args = parser.parse_args()

 args.output = Path(args.output)
@@ -35,186 +36,189 @@ ebook_output_ocr = args.output / f'{args.isbn} OCR.pdf'
 ebook_files = args.output / args.isbn
 ebook_files.mkdir(exist_ok=True, parents=True)

-chrome_options = webdriver.ChromeOptions()
-if args.disable_web_security:
-    chrome_options.add_argument('--disable-web-security')
-    print('DISABLED WEB SECURITY!')
-chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
-if args.chrome_exe:
-    chrome_options.binary_location = args.chrome_exe  # '/usr/bin/google-chrome'
-driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
+if not args.skip_scrape:
+    chrome_options = webdriver.ChromeOptions()
+    if args.disable_web_security:
+        chrome_options.add_argument('--disable-web-security')
+        print('DISABLED WEB SECURITY!')
+    chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
+    if args.chrome_exe:
+        chrome_options.binary_location = args.chrome_exe  # '/usr/bin/google-chrome'
+    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)

-driver.get(f'https://bookshelf.vitalsource.com')
-input('Press ENTER once logged in...')
+    driver.get(f'https://bookshelf.vitalsource.com')
+    input('Press ENTER once logged in...')


-def get_num_pages():
-    while True:
-        try:
-            total = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
+    def get_num_pages():
+        while True:
            try:
-                # This element may be empty so just set it to 0
-                current_page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
-                if current_page == '' or not current_page:
+                total = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
+                try:
+                    # This element may be empty so just set it to 0
+                    current_page = driver.execute_script('return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value')
+                    if current_page == '' or not current_page:
+                        current_page = 0
+                except selenium.common.exceptions.JavascriptException:
                    current_page = 0
+                return current_page, total
            except selenium.common.exceptions.JavascriptException:
-                current_page = 0
-            return current_page, total
-        except selenium.common.exceptions.JavascriptException:
+                time.sleep(1)
+
+
+    def load_book_page(page_id):
+        driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
+        get_num_pages()  # Wait for the page to load
+        while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
            time.sleep(1)


-def load_book_page(page_id):
-    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
-    get_num_pages()  # Wait for the page to load
-    while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
-        time.sleep(1)
+    driver.maximize_window()
+    page_num = args.start_page
+    load_book_page(page_num)

+    _, total_pages = get_num_pages()
+    total_pages = 99999999999999999 if args.start_page > 0 else total_pages
+    print('Total number of pages:', total_pages)

-driver.maximize_window()
-page_num = args.start_page
-load_book_page(page_num)
-
-_, total_pages = get_num_pages()
-total_pages = 99999999999999999 if args.start_page > 0 else total_pages
-print('Total number of pages:', total_pages)
-
-page_urls = set()
-failed_pages = set()
-small_pages_redo = set()
-bar = tqdm(total=total_pages)
-bar.update(page_num)
-while page_num < total_pages + 1:
-    time.sleep(args.delay)
-    retry_delay = 5
-    base_url = None
-    for page_retry in range(3):  # retry the page max this many times
-        largest_size = 0
-        for find_img_retry in range(3):
-            for request in driver.requests:
-                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
-                    base_url = request.url.split('/')
-                    del base_url[-1]
-                    base_url = '/'.join(base_url)
-            time.sleep(1)
-        if base_url:
-            break
-        bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
-        time.sleep(retry_delay)
-        retry_delay += 5
-
-    page, _ = get_num_pages()
-    if not base_url:
-        bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
-        failed_pages.add(page_num)
-    else:
-        page_urls.add((page, base_url))
-        bar.write(base_url)
-        # If this isn't a numbered page we will need to increment the page count
-        try:
-            int(page)
-        except ValueError:
-            total_pages += 1
-            bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
-            bar.total = total_pages
-            bar.refresh()
-
-    if page_num == args.end_page:
-        bar.write(f'Exiting on page {page_num}.')
-        break
-
-    # On the first page the back arrow is disabled and will trigger this
-    if isinstance(page_num, int) and page_num > 0:
-        try:
-            if driver.execute_script(f'return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'):  # not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
-                bar.write(f'Book completed, exiting.')
-                break
-        except selenium.common.exceptions.JavascriptException:
-            pass
-
-    # Move to the next page
-    del driver.requests
-    actions = ActionChains(driver)
-    actions.send_keys(Keys.RIGHT)
-    actions.perform()
-    bar.update()
-    page_num += 1
-bar.close()
-
-print('Re-doing failed pages...')
-bar = tqdm(total=len(failed_pages))
-for page in failed_pages:
-    load_book_page(page)
-    time.sleep(args.delay)
-    retry_delay = 5
-    base_url = None
-    for page_retry in range(3):  # retry the page max this many times
-        largest_size = 0
-        for find_img_retry in range(3):
-            for request in driver.requests:
-                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
-                    base_url = request.url.split('/')
-                    del base_url[-1]
-                    base_url = '/'.join(base_url)
-            time.sleep(1)
-        if base_url:
-            break
-        bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
-        time.sleep(retry_delay)
-        retry_delay += 5
-    page, _ = get_num_pages()
-    if not base_url:
-        bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
-        failed_pages.add(page_num)
-    else:
-        page_urls.add((page, base_url))
-        bar.write(base_url)
-        del driver.requests
-
-time.sleep(1)
-print('All pages scraped! Now downloading images...')
-
-bar = tqdm(total=len(page_urls))
-for page, base_url in page_urls:
-    success = False
-    for retry in range(6):
-        del driver.requests
-        time.sleep(args.delay)
-        driver.get(f'{base_url.strip("/")}/2000')  # have to load the page first for cookies reasons
+    page_urls = set()
+    failed_pages = set()
+    small_pages_redo = set()
+    bar = tqdm(total=total_pages)
+    bar.update(page_num)
+    while page_num < total_pages + 1:
        time.sleep(args.delay)
        retry_delay = 5
-        img_data = None
+        base_url = None
        for page_retry in range(3):  # retry the page max this many times
            largest_size = 0
            for find_img_retry in range(3):
                for request in driver.requests:
                    if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
-                        img_data = request.response.body
-                        break
-        dl_file = ebook_files / f'{page}.jpg'
-        if img_data:
-            with open(dl_file, 'wb') as file:
-                file.write(img_data)
-            # Re-save the image to make sure it's in the correct format
-            img = Image.open(dl_file)
-            if img.width != 2000:
-                bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
-                driver.get('https://google.com')
-                time.sleep(8)
-                load_book_page(0)
-                time.sleep(8)
-                continue
-            img.save(dl_file, format='JPEG', subsampling=0, quality=100)
-            del img
-            success = True
-        if success:
+                        base_url = request.url.split('/')
+                        del base_url[-1]
+                        base_url = '/'.join(base_url)
+                time.sleep(1)
+            if base_url:
+                break
+            bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
+            time.sleep(retry_delay)
+            retry_delay += 5
+
+        page, _ = get_num_pages()
+        if not base_url:
+            bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
+            failed_pages.add(page_num)
+        else:
+            page_urls.add((page, base_url))
+            bar.write(base_url)
+            # If this isn't a numbered page we will need to increment the page count
+            try:
+                int(page)
+            except ValueError:
+                total_pages += 1
+                bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
+                bar.total = total_pages
+                bar.refresh()
+
+        if page_num == args.end_page:
+            bar.write(f'Exiting on page {page_num}.')
            break
-    if not success:
-        bar.write(f'Failed to download image: {base_url}')
-    bar.update()
-bar.close()
-driver.close()
-del driver
+
+        # On the first page the back arrow is disabled and will trigger this
+        if isinstance(page_num, int) and page_num > 0:
+            try:
+                if driver.execute_script(f'return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'):  # not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
+                    bar.write(f'Book completed, exiting.')
+                    break
+            except selenium.common.exceptions.JavascriptException:
+                pass
+
+        # Move to the next page
+        del driver.requests
+        actions = ActionChains(driver)
+        actions.send_keys(Keys.RIGHT)
+        actions.perform()
+        bar.update()
+        page_num += 1
+    bar.close()
+
+    print('Re-doing failed pages...')
+    bar = tqdm(total=len(failed_pages))
+    for page in failed_pages:
+        load_book_page(page)
+        time.sleep(args.delay)
+        retry_delay = 5
+        base_url = None
+        for page_retry in range(3):  # retry the page max this many times
+            largest_size = 0
+            for find_img_retry in range(3):
+                for request in driver.requests:
+                    if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
+                        base_url = request.url.split('/')
+                        del base_url[-1]
+                        base_url = '/'.join(base_url)
+                time.sleep(1)
+            if base_url:
+                break
+            bar.write(f'Could not find a matching image for page {page_num}, sleeping {retry_delay}s...')
+            time.sleep(retry_delay)
+            retry_delay += 5
+        page, _ = get_num_pages()
+        if not base_url:
+            bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
+            failed_pages.add(page_num)
+        else:
+            page_urls.add((page, base_url))
+            bar.write(base_url)
+            del driver.requests
+
+    time.sleep(1)
+    print('All pages scraped! Now downloading images...')
+
+    bar = tqdm(total=len(page_urls))
+    for page, base_url in page_urls:
+        success = False
+        for retry in range(6):
+            del driver.requests
+            time.sleep(args.delay)
+            driver.get(f'{base_url.strip("/")}/2000')  # have to load the page first for cookies reasons
+            time.sleep(args.delay)
+            retry_delay = 5
+            img_data = None
+            for page_retry in range(3):  # retry the page max this many times
+                largest_size = 0
+                for find_img_retry in range(3):
+                    for request in driver.requests:
+                        if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
+                            img_data = request.response.body
+                            break
+            dl_file = ebook_files / f'{page}.jpg'
+            if img_data:
+                with open(dl_file, 'wb') as file:
+                    file.write(img_data)
+                # Re-save the image to make sure it's in the correct format
+                img = Image.open(dl_file)
+                if img.width != 2000:
+                    bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
+                    driver.get('https://google.com')
+                    time.sleep(8)
+                    load_book_page(0)
+                    time.sleep(8)
+                    continue
+                img.save(dl_file, format='JPEG', subsampling=0, quality=100)
+                del img
+                success = True
+            if success:
+                break
+        if not success:
+            bar.write(f'Failed to download image: {base_url}')
+        bar.update()
+    bar.close()
+    driver.close()
+    del driver
+else:
+    print('Scrape skipped...')

 print('Building PDF...')
 page_files = [str(ebook_files / f'{x}.jpg') for x in move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))]