Cyberes 2023-03-14 11:49:09 -06:00
parent b766e6e567
commit b577b81fb5
3 changed files with 39 additions and 3 deletions


@@ -18,9 +18,12 @@ designed, or are broken. I designed my scraper to be as simple while producing t
## Install
```bash
sudo apt install ocrmypdf jbig2dec
pip install -r requirements.txt
```
[//]: # (You also need the JBIG2 encoder, which can be [built from source](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html).)
Make sure you have Chrome installed, since the scraper uses Selenium. The WebDriver binary will be downloaded automatically.
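As a rough sketch of what that auto-download looks like in practice (using the selenium-wire and webdriver-manager packages the scraper imports; `example.com` is just a placeholder):
```python
# Sketch only: assumes selenium-wire and webdriver-manager are installed,
# matching the imports used by the scraper itself.
from selenium.webdriver.chrome.service import Service
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# ChromeDriverManager().install() fetches a matching chromedriver on first run
# and returns its path, so no manual driver download is needed.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://example.com')  # placeholder URL
driver.quit()
```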
## Use


@@ -35,3 +35,27 @@ def roman_sort_with_ints(arr):
    sorted_arr = sorted(arr, key=int_or_roman)
    return sorted_arr


def try_convert_int(item):
    # Return the int value of item if it parses, otherwise return it unchanged.
    try:
        return int(item)
    except ValueError:
        return item


def move_integers_to_end(lst):
    # Partition the list: non-integer items keep their order up front,
    # integers follow, also in their original order.
    non_integers = []
    integers = []
    for elem in lst:
        if isinstance(elem, int):
            integers.append(elem)
        else:
            non_integers.append(elem)
    return non_integers + integers


def move_romans_to_front(arr):
    # Stable sort: non-integers (roman numerals) first, integers after.
    arr_sorted = sorted(arr, key=lambda x: isinstance(x, int))
    # Move page 0 (the cover) to the very front if it is present.
    if 0 in arr_sorted:
        arr_sorted.insert(0, arr_sorted.pop(arr_sorted.index(0)))
    return arr_sorted
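A quick usage sketch showing how these helpers compose when ordering page-file stems, mirroring the call in the scraper (the stems below are made up, and `int_or_roman` is assumed to sort roman-numeral strings by their integer value, as defined earlier in this module):
```python
# Made-up stems: the cover ('0'), roman-numeral front matter, arabic pages.
stems = ['10', '2', 'iv', 'i', '0', 'ii']
pages = move_romans_to_front(roman_sort_with_ints([try_convert_int(s) for s in stems]))
# Expected order: [0, 'i', 'ii', 'iv', 2, 10] -- cover first, then the
# roman-numeral front matter, then the arabic-numbered pages.
```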


@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import time
import subprocess
from pathlib import Path
import img2pdf
@@ -13,7 +14,7 @@ from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from fucts.roman import roman_sort_with_ints
from fucts.roman import roman_sort_with_ints, move_romans_to_front, try_convert_int
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
@@ -24,11 +25,13 @@ parser.add_argument('--start-page', default=0, type=int, help='Start on this pag
parser.add_argument('--end-page', default=-1, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
args = parser.parse_args()
args.output = Path(args.output)
args.output.mkdir(exist_ok=True, parents=True)
ebook_output = args.output / f'{args.isbn}.pdf'
ebook_output_ocr = args.output / f'{args.isbn} OCR.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)
@@ -213,15 +216,21 @@ bar.close()
driver.close()
del driver
page_files = [str(ebook_files / f'{x}.jpg') for x in roman_sort_with_ints([str(x.stem) for x in list(ebook_files.iterdir())])]
print('Building PDF...')
page_files = [str(ebook_files / f'{x}.jpg') for x in move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))]
pdf = img2pdf.convert(page_files)
with open(ebook_output, 'wb') as f:
    f.write(pdf)
# TODO: maybe scrape book title to name the PDF file?
# TODO: also maybe embed the title in the PDF file?
title = 'test title'
print('Running OCR...')
subprocess.run(f'ocrmypdf -l {args.language} --rotate-pages --deskew --title "{title}" --jobs $(nproc) --output-type pdfa "{ebook_output}" "{ebook_output_ocr}"', shell=True)
# TODO: scrape table of contents and insert
# TODO: https://stackoverflow.com/questions/29657237/tesseract-ocr-pdf-as-input
# TODO: fix blank pages causing duplicated pages