ocr
This commit is contained in:
parent
b766e6e567
commit
b577b81fb5
|
@ -18,9 +18,12 @@ designed, or are broken. I designed my scraper to be as simple while producing t
|
|||
## Install
|
||||
|
||||
```bash
|
||||
sudo apt install ocrmypdf jbig2dec
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
[//]: # (You also need the JBIG2 encoder, which can be [built from source](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html).)
|
||||
|
||||
Make sure you have Chrome installed, as the scraper drives it through Selenium. The WebDriver binary will be downloaded automatically.
|
||||
|
||||
## Use
|
||||
|
|
|
@ -35,3 +35,27 @@ def roman_sort_with_ints(arr):
|
|||
|
||||
sorted_arr = sorted(arr, key=int_or_roman)
|
||||
return sorted_arr
|
||||
|
||||
|
||||
def try_convert_int(item):
    """Return ``int(item)`` when the conversion succeeds, else ``item`` itself.

    Only ``ValueError`` is treated as "not an integer"; any other exception
    raised by ``int()`` propagates to the caller.
    """
    try:
        as_int = int(item)
    except ValueError:
        # Not parseable as an integer: hand the value back untouched.
        return item
    return as_int
|
||||
|
||||
|
||||
def move_integers_to_end(lst):
    """Return a new list with every ``int`` element moved after the others.

    The relative order inside each group (non-integers, integers) is kept.
    """
    # ``sorted`` is stable and ``False < True``, so keying on the isinstance
    # check partitions the elements without reordering within a group.
    return sorted(lst, key=lambda entry: isinstance(entry, int))
|
||||
|
||||
|
||||
def move_romans_to_front(arr):
    """Return a new list with non-integer entries ahead of integer entries.

    The partition is stable, so the incoming order is preserved inside each
    group. Afterwards the first element equal to ``0`` (if any) is moved to
    the very front -- presumably page 0 is the cover; verify against caller.

    Args:
        arr: Iterable of page identifiers (a mix of ints and other values).

    Returns:
        A new list; ``arr`` itself is not modified.
    """
    # ``False < True``: everything that is not an int sorts before the ints.
    arr_sorted = sorted(arr, key=lambda x: isinstance(x, int))
    try:
        zero_index = arr_sorted.index(0)
    except ValueError:
        # No page 0 present (or empty input): nothing to promote.
        return arr_sorted
    arr_sorted.insert(0, arr_sorted.pop(zero_index))
    return arr_sorted
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import time
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import img2pdf
|
||||
|
@ -13,7 +14,7 @@ from seleniumwire import webdriver
|
|||
from tqdm import tqdm
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
from fucts.roman import roman_sort_with_ints
|
||||
from fucts.roman import roman_sort_with_ints, move_romans_to_front, try_convert_int
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--output', default='./VitalSource/')
|
||||
|
@ -24,11 +25,13 @@ parser.add_argument('--start-page', default=0, type=int, help='Start on this pag
|
|||
parser.add_argument('--end-page', default=-1, type=int, help='End on this page.')
|
||||
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
|
||||
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
|
||||
parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
|
||||
args = parser.parse_args()
|
||||
|
||||
args.output = Path(args.output)
|
||||
args.output.mkdir(exist_ok=True, parents=True)
|
||||
ebook_output = args.output / f'{args.isbn}.pdf'
|
||||
ebook_output_ocr = args.output / f'{args.isbn} OCR.pdf'
|
||||
ebook_files = args.output / args.isbn
|
||||
ebook_files.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
|
@ -213,15 +216,21 @@ bar.close()
|
|||
driver.close()
|
||||
del driver
|
||||
|
||||
page_files = [str(ebook_files / f'{x}.jpg') for x in roman_sort_with_ints([str(x.stem) for x in list(ebook_files.iterdir())])]
|
||||
|
||||
print('Building PDF...')
|
||||
page_files = [str(ebook_files / f'{x}.jpg') for x in move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))]
|
||||
pdf = img2pdf.convert(page_files)
|
||||
with open(ebook_output, 'wb') as f:
|
||||
f.write(pdf)
|
||||
|
||||
# TODO: maybe scrape book title to name the PDF file?
|
||||
# TODO: also maybe embed the title in the PDF file?
|
||||
title = 'test title'
|
||||
|
||||
print('Running OCR...')
import os  # local import: only needed to size the OCR worker pool

# Pass an argument list (no shell) so that paths, the title or the language
# value containing quotes, spaces, `$` or backticks reach ocrmypdf verbatim
# instead of being interpreted by the shell.
subprocess.run([
    'ocrmypdf',
    '-l', args.language,
    '--rotate-pages',
    '--deskew',
    '--title', title,
    # Replaces the shell-only `$(nproc)` substitution.
    '--jobs', str(os.cpu_count() or 1),
    '--output-type', 'pdfa',
    str(ebook_output),
    str(ebook_output_ocr),
])
|
||||
|
||||
# TODO: scrape table of contents and insert
|
||||
|
||||
|
||||
# TODO: https://stackoverflow.com/questions/29657237/tesseract-ocr-pdf-as-input
|
||||
# TODO: fix blank pages causing duplicated pages
|
||||
|
|
Loading…
Reference in New Issue