Cyberes 2023-03-14 11:49:09 -06:00
parent b766e6e567
commit b577b81fb5
3 changed files with 39 additions and 3 deletions


@@ -18,9 +18,12 @@ designed, or are broken. I designed my scraper to be as simple while producing t
## Install
```bash
sudo apt install ocrmypdf jbig2dec
pip install -r requirements.txt
```
[//]: # (You also need the JBIG2 encoder, which can be [built from source](https://ocrmypdf.readthedocs.io/en/latest/jbig2.html).)
Make sure you have Chrome installed, since the scraper uses Selenium. The WebDriver binary will be downloaded automatically.
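As a rough sketch of what that auto-download looks like in practice (using the selenium-wire and webdriver-manager packages the scraper imports; `example.com` is just a placeholder):
```python
# Sketch only: assumes selenium-wire and webdriver-manager are installed,
# matching the imports used by the scraper itself.
from selenium.webdriver.chrome.service import Service
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# ChromeDriverManager().install() fetches a matching chromedriver on first run
# and returns its path, so no manual driver download is needed.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://example.com')  # placeholder URL
driver.quit()
```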
## Use


@@ -35,3 +35,27 @@ def roman_sort_with_ints(arr):
    sorted_arr = sorted(arr, key=int_or_roman)
    return sorted_arr


def try_convert_int(item):
    # Return the int value of item if it parses, otherwise return it unchanged.
    try:
        return int(item)
    except ValueError:
        return item


def move_integers_to_end(lst):
    # Partition the list: non-integer items keep their order up front,
    # integers follow, also in their original order.
    non_integers = []
    integers = []
    for elem in lst:
        if isinstance(elem, int):
            integers.append(elem)
        else:
            non_integers.append(elem)
    return non_integers + integers


def move_romans_to_front(arr):
    # Stable sort: non-integers (roman numerals) first, integers after.
    arr_sorted = sorted(arr, key=lambda x: isinstance(x, int))
    # Move page 0 (the cover) to the very front if it is present.
    if 0 in arr_sorted:
        arr_sorted.insert(0, arr_sorted.pop(arr_sorted.index(0)))
    return arr_sorted
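A quick usage sketch showing how these helpers compose when ordering page-file stems, mirroring the call in the scraper (the stems below are made up, and `int_or_roman` is assumed to sort roman-numeral strings by their integer value, as defined earlier in this module):
```python
# Made-up stems: the cover ('0'), roman-numeral front matter, arabic pages.
stems = ['10', '2', 'iv', 'i', '0', 'ii']
pages = move_romans_to_front(roman_sort_with_ints([try_convert_int(s) for s in stems]))
# Expected order: [0, 'i', 'ii', 'iv', 2, 10] -- cover first, then the
# roman-numeral front matter, then the arabic-numbered pages.
```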


@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import time
import subprocess
from pathlib import Path
import img2pdf
@@ -13,7 +14,7 @@ from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from fucts.roman import roman_sort_with_ints
from fucts.roman import roman_sort_with_ints, move_romans_to_front, try_convert_int
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
@@ -24,11 +25,13 @@ parser.add_argument('--start-page', default=0, type=int, help='Start on this pag
parser.add_argument('--end-page', default=-1, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
args = parser.parse_args()
args.output = Path(args.output)
args.output.mkdir(exist_ok=True, parents=True)
ebook_output = args.output / f'{args.isbn}.pdf'
ebook_output_ocr = args.output / f'{args.isbn} OCR.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)
@@ -213,15 +216,21 @@ bar.close()
driver.close()
del driver
page_files = [str(ebook_files / f'{x}.jpg') for x in roman_sort_with_ints([str(x.stem) for x in list(ebook_files.iterdir())])]
print('Building PDF...')
page_files = [str(ebook_files / f'{x}.jpg') for x in move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))]
pdf = img2pdf.convert(page_files)
with open(ebook_output, 'wb') as f:
    f.write(pdf)
# TODO: maybe scrape book title to name the PDF file?
# TODO: also maybe embed the title in the PDF file?
title = 'test title'
print('Running OCR...')
subprocess.run(f'ocrmypdf -l {args.language} --rotate-pages --deskew --title "{title}" --jobs $(nproc) --output-type pdfa "{ebook_output}" "{ebook_output_ocr}"', shell=True)
# TODO: scrape table of contents and insert
# TODO: https://stackoverflow.com/questions/29657237/tesseract-ocr-pdf-as-input
# TODO: fix blank pages causing duplicated pages