skip scrape arg
This commit is contained in:
parent
336084d414
commit
e8f369a94e
|
@ -26,6 +26,7 @@ parser.add_argument('--end-page', default=-1, type=int, help='End on this page.'
|
|||
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
|
||||
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
|
||||
parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
|
||||
parser.add_argument('--skip-scrape', action='store_true', help="Don't scrape anything, just re-build the PDF from existing files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
args.output = Path(args.output)
|
||||
|
@ -35,20 +36,21 @@ ebook_output_ocr = args.output / f'{args.isbn} OCR.pdf'
|
|||
ebook_files = args.output / args.isbn
|
||||
ebook_files.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
chrome_options = webdriver.ChromeOptions()
|
||||
if args.disable_web_security:
|
||||
if not args.skip_scrape:
|
||||
chrome_options = webdriver.ChromeOptions()
|
||||
if args.disable_web_security:
|
||||
chrome_options.add_argument('--disable-web-security')
|
||||
print('DISABLED WEB SECURITY!')
|
||||
chrome_options.add_argument('--disable-http2') # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
|
||||
if args.chrome_exe:
|
||||
chrome_options.add_argument('--disable-http2') # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
|
||||
if args.chrome_exe:
|
||||
chrome_options.binary_location = args.chrome_exe # '/usr/bin/google-chrome'
|
||||
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
|
||||
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
|
||||
|
||||
driver.get(f'https://bookshelf.vitalsource.com')
|
||||
input('Press ENTER once logged in...')
|
||||
driver.get(f'https://bookshelf.vitalsource.com')
|
||||
input('Press ENTER once logged in...')
|
||||
|
||||
|
||||
def get_num_pages():
|
||||
def get_num_pages():
|
||||
while True:
|
||||
try:
|
||||
total = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').strip().split('/')[-1].strip())
|
||||
|
@ -64,27 +66,27 @@ def get_num_pages():
|
|||
time.sleep(1)
|
||||
|
||||
|
||||
def load_book_page(page_id):
|
||||
def load_book_page(page_id):
|
||||
driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
|
||||
get_num_pages() # Wait for the page to load
|
||||
while len(driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw")):
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
driver.maximize_window()
|
||||
page_num = args.start_page
|
||||
load_book_page(page_num)
|
||||
driver.maximize_window()
|
||||
page_num = args.start_page
|
||||
load_book_page(page_num)
|
||||
|
||||
_, total_pages = get_num_pages()
|
||||
total_pages = 99999999999999999 if args.start_page > 0 else total_pages
|
||||
print('Total number of pages:', total_pages)
|
||||
_, total_pages = get_num_pages()
|
||||
total_pages = 99999999999999999 if args.start_page > 0 else total_pages
|
||||
print('Total number of pages:', total_pages)
|
||||
|
||||
page_urls = set()
|
||||
failed_pages = set()
|
||||
small_pages_redo = set()
|
||||
bar = tqdm(total=total_pages)
|
||||
bar.update(page_num)
|
||||
while page_num < total_pages + 1:
|
||||
page_urls = set()
|
||||
failed_pages = set()
|
||||
small_pages_redo = set()
|
||||
bar = tqdm(total=total_pages)
|
||||
bar.update(page_num)
|
||||
while page_num < total_pages + 1:
|
||||
time.sleep(args.delay)
|
||||
retry_delay = 5
|
||||
base_url = None
|
||||
|
@ -139,11 +141,11 @@ while page_num < total_pages + 1:
|
|||
actions.perform()
|
||||
bar.update()
|
||||
page_num += 1
|
||||
bar.close()
|
||||
bar.close()
|
||||
|
||||
print('Re-doing failed pages...')
|
||||
bar = tqdm(total=len(failed_pages))
|
||||
for page in failed_pages:
|
||||
print('Re-doing failed pages...')
|
||||
bar = tqdm(total=len(failed_pages))
|
||||
for page in failed_pages:
|
||||
load_book_page(page)
|
||||
time.sleep(args.delay)
|
||||
retry_delay = 5
|
||||
|
@ -171,11 +173,11 @@ for page in failed_pages:
|
|||
bar.write(base_url)
|
||||
del driver.requests
|
||||
|
||||
time.sleep(1)
|
||||
print('All pages scraped! Now downloading images...')
|
||||
time.sleep(1)
|
||||
print('All pages scraped! Now downloading images...')
|
||||
|
||||
bar = tqdm(total=len(page_urls))
|
||||
for page, base_url in page_urls:
|
||||
bar = tqdm(total=len(page_urls))
|
||||
for page, base_url in page_urls:
|
||||
success = False
|
||||
for retry in range(6):
|
||||
del driver.requests
|
||||
|
@ -212,9 +214,11 @@ for page, base_url in page_urls:
|
|||
if not success:
|
||||
bar.write(f'Failed to download image: {base_url}')
|
||||
bar.update()
|
||||
bar.close()
|
||||
driver.close()
|
||||
del driver
|
||||
bar.close()
|
||||
driver.close()
|
||||
del driver
|
||||
else:
|
||||
print('Scrape skipped...')
|
||||
|
||||
print('Building PDF...')
|
||||
page_files = [str(ebook_files / f'{x}.jpg') for x in move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))]
|
||||
|
|
Loading…
Reference in New Issue