when to end scraping pages
This commit is contained in:
parent
8d87428033
commit
7ed81015d9
|
@ -4,5 +4,4 @@ tqdm
|
|||
pillow
|
||||
pyautogui
|
||||
selenium-wire
|
||||
img2pdf
|
||||
selenium-requests
|
||||
img2pdf
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
@ -72,6 +72,7 @@ page_num = args.start_page
|
|||
load_book_page(page_num)
|
||||
|
||||
_, total_pages = get_num_pages()
|
||||
total_pages = 99999999999999999 if args.start_page > 0 else total_pages
|
||||
print('Total number of pages:', total_pages)
|
||||
|
||||
page_urls = set()
|
||||
|
@ -115,27 +116,21 @@ while page_num < total_pages + 1:
|
|||
bar.total = total_pages
|
||||
bar.refresh()
|
||||
|
||||
# dl_file = ebook_files / f'{page}.jpg'
|
||||
# with open(dl_file, 'wb') as file:
|
||||
# file.write(img_data)
|
||||
#
|
||||
# # Re-save the image to make sure it's in the correct format
|
||||
# img = Image.open(dl_file)
|
||||
# if img.width != 2000:
|
||||
# bar.write(f'Page {page_num} is only {img.width}px wide, will search for a larger image later.')
|
||||
# small_pages_redo.add(page_num)
|
||||
# img.save(dl_file, format='JPEG', subsampling=0, quality=100)
|
||||
# del img
|
||||
|
||||
if page_num == args.end_page:
|
||||
bar.write(f'Exiting on page {page_num}.')
|
||||
break
|
||||
if page == total_pages:
|
||||
bar.write(f'Book completed, exiting.')
|
||||
break
|
||||
if not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
|
||||
bar.write(f'Book completed, exiting.')
|
||||
break
|
||||
|
||||
try:
|
||||
if driver.execute_script(f'return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'): # not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
|
||||
bar.write(f'Book completed, exiting.')
|
||||
break
|
||||
except IndexError:
|
||||
pass
|
||||
except selenium.common.exceptions.JavascriptException:
|
||||
pass
|
||||
|
||||
# Move to the next page
|
||||
del driver.requests
|
||||
|
@ -149,49 +144,47 @@ bar.close()
|
|||
# TODO: redo failed_pages items
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
# print('All pages scraped! Now we must change driver modes to download the pages. Please log back in once the new window pops up.')
|
||||
# from seleniumrequests import Chrome
|
||||
#
|
||||
# driver = Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
|
||||
# driver.get(f'https://bookshelf.vitalsource.com')
|
||||
# input('Press ENTER once logged in...')
|
||||
# load_book_page(0)
|
||||
|
||||
print('All pages scraped! Now downloading images...')
|
||||
|
||||
bar = tqdm(total=len(page_urls))
|
||||
for page, base_url in page_urls:
|
||||
time.sleep(args.delay)
|
||||
driver.get(f'{base_url.strip("/")}/2000') # have to load the page first for cookies reasons
|
||||
time.sleep(args.delay)
|
||||
retry_delay = 5
|
||||
img_data = None
|
||||
for page_retry in range(3): # retry the page max this many times
|
||||
largest_size = 0
|
||||
for find_img_retry in range(3):
|
||||
for request in driver.requests:
|
||||
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
|
||||
img_data = request.response.body
|
||||
break
|
||||
# response = driver.request('GET', f'{base_url.strip("/")}/2000')
|
||||
# print(response)
|
||||
# response.raise_for_status()
|
||||
dl_file = ebook_files / f'{page}.jpg'
|
||||
# with open(dl_file, 'wb') as f:
|
||||
# response.raw.decode_content = True
|
||||
# shutil.copyfileobj(response.raw, f)
|
||||
if img_data:
|
||||
with open(dl_file, 'wb') as file:
|
||||
file.write(img_data)
|
||||
# Re-save the image to make sure it's in the correct format
|
||||
img = Image.open(dl_file)
|
||||
img.save(dl_file, format='JPEG', subsampling=0, quality=100)
|
||||
del img
|
||||
else:
|
||||
bar.write(f'Failed to download image: {base_url}')
|
||||
for retry in range(6):
|
||||
success = False
|
||||
del driver.requests
|
||||
time.sleep(args.delay)
|
||||
driver.get(f'{base_url.strip("/")}/2000') # have to load the page first for cookies reasons
|
||||
time.sleep(args.delay)
|
||||
retry_delay = 5
|
||||
img_data = None
|
||||
for page_retry in range(3): # retry the page max this many times
|
||||
largest_size = 0
|
||||
for find_img_retry in range(3):
|
||||
for request in driver.requests:
|
||||
if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
|
||||
img_data = request.response.body
|
||||
break
|
||||
dl_file = ebook_files / f'{page}.jpg'
|
||||
if img_data:
|
||||
with open(dl_file, 'wb') as file:
|
||||
file.write(img_data)
|
||||
# Re-save the image to make sure it's in the correct format
|
||||
img = Image.open(dl_file)
|
||||
if img.width != 2000:
|
||||
bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
|
||||
driver.get('https://google.com')
|
||||
time.sleep(8)
|
||||
load_book_page(0)
|
||||
time.sleep(8)
|
||||
continue
|
||||
img.save(dl_file, format='JPEG', subsampling=0, quality=100)
|
||||
del img
|
||||
success = True
|
||||
else:
|
||||
bar.write(f'Failed to download image: {base_url}')
|
||||
break
|
||||
if success:
|
||||
break
|
||||
bar.update()
|
||||
del driver.requests
|
||||
bar.close()
|
||||
driver.close()
|
||||
del driver
|
||||
|
|
Loading…
Reference in New Issue