pretty much working, PDFs generated

This commit is contained in:
Cyberes 2023-03-14 00:27:53 -06:00
parent 03fd8219f4
commit 8d87428033
5 changed files with 136 additions and 57 deletions


@@ -40,6 +40,8 @@ If your network is slow, use `--delay` to allow more time for the files to download.
Guide you through step-by-step. You are expected to have the required technical knowledge and to understand what
is happening behind the scenes in order to troubleshoot any issues.
You will also have to double-check the output PDF to make sure everything is as it should be.

### How it Works

This scraper uses Selenium to load the ebook viewer webpage. It then navigates through the book page by page and records network requests in order to discover the image URL for each page.
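For illustration, the capture pattern described above looks roughly like this with selenium-wire (a minimal sketch; the ISBN in the URL filter is a placeholder):

    from seleniumwire import webdriver  # selenium-wire records all network traffic on driver.requests

    driver = webdriver.Chrome()
    driver.get('https://bookshelf.vitalsource.com')
    # ... after navigating to a page in the viewer, inspect the captured traffic:
    for request in driver.requests:
        if request.response and request.url.startswith('https://jigsaw.vitalsource.com/books/9780000000000/images/'):
            print(request.url, len(request.response.body))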

fucts/__init__.py Normal file (+0 lines)

fucts/roman.py Normal file (+37 lines)

@@ -0,0 +1,37 @@
from typing import List


def roman_sort(nums: List[str]) -> List[str]:
    """
    Contributed by ChatGPT.
    """
    values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}

    def roman_to_int(num: str) -> int:
        # Handle subtractive notation (IV = 4, IX = 9, ...) rather than naively summing characters.
        num = num.upper()
        result = 0
        for i in range(len(num)):
            if i > 0 and values[num[i]] > values[num[i - 1]]:
                result += values[num[i]] - 2 * values[num[i - 1]]
            else:
                result += values[num[i]]
        return result

    return sorted(nums, key=roman_to_int)


def roman_sort_with_ints(arr):
    """
    Contributed by ChatGPT, who didn't know how to use .upper()
    """
    roman_dict = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

    def roman_to_int(num):
        if isinstance(num, str):
            num = num.upper()
        result = 0
        for i in range(len(num)):
            if i > 0 and roman_dict[num[i]] > roman_dict[num[i - 1]]:
                result += roman_dict[num[i]] - 2 * roman_dict[num[i - 1]]
            else:
                result += roman_dict[num[i]]
        return result

    def int_or_roman(elem):
        # Numeric labels ('1', '2', ...) sort by integer value; roman labels by their roman value.
        try:
            return int(elem)
        except ValueError:
            return roman_to_int(elem)

    return sorted(arr, key=int_or_roman)
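A quick example of how roman_sort_with_ints orders mixed page labels (the labels here are made up):

    pages = ['x', '2', 'ii', '1', 'v']
    print(roman_sort_with_ints(pages))
    # -> ['1', '2', 'ii', 'v', 'x']  (sort keys: 1, 2, 2, 5, 10)

Note that a roman label and an arabic label with the same value (e.g. 'ii' and '2') compare equal, and the stable sort keeps their input order.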


@@ -3,4 +3,6 @@ webdriver-manager
tqdm
pillow
pyautogui
selenium-wire
img2pdf
selenium-requests


@@ -1,24 +1,28 @@
#!/usr/bin/env python
import argparse
import time
from pathlib import Path
import img2pdf
import selenium
from PIL import Image
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# from webdriver_manager.firefox import GeckoDriverManager
# from selenium.webdriver.firefox.service import Service as FirefoxService
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from fucts.roman import roman_sort_with_ints

parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load in seconds.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.') # TODO
parser.add_argument('--start-page', default=0, type=int, help='Start on this page. Pages start at zero and include any non-numbered pages.')
parser.add_argument('--end-page', default=0, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
args = parser.parse_args()
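A typical invocation might look like this (the script filename and ISBN are placeholders, assumed for illustration):

    python vitalsource2pdf.py --isbn 9780000000000 --output ./VitalSource/ --delay 3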
@@ -28,13 +32,14 @@ ebook_output = args.output / f'{args.isbn}.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)})
if args.disable_web_security:
    chrome_options.add_argument('--disable-web-security')
    print('DISABLED WEB SECURITY!')
chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
if args.chrome_exe:
    chrome_options.binary_location = args.chrome_exe  # e.g. '/usr/bin/google-chrome'
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
driver.get(f'https://bookshelf.vitalsource.com')
input('Press ENTER once logged in...')
@@ -76,7 +81,6 @@ bar = tqdm(total=total_pages)
bar.update(page_num)
while page_num < total_pages + 1:
    time.sleep(args.delay)
    retry_delay = 5
    base_url = None
    for page_retry in range(3):  # retry the page max this many times
@@ -84,30 +88,9 @@ while page_num < total_pages + 1:
        for find_img_retry in range(3):
            for request in driver.requests:
                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    # Strip the size segment off the end of the image URL; the image itself is downloaded in a second pass later.
                    base_url = request.url.split('/')
                    del base_url[-1]
                    base_url = '/'.join(base_url)
            time.sleep(1)
        if base_url:
            break
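To illustrate the URL trimming above (the image URL is made up): the trailing path segment is the image size, which gets stripped so the largest size can be requested later.

    url = 'https://jigsaw.vitalsource.com/books/9780000000000/images/some-image-id/800'
    parts = url.split('/')
    del parts[-1]               # drop the size segment ('800')
    base_url = '/'.join(parts)
    print(f'{base_url}/2000')   # later re-requested at the largest known size, 2000px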
@@ -115,14 +98,14 @@ while page_num < total_pages + 1:
            time.sleep(retry_delay)
            retry_delay += 5
    page, _ = get_num_pages()
    if not base_url:
        bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
        failed_pages.add(page_num)
        continue
    page_urls.add((page, base_url))
    bar.write(base_url)
    # If this isn't a numbered page we will need to increment the page count
    try:
        int(page)
@@ -132,40 +115,95 @@ while page_num < total_pages + 1:
        bar.total = total_pages
        bar.refresh()
    # dl_file = ebook_files / f'{page}.jpg'
    # with open(dl_file, 'wb') as file:
    #     file.write(img_data)
    #
    # # Re-save the image to make sure it's in the correct format
    # img = Image.open(dl_file)
    # if img.width != 2000:
    #     bar.write(f'Page {page_num} is only {img.width}px wide, will search for a larger image later.')
    #     small_pages_redo.add(page_num)
    # img.save(dl_file, format='JPEG', subsampling=0, quality=100)
    # del img
    if page_num == args.end_page:
        bar.write(f'Exiting on page {page_num}.')
        break
    if page == total_pages:
        bar.write('Book completed, exiting.')
        break
    # NOTE: these auto-generated class names are brittle and may break when VitalSource updates their frontend.
    if not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
        bar.write('Book completed, exiting.')
        break
    # Move to the next page
    del driver.requests
    actions = ActionChains(driver)
    actions.send_keys(Keys.RIGHT)
    actions.perform()
    bar.update()
    page_num += 1
# TODO: redo the failed pages recorded in failed_pages

driver.close()
bar.close()
time.sleep(1)
# print('All pages scraped! Now we must change driver modes to download the pages. Please log back in once the new window pops up.')
# from seleniumrequests import Chrome
#
# driver = Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
# driver.get(f'https://bookshelf.vitalsource.com')
# input('Press ENTER once logged in...')
# load_book_page(0)
print('All pages scraped! Now downloading images...')

bar = tqdm(total=len(page_urls))
for page, base_url in page_urls:
    time.sleep(args.delay)
    driver.get(f'{base_url.strip("/")}/2000')  # have to load the page first for cookies reasons
    time.sleep(args.delay)
    retry_delay = 5
    img_data = None
    for page_retry in range(3):  # retry the page max this many times
        largest_size = 0
        for find_img_retry in range(3):
            for request in driver.requests:
                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    img_data = request.response.body
                    break
    # response = driver.request('GET', f'{base_url.strip("/")}/2000')
    # print(response)
    # response.raise_for_status()
    dl_file = ebook_files / f'{page}.jpg'
    # with open(dl_file, 'wb') as f:
    #     response.raw.decode_content = True
    #     shutil.copyfileobj(response.raw, f)
    if img_data:
        with open(dl_file, 'wb') as file:
            file.write(img_data)
        # Re-save the image to make sure it's in the correct format
        img = Image.open(dl_file)
        img.save(dl_file, format='JPEG', subsampling=0, quality=100)
        del img
    else:
        bar.write(f'Failed to download image: {base_url}')
    bar.update()
    del driver.requests
bar.close()
driver.close()
del driver
# Sort the downloaded pages by their numeric value (roman or arabic) and build the PDF.
page_names = roman_sort_with_ints([x.stem for x in ebook_files.iterdir()])
page_files = [str(ebook_files / f'{name}.jpg') for name in page_names]
pdf = img2pdf.convert(page_files)
with open(ebook_output, 'wb') as f:
    f.write(pdf)
# TODO: maybe scrape the book title to name the PDF file?
# TODO: maybe embed the title in the PDF metadata?
# TODO: scrape the table of contents and insert it into the PDF
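For the title TODOs above, img2pdf can embed document metadata at conversion time; a minimal sketch (the title string is a placeholder):

    pdf = img2pdf.convert(page_files, title='Example Book Title')  # author=, subject=, etc. are also accepted
    with open(ebook_output, 'wb') as f:
        f.write(pdf)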