pretty much working, PDFs generated
parent 03fd8219f4
commit 8d87428033
@@ -40,6 +40,8 @@ If your network is slow, use `--delay` to allow more time for the files to download.
 guide you through step-by-step. You are expected to have the required technical knowledge and understand what
 is happening behind the scenes in order to troubleshoot any issues.
 
+You will also have to double check the output PDF to make sure everything is as it should be.
+
 ### How it Works
 
 This scraper uses Selenium to load the ebook viewer webpage. It then navigates through the book page by page and records network
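The mechanism behind that "records network" sentence is selenium-wire: the scraper swaps plain Selenium for a wire-enabled driver that keeps a log of every request the page makes, so page images can be fished back out of `driver.requests`. A minimal sketch of the pattern (not the script itself; the `jigsaw.vitalsource.com` filter mirrors the check used in the main loop further down):

```python
from seleniumwire import webdriver  # selenium-wire, as pinned in requirements.txt

driver = webdriver.Chrome()
driver.get('https://bookshelf.vitalsource.com')
input('Press ENTER once logged in...')

# Every request the reader page makes is recorded on driver.requests;
# the page images are served from jigsaw.vitalsource.com, so filter on that.
for request in driver.requests:
    if request.response and 'jigsaw.vitalsource.com' in request.url:
        print(request.url, len(request.response.body))
```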
fucts/roman.py
@@ -0,0 +1,37 @@
+from typing import List
+
+
+def roman_sort(nums: List[str]) -> List[str]:
+    """
+    Contributed by ChatGPT.
+    """
+    values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
+    sorted_nums = sorted(nums, key=lambda x: sum(values[c.upper()] for c in x))
+    return sorted_nums
+
+
+def roman_sort_with_ints(arr):
+    """
+    Contributed by ChatGPT, who didn't know how to use .upper()
+    """
+    roman_dict = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
+
+    def roman_to_int(num):
+        if isinstance(num, str):
+            num = num.upper()
+        result = 0
+        for i in range(len(num)):
+            if i > 0 and roman_dict[num[i]] > roman_dict[num[i - 1]]:
+                result += roman_dict[num[i]] - 2 * roman_dict[num[i - 1]]
+            else:
+                result += roman_dict[num[i]]
+        return result
+
+    def int_or_roman(elem):
+        try:
+            return int(elem)
+        except ValueError:
+            return roman_to_int(elem)
+
+    sorted_arr = sorted(arr, key=int_or_roman)
+    return sorted_arr
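Since saved pages are named by their printed label, front matter like `iv` has to sort ahead of the numeric body pages; that is what `roman_sort_with_ints` is for (the main script imports it below). One caveat: the first helper, `roman_sort`, just sums letter values, so subtractive numerals like `IV` score 6 rather than 4 and can sort out of order; only `roman_sort_with_ints` converts properly. A quick usage sketch with illustrative values:

```python
from fucts.roman import roman_sort_with_ints

# Mixed Roman front-matter labels and Arabic body pages, unsorted:
pages = ['x', '2', 'iv', '1', 'ix']
print(roman_sort_with_ints(pages))  # ['1', '2', 'iv', 'ix', 'x']
```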
requirements.txt
@@ -3,4 +3,6 @@ webdriver-manager
 tqdm
 pillow
 pyautogui
 selenium-wire
+img2pdf
+selenium-requests
@@ -1,24 +1,28 @@
#!/usr/bin/env python
import argparse
import time
from pathlib import Path

import img2pdf
import selenium
from PIL import Image
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# from webdriver_manager.firefox import GeckoDriverManager
# from selenium.webdriver.firefox.service import Service as FirefoxService
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

from fucts.roman import roman_sort_with_ints

parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load in seconds.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')  # TODO
parser.add_argument('--start-page', default=0, type=int, help='Start on this page. Pages start at zero and include any non-numbered pages.')
parser.add_argument('--end-page', default=0, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
args = parser.parse_args()
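For reference, the flags above imply an invocation along the lines of `python main.py --isbn 9781234567890 --delay 5`. The entry-point filename is a guess here (the diff doesn't show it); only `--isbn` is required, and the README advises raising `--delay` on slow connections.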
@@ -28,13 +32,14 @@ ebook_output = args.output / f'{args.isbn}.pdf'
 ebook_files = args.output / args.isbn
 ebook_files.mkdir(exist_ok=True, parents=True)
 
-options = webdriver.ChromeOptions()
-options.add_experimental_option('prefs', {'download.default_directory': str(ebook_files)})
+chrome_options = webdriver.ChromeOptions()
 if args.disable_web_security:
-    options.add_argument('--disable-web-security')
+    chrome_options.add_argument('--disable-web-security')
     print('DISABLED WEB SECURITY!')
-options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
-driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=options)
+chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
+if args.chrome_exe:
+    chrome_options.binary_location = args.chrome_exe  # '/usr/bin/google-chrome'
+driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
 
 driver.get(f'https://bookshelf.vitalsource.com')
 input('Press ENTER once logged in...')
@@ -76,7 +81,6 @@ bar = tqdm(total=total_pages)
bar.update(page_num)
while page_num < total_pages + 1:
    time.sleep(args.delay)
    img_data = None
    retry_delay = 5
    base_url = None
    for page_retry in range(3):  # retry the page max this many times
@@ -84,30 +88,9 @@ while page_num < total_pages + 1:
        for find_img_retry in range(3):
            for request in driver.requests:
                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    # Wait for the image to load
                    wait = 0
                    while (not request.response or not request.response.body) and wait < 60:
                        time.sleep(1)
                        wait += 1
                    if not request.response or not request.response.body:
                        bar.write(f'Page {page_num} failed to load, will retry later. {request.url}')
                        failed_pages.add(page_num)
                        break

                    base_url = request.url.split('/')
                    try:
                        img_size = int(base_url[-1])
                    except ValueError:
                        bar.write(f'Failed to parse URL for page {page_num}, retrying later: {request.url}')
                        failed_pages.add(page_num)
                        break
                    if img_size > largest_size:
                        base_url = '/'.join(base_url)
                        img_data = request.response.body
                        page_urls.add(request.url)
                        # 2000 is the max size I've seen so we can just exit if it's that.
                        if img_size == 2000:
                            break
                    del base_url[-1]
                    base_url = '/'.join(base_url)
            time.sleep(1)
            if base_url:
                break
@@ -115,14 +98,14 @@ while page_num < total_pages + 1:
        time.sleep(retry_delay)
        retry_delay += 5

    if not img_data:
        bar.write(f'Failed to download image for page {page_num}, retrying later.')
        failed_pages.add(page_num)
    elif not base_url:
        page, _ = get_num_pages()
    if not base_url:
        bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
        failed_pages.add(page_num)
        continue
    else:
        page, _ = get_num_pages()
        page_urls.add((page, base_url))
        bar.write(base_url)
        # If this isn't a numbered page we will need to increment the page count
        try:
            int(page)
@@ -132,40 +115,95 @@ while page_num < total_pages + 1:
        bar.total = total_pages
        bar.refresh()

    dl_file = ebook_files / f'{page}.jpg'
    with open(dl_file, 'wb') as file:
        file.write(img_data)
    # dl_file = ebook_files / f'{page}.jpg'
    # with open(dl_file, 'wb') as file:
    #     file.write(img_data)
    #
    # # Re-save the image to make sure it's in the correct format
    # img = Image.open(dl_file)
    # if img.width != 2000:
    #     bar.write(f'Page {page_num} is only {img.width}px wide, will search for a larger image later.')
    #     small_pages_redo.add(page_num)
    # img.save(dl_file, format='JPEG', subsampling=0, quality=100)
    # del img

    # Re-save the image to make sure it's in the correct format
    img = Image.open(dl_file)
    if img.width != 2000:
        bar.write(f'Page {page_num} is only {img.width}px wide, will search for a larger image later.')
        small_pages_redo.add(page_num)
    img.save(dl_file, format='JPEG', subsampling=0, quality=100)
    del img

    bar.write(base_url)
    if page_num == args.end_page:
        bar.write(f'Exiting on page {page_num}.')
        break
    if page == total_pages:
        bar.write(f'Book completed, exiting.')
        break
    if not driver.find_elements(By.CLASS_NAME, 'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce')[0].is_enabled():
        bar.write(f'Book completed, exiting.')
        break

    # Move to the next page
    del driver.requests
    actions = ActionChains(driver)
    actions.send_keys(Keys.RIGHT)
    actions.perform()

    bar.update()
    page_num += 1

# TODO: redo failed pages in failed_pages
# TODO:

driver.close()
bar.close()

# TODO: redo failed_pages items

time.sleep(1)

# print('All pages scraped! Now we must change driver modes to download the pages. Please log back in once the new window pops up.')
# from seleniumrequests import Chrome
#
# driver = Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options)
# driver.get(f'https://bookshelf.vitalsource.com')
# input('Press ENTER once logged in...')
# load_book_page(0)

print('All pages scraped! Now downloading images...')

bar = tqdm(total=len(page_urls))
for page, base_url in page_urls:
    time.sleep(args.delay)
    driver.get(f'{base_url.strip("/")}/2000')  # have to load the page first for cookies reasons
    time.sleep(args.delay)
    retry_delay = 5
    img_data = None
    for page_retry in range(3):  # retry the page max this many times
        largest_size = 0
        for find_img_retry in range(3):
            for request in driver.requests:
                if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    img_data = request.response.body
                    break
    # response = driver.request('GET', f'{base_url.strip("/")}/2000')
    # print(response)
    # response.raise_for_status()
    dl_file = ebook_files / f'{page}.jpg'
    # with open(dl_file, 'wb') as f:
    #     response.raw.decode_content = True
    #     shutil.copyfileobj(response.raw, f)
    if img_data:
        with open(dl_file, 'wb') as file:
            file.write(img_data)
        # Re-save the image to make sure it's in the correct format
        img = Image.open(dl_file)
        img.save(dl_file, format='JPEG', subsampling=0, quality=100)
        del img
    else:
        bar.write(f'Failed to download image: {base_url}')
    bar.update()
    del driver.requests
bar.close()
driver.close()
del driver

page_files = [str(ebook_files / f'{x}.jpg') for x in roman_sort_with_ints([str(x.stem) for x in list(ebook_files.iterdir())])]
pdf = img2pdf.convert(page_files)
with open(ebook_output, 'wb') as f:
    f.write(pdf)

# TODO: maybe scrape book title to name the PDF file?
# TODO: also maybe embed the title in the PDF file?

# TODO: make PDF

# TODO: scrape table of contents and insert
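A caveat on the end-of-book check in the hunk above: Selenium's `By.CLASS_NAME` expects a single class name. A space-separated compound like `'IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce'` gets translated into a CSS selector where only the first token keeps its leading dot, so the lookup matches nothing and the `[0]` index is liable to raise IndexError. A sketch of the equivalent CSS-selector form, assuming those obfuscated class names (which come from the site's CSS-in-JS build and may rotate between deploys) are still current:

```python
from selenium.webdriver.common.by import By

# All four classes on one element, expressed as a single CSS selector.
buttons = driver.find_elements(By.CSS_SELECTOR, '.IconButton__button-bQttMI.gHMmeA.sc-oXPCX.mwNce')
if not buttons or not buttons[0].is_enabled():
    print('Next-page button missing or disabled; treating this as the end of the book.')
```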
|
||||
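On the title TODOs: img2pdf can embed document metadata at conversion time (its `convert()` accepts PDF metadata keywords such as `title`), so once a title is scraped it could be passed straight through. A sketch under that assumption; `book_title` is hypothetical, since nothing scrapes it yet:

```python
book_title = 'Example Book Title'  # hypothetical: would be scraped from the reader UI

# Pass the metadata alongside the image list when building the PDF.
pdf = img2pdf.convert(page_files, title=book_title)
with open(ebook_output, 'wb') as f:
    f.write(pdf)
```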
|
|
Loading…
Reference in New Issue