initial prototype

This commit is contained in:
Cyberes 2023-03-12 22:41:19 -06:00
parent 2c5c266d20
commit 9139497871
6 changed files with 139 additions and 0 deletions

5
.gitignore vendored
View File

@@ -1,3 +1,8 @@
venv/
.idea/
geckodriver
VitalSource/
# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -1,2 +1,3 @@
# vitalsource2pdf
unset SNAP_NAME; unset SNAP_INSTANCE_NAME

5
requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
selenium
webdriver-manager
requests
pyshadow
tqdm

114
vitalsource2pdf.py Normal file
View File

@@ -0,0 +1,114 @@
import argparse
import time
from pathlib import Path
import selenium
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# from webdriver_manager.firefox import GeckoDriverManager
# from selenium.webdriver.firefox.service import Service as FirefoxService
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager
from vitalsource_scraper.file import download_file
# --- CLI configuration ---
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=8, type=int, help='Delay between pages to let them load.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')
args = parser.parse_args()
# Normalize the output directory to a Path object.
args.output = Path(args.output)
# --- Browser setup ---
# Earlier Firefox/Chromium attempts, kept for reference:
# driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
# version = read_version_from_cmd('/usr/bin/chromium-browser --version', PATTERN[ChromeType.CHROMIUM])
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager(version='111.0.5563.64', chrome_type=ChromeType.CHROMIUM).install()))
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(f'https://bookshelf.vitalsource.com')
# Manual login: the user authenticates in the opened browser window first.
input('Press ENTER once logged in...')
driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/0')
# Poll until the reader's page-counter element exists (its innerHTML looks like
# "<current> / <total>" — presumably; the class names "sc-knKHOI gGldJU" are
# generated by the reader's CSS-in-JS and may change — TODO confirm), then read
# the total page count from it. --pages overrides the scraped value.
while True:
    try:
        num_pages = int(driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML').replace(' ', '').split('/')[1]) if not args.pages else args.pages
        break
    except selenium.common.exceptions.JavascriptException:
        # Element not in the DOM yet; keep polling.
        time.sleep(1)
print('Total number of pages:', num_pages)
def load_page(page_id):
    """Navigate the reader to *page_id* and block until the page has loaded.

    "Loaded" means the page-counter element exists and the loading spinner
    is gone. Relies on the module-level ``driver`` and ``args``.
    """
    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')
    # Wait for the page-counter element to appear (the reader is a JS app,
    # so the DOM is empty right after navigation).
    while True:
        try:
            driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
            break
        except selenium.common.exceptions.JavascriptException:
            time.sleep(1)
    # Wait for the loading spinner to disappear. BUGFIX: the original used
    # By.CLASS_NAME with "sc-AjmGg dDNaMw" — class names cannot contain
    # spaces, so that locator never matched and the wait was a no-op.
    # A CSS selector requiring both classes expresses the intent correctly.
    while len(driver.find_elements(By.CSS_SELECTOR, '.sc-AjmGg.dDNaMw')):
        time.sleep(1)
# Placeholder for captured authentication headers (not referenced below —
# NOTE(review): verify before removing).
auth_headers = {}
def form_header(input_headers):
    """Convert an iterable of ``(name, value)`` header pairs into a dict.

    selenium-wire exposes captured request headers as key/value pairs;
    ``requests`` wants a mapping. When a header name repeats, the last
    value wins.
    """
    return {name: value for name, value in input_headers}
# Dead experiment: re-downloading the page image using the browser's cookies
# instead of the captured request headers. Kept for reference.
# cookies = driver.get_cookies()
# r_cookies = {}
# for cookie in cookies:
#     r_cookies[cookie['name']] = cookie['value']
# # s.cookies.set(cookie['name'], cookie['value'])
# download_file('https://jigsaw.vitalsource.com/books/9781524976422/images/553246736447566b5831395573413731646d5371495171745037726a2f49564a6d574c424e424e793154513d0a/encrypted/2000', '/home/dpanzer/test.jpg', cookies=r_cookies)
# NOTE(review): img_sizes and all_images are never referenced below —
# presumably placeholders for multi-resolution support; verify before removing.
img_sizes = [2000, 1600, 800]
# Image base URLs already captured; seeded with None (sentinel — a real URL
# string never collides with it).
page_urls = {None}
all_images = []
# Main scrape loop. For each page: wait for the reader to fetch the page image
# from jigsaw.vitalsource.com, capture that request's URL and auth headers via
# selenium-wire, then re-download the image ourselves into --output.
args.output.mkdir(parents=True, exist_ok=True)
for page_num in tqdm(iterable=range(num_pages)):
    time.sleep(args.delay)
    for attempt in range(3):  # up to 3 attempts per page (was shadowing `i`)
        base_url = None
        headers = {}
        for _ in range(60):  # poll captured requests for up to ~60 s
            for request in driver.requests:
                if request.response and request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    # Drop the trailing size component ("/2000" etc.); the
                    # remainder identifies the page image.
                    base_url = request.url.split('/')
                    del base_url[-1]
                    base_url = '/'.join(base_url)
                    if base_url in page_urls:
                        # BUGFIX: a stale capture from an earlier page used to
                        # break out with base_url still set (and empty headers),
                        # re-downloading the wrong image. Ignore it instead.
                        base_url = None
                        continue
                    page_urls.add(base_url)
                    headers = form_header(request.headers)
            if base_url:
                # BUGFIX: the original kept polling the full 60 iterations
                # even after the URL was found, adding ~60 s to every page.
                break
            time.sleep(1)
        if base_url:
            del driver.requests  # clear the capture buffer for the next page
            # BUGFIX: was hard-coded to /home/dpanzer/test.jpg, overwriting
            # every page; save each page under --output instead.
            download_file(base_url, str(args.output / f'{page_num}.jpg'), headers=headers)
            tqdm.write(base_url)
            break
        else:
            tqdm.write(f'Failed to find image on page {page_num}, reloading.')
            load_page(page_num)
            time.sleep(20)
    # Advance the reader to the next page.
    actions = ActionChains(driver)
    actions.send_keys(Keys.RIGHT)
    actions.perform()
driver.close()

View File

View File

@@ -0,0 +1,14 @@
import requests
def download_file(url, full_output_path, headers=None):
    """Stream *url* to *full_output_path* and return the path written.

    Args:
        url: Source URL.
        full_output_path: Destination file path (str or Path).
        headers: Optional mapping of HTTP headers to send (e.g. captured
            auth headers). Defaults to None, meaning no extra headers —
            a backward-compatible generalization of the old required arg.

    Returns:
        full_output_path, unchanged.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    # stream=True keeps the body off the heap; we copy it to disk in chunks.
    with requests.get(url, stream=True, headers=headers) as r:
        r.raise_for_status()
        with open(full_output_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return full_output_path