initial prototype
This commit is contained in:
parent
2c5c266d20
commit
9139497871
|
@ -1,3 +1,8 @@
|
||||||
|
venv/
|
||||||
|
.idea/
|
||||||
|
geckodriver
|
||||||
|
VitalSource/
|
||||||
|
|
||||||
# ---> Python
|
# ---> Python
|
||||||
# Byte-compiled / optimized / DLL files
|
# Byte-compiled / optimized / DLL files
|
||||||
__pycache__/
|
__pycache__/
|
||||||
|
|
|
@ -1,2 +1,3 @@
|
||||||
# vitalsource2pdf
|
# vitalsource2pdf
|
||||||
|
|
||||||
|
unset SNAP_NAME; unset SNAP_INSTANCE_NAME
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
selenium
|
||||||
|
webdriver-manager
|
||||||
|
requests
|
||||||
|
pyshadow
|
||||||
|
tqdm
|
|
@ -0,0 +1,114 @@
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import selenium
|
||||||
|
from selenium.webdriver import ActionChains, Keys
|
||||||
|
from selenium.webdriver.chrome.service import Service
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
# from webdriver_manager.firefox import GeckoDriverManager
|
||||||
|
# from selenium.webdriver.firefox.service import Service as FirefoxService
|
||||||
|
from seleniumwire import webdriver
|
||||||
|
from tqdm import tqdm
|
||||||
|
from webdriver_manager.chrome import ChromeDriverManager
|
||||||
|
|
||||||
|
from vitalsource_scraper.file import download_file
|
||||||
|
|
||||||
|
# Command-line options for the scraper.
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=8, type=int, help='Delay between pages to let them load.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')
args = parser.parse_args()

# Keep the output location as a Path object from here on.
args.output = Path(args.output)

# driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
# version = read_version_from_cmd('/usr/bin/chromium-browser --version', PATTERN[ChromeType.CHROMIUM])
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager(version='111.0.5563.64', chrome_type=ChromeType.CHROMIUM).install()))

# Start Chrome through the driver binary that webdriver-manager installs.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)

# The user logs in by hand; the script blocks until ENTER is pressed.
driver.get('https://bookshelf.vitalsource.com')
input('Press ENTER once logged in...')

driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/0')

# Determine the page count: use --pages when given, otherwise poll the reader
# until the element holding the "<current> / <total>" text can be read.
while True:
    try:
        if args.pages:
            num_pages = args.pages
        else:
            counter_text = driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
            total_field = counter_text.replace(' ', '').split('/')[1]
            num_pages = int(total_field)
        break
    except selenium.common.exceptions.JavascriptException:
        # The script throws while the element does not exist yet; retry.
        time.sleep(1)
print('Total number of pages:', num_pages)
|
||||||
|
|
||||||
|
|
||||||
|
def load_page(page_id):
    """Open one page of the book in the shared browser and wait for it.

    Navigates the module-level ``driver`` to the reader URL built from
    ``args.isbn`` and *page_id*, then blocks in two stages:

    1. until the element with classes ``sc-knKHOI gGldJU`` can be read, and
    2. until no element carrying both classes ``sc-AjmGg`` and ``dDNaMw``
       remains (presumably a loading indicator -- confirm against the
       reader's markup).

    Neither wait has an upper bound; the function only returns once both
    conditions hold.

    Args:
        page_id: Value substituted into the ``/pageid/`` URL segment.
    """
    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')

    # Wait for the page to load: the script throws (surfacing as a
    # JavascriptException) for as long as the element does not exist.
    while True:
        try:
            driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
            break
        except selenium.common.exceptions.JavascriptException:
            time.sleep(1)

    # By.CLASS_NAME takes a single class name; a space-separated pair is not
    # a valid class locator, so the original lookup could not match the
    # intended element. A CSS selector requires both classes on one element.
    while driver.find_elements(By.CSS_SELECTOR, '.sc-AjmGg.dDNaMw'):
        time.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): this dict is never read or updated in the visible code --
# possibly a leftover from an earlier approach; confirm before removing.
auth_headers = {}
|
||||||
|
|
||||||
|
|
||||||
|
def form_header(input_headers):
    """Convert captured request headers into a plain ``dict``.

    Args:
        input_headers: Either a mapping-like object exposing ``items()``
            (for example a ``dict`` or an ``email.message.Message``-style
            header container) or an iterable of ``(name, value)`` pairs.

    Returns:
        dict: Header names mapped to their values. When a name occurs more
        than once, the last value wins.
    """
    # Iterating a mapping-like header object yields only the header names,
    # so indexing each item would pair the first two characters of the name
    # instead of name and value. Prefer items() whenever it is available.
    # NOTE(review): the caller passes ``request.headers`` from selenium-wire,
    # which is presumably such a dictionary-like object -- confirm.
    items = getattr(input_headers, 'items', None)
    pairs = items() if callable(items) else input_headers
    return {pair[0]: pair[1] for pair in pairs}
|
||||||
|
|
||||||
|
|
||||||
|
# cookies = driver.get_cookies()
|
||||||
|
# r_cookies = {}
|
||||||
|
# for cookie in cookies:
|
||||||
|
# r_cookies[cookie['name']] = cookie['value']
|
||||||
|
# # s.cookies.set(cookie['name'], cookie['value'])
|
||||||
|
# download_file('https://jigsaw.vitalsource.com/books/9781524976422/images/553246736447566b5831395573413731646d5371495171745037726a2f49564a6d574c424e424e793154513d0a/encrypted/2000', '/home/dpanzer/test.jpg', cookies=r_cookies)
|
||||||
|
|
||||||
|
# Main scraping loop: for every page, inspect the requests captured by the
# driver for an image URL under jigsaw.vitalsource.com for this ISBN, download
# it, then send a RIGHT-arrow key press to the browser.
# NOTE(review): indentation was lost in this view of the file, so the exact
# nesting of the statements below (in particular which loop each `break`,
# `time.sleep(1)` and the `if base_url:` / `else:` pair belong to) cannot be
# determined from here -- check against the real file before editing.
# NOTE(review): img_sizes is not referenced again in the visible code.
img_sizes = [2000, 1600, 800]
|
||||||
|
|
||||||
|
# Base URLs already seen; seeded with None.
page_urls = {None}
|
||||||
|
# NOTE(review): all_images is not referenced again in the visible code.
all_images = []
|
||||||
|
for page_num in tqdm(iterable=range(num_pages)):
|
||||||
|
time.sleep(args.delay)
|
||||||
|
for i in range(3):
|
||||||
|
base_url = None
|
||||||
|
headers = {}
|
||||||
|
# NOTE(review): this loop reuses the name `i` from the `range(3)` loop above.
for i in range(60):
|
||||||
|
for request in driver.requests:
|
||||||
|
# print(request.headers)
|
||||||
|
if request.response and request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
|
||||||
|
# Drop the last path segment of the request URL to form the base URL.
base_url = request.url.split('/')
|
||||||
|
del base_url[-1]
|
||||||
|
base_url = '/'.join(base_url)
|
||||||
|
if base_url in page_urls:
|
||||||
|
break
|
||||||
|
page_urls.add(base_url)
|
||||||
|
headers = form_header(request.headers)
|
||||||
|
print(headers)
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
if base_url:
|
||||||
|
# presumably clears the driver's list of captured requests -- confirm
# against the selenium-wire documentation.
del driver.requests
|
||||||
|
# NOTE(review): the destination is a fixed path, so every download writes
# the same file; args.output is not used here.
download_file(base_url, '/home/dpanzer/test.jpg', headers=headers)
|
||||||
|
tqdm.write(base_url)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
tqdm.write(f'Failed to find image on page {page_num}, reloading.')
|
||||||
|
load_page(page_num)
|
||||||
|
time.sleep(20)
|
||||||
|
|
||||||
|
# Send a RIGHT-arrow key press to the browser (presumably advances the
# reader to the next page -- confirm).
actions = ActionChains(driver)
|
||||||
|
actions.send_keys(Keys.RIGHT)
|
||||||
|
actions.perform()
|
||||||
|
|
||||||
|
driver.close()
|
|
@ -0,0 +1,14 @@
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def download_file(url, full_output_path, headers, timeout=30):
    """Stream the resource at *url* into the file *full_output_path*.

    Args:
        url: Address fetched with an HTTP GET.
        full_output_path: Destination path. It is opened in binary write
            mode, so an existing file is overwritten.
        headers: Request headers passed through to ``requests.get``.
        timeout: Timeout in seconds handed to ``requests.get``. Without one
            a stalled server would block the call indefinitely. Pass
            ``None`` to restore the previous wait-forever behaviour.

    Returns:
        The *full_output_path* that was written.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # stream=True keeps the body out of memory; it is written chunk by chunk.
    with requests.get(url, stream=True, headers=headers, timeout=timeout) as response:
        response.raise_for_status()
        with open(full_output_path, 'wb') as out_file:
            for chunk in response.iter_content(chunk_size=8192):
                out_file.write(chunk)
    return full_output_path
|
Loading…
Reference in New Issue