initial prototype
This commit is contained in:
parent
2c5c266d20
commit
9139497871
|
@ -1,3 +1,8 @@
|
|||
venv/
|
||||
.idea/
|
||||
geckodriver
|
||||
VitalSource/
|
||||
|
||||
# ---> Python
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
selenium
|
||||
webdriver-manager
|
||||
requests
|
||||
pyshadow
|
||||
tqdm
|
|
@ -0,0 +1,114 @@
|
|||
import argparse
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import selenium
|
||||
from selenium.webdriver import ActionChains, Keys
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.common.by import By
|
||||
# from webdriver_manager.firefox import GeckoDriverManager
|
||||
# from selenium.webdriver.firefox.service import Service as FirefoxService
|
||||
from seleniumwire import webdriver
|
||||
from tqdm import tqdm
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
|
||||
from vitalsource_scraper.file import download_file
|
||||
|
||||
# ---- Command-line interface -------------------------------------------------
# --isbn is the only mandatory flag; everything else has a sane default.
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument(
    '--delay', type=int, default=8,
    help='Delay between pages to let them load.',
)
parser.add_argument(
    '--pages', type=int, default=None,
    help='Override how many pages to save.',
)
args = parser.parse_args()

# Normalise the output directory to a Path object once, up front.
args.output = Path(args.output)
|
||||
|
||||
# driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
|
||||
# version = read_version_from_cmd('/usr/bin/chromium-browser --version', PATTERN[ChromeType.CHROMIUM])
|
||||
# driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager(version='111.0.5563.64', chrome_type=ChromeType.CHROMIUM).install()))
|
||||
|
||||
# Spawn a Chrome instance driven through selenium-wire so we can inspect
# the network traffic later.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Let the user authenticate manually in the spawned browser window.
driver.get('https://bookshelf.vitalsource.com')
input('Press ENTER once logged in...')

# Open the first page of the requested book.
driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/0')

if args.pages:
    # Explicit override from the CLI: no need to wait for the reader UI.
    num_pages = args.pages
else:
    # Poll until the reader's "current / total" page counter is rendered,
    # then parse the total out of it.
    while True:
        try:
            counter = driver.execute_script(
                'return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
            num_pages = int(counter.replace(' ', '').split('/')[1])
            break
        except selenium.common.exceptions.JavascriptException:
            # Element not present yet; keep waiting.
            time.sleep(1)
print('Total number of pages:', num_pages)
|
||||
|
||||
|
||||
def load_page(page_id):
    """Navigate the reader to *page_id* and block until it has rendered.

    Args:
        page_id: Zero-based page identifier used in the reader URL.

    Relies on the module-level ``driver`` and ``args`` objects.
    """
    driver.get(f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}')

    # Wait until the page counter element exists — a proxy for "the reader
    # UI has loaded".
    while True:
        try:
            driver.execute_script('return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML')
            break
        except selenium.common.exceptions.JavascriptException:
            time.sleep(1)

    # Wait for the loading indicator to disappear.
    # FIX: the element carries two classes ("sc-AjmGg" and "dDNaMw");
    # By.CLASS_NAME with a space becomes the descendant selector
    # ".sc-AjmGg dDNaMw" and never matches, so use a compound CSS selector.
    while driver.find_elements(By.CSS_SELECTOR, ".sc-AjmGg.dDNaMw"):
        time.sleep(1)
|
||||
|
||||
|
||||
# NOTE(review): this dict is never read or written anywhere in this file —
# it looks like an unused leftover; confirm against the rest of the project
# before removing.
auth_headers = {}
|
||||
|
||||
|
||||
def form_header(input_headers):
    """Flatten an iterable of (name, value) header pairs into a plain dict.

    Later entries win when a header name repeats.
    """
    return {entry[0]: entry[1] for entry in input_headers}
|
||||
|
||||
|
||||
# cookies = driver.get_cookies()
|
||||
# r_cookies = {}
|
||||
# for cookie in cookies:
|
||||
# r_cookies[cookie['name']] = cookie['value']
|
||||
# # s.cookies.set(cookie['name'], cookie['value'])
|
||||
# download_file('https://jigsaw.vitalsource.com/books/9781524976422/images/553246736447566b5831395573413731646d5371495171745037726a2f49564a6d574c424e424e793154513d0a/encrypted/2000', '/home/dpanzer/test.jpg', cookies=r_cookies)
|
||||
|
||||
# Make sure the requested output directory exists before saving anything.
# FIX: pages were previously written to a hard-coded '/home/dpanzer/test.jpg'
# path, overwriting each other and silently ignoring --output.
args.output.mkdir(parents=True, exist_ok=True)

# Image base URLs already handled; seeded with None so membership tests on a
# fresh string URL are always well-defined.
page_urls = {None}

for page_num in tqdm(iterable=range(num_pages)):
    # Give the reader time to fire its image requests before scanning them.
    time.sleep(args.delay)

    # Up to three attempts per page: scan the captured network traffic for
    # this page's image request, reloading the page between failed attempts.
    for attempt in range(3):
        base_url = None
        headers = {}
        # NOTE(review): this polls the captured requests a fixed 60 times
        # (~60 s) even after an image URL is found — confirm an early exit
        # is safe before tightening it.
        for _ in range(60):
            for request in driver.requests:
                if request.response and request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                    # Strip the trailing size component (e.g. '/2000') to
                    # obtain the base image URL.
                    parts = request.url.split('/')
                    del parts[-1]
                    base_url = '/'.join(parts)
                    if base_url in page_urls:
                        # Already handled this image; stop scanning.
                        break
                    page_urls.add(base_url)
                    headers = form_header(request.headers)
                    print(headers)
            time.sleep(1)

        if base_url:
            # Drop the captured traffic so the next page starts clean.
            del driver.requests
            download_file(base_url, args.output / f'{page_num}.jpg', headers=headers)
            tqdm.write(base_url)
            break
        else:
            tqdm.write(f'Failed to find image on page {page_num}, reloading.')
            load_page(page_num)
            time.sleep(20)

    # Advance the reader to the next page.
    actions = ActionChains(driver)
    actions.send_keys(Keys.RIGHT)
    actions.perform()

driver.close()
|
|
@ -0,0 +1,14 @@
|
|||
import requests
|
||||
|
||||
|
||||
def download_file(url, full_output_path, headers=None):
    """Stream *url* to *full_output_path* and return that path.

    Args:
        url: Resource to fetch.
        full_output_path: Destination file path (str or Path).
        headers: Optional HTTP headers to send with the request (e.g. auth
            headers captured from the browser session). Defaults to None so
            the function can also be used without captured headers.

    Returns:
        full_output_path, unchanged, for caller convenience.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    # stream=True keeps large files out of memory: the body is consumed
    # chunk by chunk below instead of being buffered whole.
    with requests.get(url, stream=True, headers=headers) as r:
        r.raise_for_status()
        with open(full_output_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return full_output_path
|
Loading…
Reference in New Issue