#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import subprocess
import tempfile
import time
from pathlib import Path

import img2pdf
import selenium
from pagelabels import PageLabelScheme, PageLabels
from pdfrw import PdfReader as pdfrw_reader
from pdfrw import PdfWriter as pdfrw_writer
from PIL import Image
from PyPDF2 import PdfMerger, PdfReader, PdfWriter
from selenium.webdriver import ActionChains, Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

from fucts.roman import move_romans_to_front, roman_sort_with_ints, try_convert_int
# Command-line interface. --isbn is the only required flag; everything else
# tunes scraping behavior or post-processing.
parser = argparse.ArgumentParser()
parser.add_argument('--output', default='./VitalSource/')
parser.add_argument('--isbn', required=True)
parser.add_argument('--delay', default=2, type=int, help='Delay between pages to let them load in seconds.')
parser.add_argument('--pages', default=None, type=int, help='Override how many pages to save.')  # TODO: not wired up anywhere below yet
parser.add_argument('--start-page', default=0, type=int, help='Start on this page. Pages start at zero and include any non-numbered pages.')
parser.add_argument('--end-page', default=-1, type=int, help='End on this page.')
parser.add_argument('--chrome-exe', default=None, type=str, help='Path to the Chrome executable. Leave blank to auto-detect.')
parser.add_argument('--disable-web-security', action='store_true', help="If pages aren't loading then you can try disabling CORS protections.")
parser.add_argument('--language', default='eng', help='OCR language. Default: "eng"')
parser.add_argument('--skip-scrape', action='store_true', help="Don't scrape anything, just re-build the PDF from existing files.")
parser.add_argument('--only-scrape-metadata', action='store_true', help="Similar to --skip-scrape, but only scrape the metadata.")
parser.add_argument('--skip-ocr', action='store_true', help="Don't do any OCR.")
parser.add_argument('--compress', action='store_true', help="Run compression and optimization. Probably won't do anything as there isn't much more compression that can be done.")
args = parser.parse_args()

# Output layout: <output>/<isbn>/ holds the scraped page images; the
# assembled PDFs land directly in <output>/.
args.output = Path(args.output)
args.output.mkdir(exist_ok=True, parents=True)

# ebook_output = args.output / f'{args.isbn}.pdf'
ebook_files = args.output / args.isbn
ebook_files.mkdir(exist_ok=True, parents=True)

# Filled in by the metadata scrape below ('pages', 'book', 'toc' keys).
book_info = {}
# Count of pages whose on-screen label isn't an integer (cover, roman-numbered
# front matter). Updated while scraping, or recomputed from disk later.
non_number_pages = 0
def get_num_pages():
    """Read the reader's page-position widget.

    Returns (current_page, total): total is the count shown after the '/'
    in the position indicator, current_page is the value of the page-number
    textbox (0 when the box is empty or missing). Blocks, polling once per
    second, until the indicator element exists in the DOM.
    """
    # JS snippets targeting the reader's position indicator and page textbox.
    total_js = 'return document.getElementsByClassName("sc-knKHOI gGldJU")[0].innerHTML'
    current_js = 'return document.getElementsByClassName("InputControl__input-fbzQBk hDtUvs TextField__InputControl-iza-dmV iISUBf")[0].value'
    while True:
        try:
            raw_total = driver.execute_script(total_js)
        except selenium.common.exceptions.JavascriptException:
            # Indicator not rendered yet -- wait and poll again.
            time.sleep(1)
            continue
        total = int(raw_total.strip().split('/')[-1].strip())
        try:
            current_page = driver.execute_script(current_js)
        except selenium.common.exceptions.JavascriptException:
            current_page = 0
        if not current_page:
            # The textbox may be empty so just report page 0.
            current_page = 0
        return current_page, total
def load_book_page(page_id):
    """Navigate the reader to the given internal page id and block until it loads."""
    reader_url = f'https://bookshelf.vitalsource.com/reader/books/{args.isbn}/pageid/{page_id}'
    driver.get(reader_url)
    # get_num_pages() blocks until the position indicator exists, i.e. the
    # reader UI has booted.
    get_num_pages()
    # The loader-animation element disappears once the page has rendered.
    while driver.find_elements(By.CLASS_NAME, "sc-AjmGg dDNaMw"):
        time.sleep(1)
if not args.skip_scrape or args.only_scrape_metadata:
    # --- Browser setup ----------------------------------------------------
    chrome_options = webdriver.ChromeOptions()
    if args.disable_web_security:
        chrome_options.add_argument('--disable-web-security')
        print('DISABLED WEB SECURITY!')
    chrome_options.add_argument('--disable-http2')  # VitalSource's shit HTTP2 server is really slow and will sometimes send bad data.
    if args.chrome_exe:
        chrome_options.binary_location = args.chrome_exe  # '/usr/bin/google-chrome'
    seleniumwire_options = {
        'disable_encoding': True  # Ask the server not to compress the response
    }
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), chrome_options=chrome_options, seleniumwire_options=seleniumwire_options)

    driver.get(f'https://bookshelf.vitalsource.com')
    input('Press ENTER once logged in...')

    driver.maximize_window()
    page_num = args.start_page
    load_book_page(page_num)

    def _await_json(request):
        # Wait up to 30s for a captured request to receive its response,
        # then return the decoded JSON body, or None on failure/empty body.
        wait = 0
        while not request.response and wait < 30:
            time.sleep(1)
            wait += 1
        if not request.response or not request.response.body:
            return None
        return json.loads(request.response.body.decode())

    def _find_image_base_url(progress, page_label):
        # Scan the captured network requests for a page-image URL and strip
        # the trailing size component (e.g. '/2000'). The last matching
        # request wins, mirroring the original scan order. Returns None when
        # nothing showed up after all retries.
        retry_delay = 5
        base_url = None
        for _ in range(3):  # retry the page max this many times
            for _ in range(3):
                for request in driver.requests:
                    if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                        url_parts = request.url.split('/')
                        del url_parts[-1]  # drop the size suffix
                        base_url = '/'.join(url_parts)
                time.sleep(1)
            if base_url:
                return base_url
            progress.write(f'Could not find a matching image for page {page_label}, sleeping {retry_delay}s...')
            time.sleep(retry_delay)
            retry_delay += 5
        return None

    # --- Metadata scrape --------------------------------------------------
    print('Scraping metadata...')
    time.sleep(args.delay * 2)  # give the metadata requests time to fire
    failed = True
    for _ in range(5):
        for request in driver.requests:
            if request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/pages':
                data = _await_json(request)
                if data is None:
                    print('Failed to get pages information.')
                else:
                    book_info['pages'] = data
            elif request.url == f'https://jigsaw.vitalsource.com/info/books.json?isbns={args.isbn}':
                data = _await_json(request)
                if data is None:
                    print('Failed to get book information.')
                else:
                    book_info['book'] = data
            elif request.url == f'https://jigsaw.vitalsource.com/books/{args.isbn}/toc':
                data = _await_json(request)
                if data is None:
                    print('Failed to get TOC information, only got:', list(book_info.keys()))
                else:
                    book_info['toc'] = data
        if 'pages' not in book_info.keys() or 'book' not in book_info.keys() or 'toc' not in book_info.keys():
            print('Missing some book data, only got:', list(book_info.keys()))
        else:
            failed = False
        if not failed:
            break
        print('Retrying metadata scrape in 10s...')
        load_book_page(page_num)
        time.sleep(10)

    if args.only_scrape_metadata:
        driver.close()
        del driver

    if not args.only_scrape_metadata:
        # --- Page scrape ----------------------------------------------
        _, total_pages = get_num_pages()
        if args.start_page > 0:
            # We can't trust the remaining count when starting mid-book, so
            # use a huge sentinel and rely on the disabled "next" button to
            # detect the end of the book.
            print('You specified a start page so ignore the very large page count.')
            total_pages = 99999999999999999
        print('Total number of pages:', total_pages)
        print('Scraping pages...')
        page_urls = set()
        failed_pages = set()
        bar = tqdm(total=total_pages)
        bar.update(page_num)
        while page_num < total_pages + 1:
            time.sleep(args.delay)
            base_url = _find_image_base_url(bar, page_num)
            page, _ = get_num_pages()
            if not base_url:
                bar.write(f'Failed to get a URL for page {page_num}, retrying later.')
                failed_pages.add(page_num)
            else:
                page_urls.add((page, base_url))
                bar.write(base_url)
                # If this isn't a numbered page we will need to increment the page count
                try:
                    int(page)
                except ValueError:
                    total_pages += 1
                    non_number_pages += 1
                    bar.write(f'Non-number page {page}, increasing page count by 1 to: {total_pages}')
                    bar.total = total_pages
                    bar.refresh()
            if page_num == args.end_page:
                bar.write(f'Exiting on page {page_num}.')
                break
            # On the first page the back arrow is disabled and will trigger this
            if isinstance(page_num, int) and page_num > 0:
                try:
                    # If a page forward/backwards button is disabled
                    if driver.execute_script(f'return document.getElementsByClassName("IconButton__button-bQttMI gHMmeA sc-oXPCX mwNce")[0].disabled'):
                        bar.write(f'Book completed, exiting.')
                        break
                except selenium.common.exceptions.JavascriptException:
                    pass
            # Move to the next page
            del driver.requests
            actions = ActionChains(driver)
            actions.send_keys(Keys.RIGHT)
            actions.perform()
            bar.update()
            page_num += 1
        bar.close()

        # --- Retry pages that never produced an image URL -------------
        print('Re-doing failed pages...')
        bar = tqdm(total=len(failed_pages))
        # BUGFIX: iterate a snapshot -- the original added to failed_pages
        # while iterating it, which raises RuntimeError.
        for failed_page in sorted(failed_pages):
            load_book_page(failed_page)
            time.sleep(args.delay)
            base_url = _find_image_base_url(bar, failed_page)
            page, _ = get_num_pages()
            if not base_url:
                # BUGFIX: report the page actually being retried, not the
                # stale page_num left over from the main loop.
                bar.write(f'Failed to get a URL for page {failed_page}, retrying later.')
            else:
                page_urls.add((page, base_url))
                bar.write(base_url)
            del driver.requests
            bar.update(1)
        bar.close()

        time.sleep(1)

        # --- Download the full-size image for every page ---------------
        print('All pages scraped! Now downloading images...')
        bar = tqdm(total=len(page_urls))
        for page, base_url in page_urls:
            success = False
            for _ in range(6):
                del driver.requests
                time.sleep(args.delay / 2)
                driver.get(f'{base_url.strip("/")}/2000')  # request the 2000px-wide rendition
                time.sleep(args.delay / 2)
                img_data = None
                for _ in range(3):
                    for request in driver.requests:
                        if request.url.startswith(f'https://jigsaw.vitalsource.com/books/{args.isbn}/images/'):
                            # BUGFIX: the response may not have arrived yet; the
                            # original dereferenced request.response.body blindly.
                            if request.response and request.response.body:
                                img_data = request.response.body
                            break
                    if img_data:
                        break
                dl_file = ebook_files / f'{page}.jpg'
                if img_data:
                    with open(dl_file, 'wb') as file:
                        file.write(img_data)
                    # Re-save the image to make sure it's in the correct format
                    img = Image.open(dl_file)
                    if img.width != 2000:
                        # Got a low-res rendition; bounce through another page
                        # to reset the reader, then retry this page.
                        bar.write(f'Image too small at {img.width}px wide, retrying: {base_url}')
                        driver.get('https://google.com')
                        time.sleep(8)
                        load_book_page(0)
                        time.sleep(8)
                        continue
                    img.save(dl_file, format='JPEG', subsampling=0, quality=100)
                    del img
                    success = True
                if success:
                    break
            if not success:
                bar.write(f'Failed to download image: {base_url}')
            bar.update()
        bar.close()
        driver.close()
        del driver
else:
    print('Page scrape skipped...')
# Sometimes the book skips a page. Add a blank page if thats the case.
print('Checking for blank pages...')
# Page labels from disk, sorted with roman-numeral front matter first and
# then the numeric pages (ordering comes from the fucts.roman helpers).
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
if non_number_pages == 0:  # We might not have scraped so this number needs to be updated.
    for item in existing_page_files:
        # try_convert_int returns a str for non-numeric labels.
        if isinstance(try_convert_int(item), str):
            non_number_pages += 1
for page in tqdm(iterable=existing_page_files):
    page_i = try_convert_int(page)
    if isinstance(page_i, int) and page_i > 0:
        # Offset the numeric label by the count of unnumbered pages to get
        # its position in the overall sequence.
        page_i += non_number_pages
        # NOTE(review): page_i (a page label plus offset) is used here as a
        # list index -- that label/index conflation looks fragile; confirm
        # against the fucts.roman sort order before relying on it.
        last_page_i = try_convert_int(existing_page_files[page_i - 1])
        if isinstance(last_page_i, int):
            last_page_i = last_page_i + non_number_pages
        if last_page_i != page_i - 1:
            # Predecessor missing: write a blank white page image
            # (width matches the 2000px scraped images).
            img = Image.new('RGB', (2000, 2588), (255, 255, 255))
            img.save(ebook_files / f'{int(page) - 1}.jpg')
            tqdm.write(f'Created blank image for page {int(page) - 1}.')
print('Building PDF...')
# The un-OCR'd PDF assembled straight from the page images.
raw_pdf_file = args.output / f'{args.isbn} RAW.pdf'

# Re-scan the image directory: roman-numbered front matter first, then the
# numeric pages, and map each label back to its .jpg path.
existing_page_files = move_romans_to_front(roman_sort_with_ints([try_convert_int(str(x.stem)) for x in list(ebook_files.iterdir())]))
page_files = [str(ebook_files / f'{x}.jpg') for x in existing_page_files]

raw_pdf_file.write_bytes(img2pdf.convert(page_files))
# Pick the title/author for the PDF metadata, falling back to the ISBN when
# the book-metadata scrape didn't succeed.
if 'book' in book_info.keys() and 'books' in book_info['book'].keys() and len(book_info['book']['books']):
    title = book_info['book']['books'][0]['title']
    author = book_info['book']['books'][0]['author']
else:
    title = args.isbn
    author = 'Unknown'

if not args.skip_ocr:
    print('Running OCR...')
    ocr_in = raw_pdf_file
    # BUGFIX: mkstemp returns an open OS-level file descriptor; the original
    # discarded it, leaking the descriptor. We only need the path.
    ocr_fd, raw_pdf_file = tempfile.mkstemp()
    os.close(ocr_fd)
    # NOTE(review): title/language are interpolated into a shell command
    # (shell=True is needed for $(nproc)); a title containing quotes or
    # shell metacharacters could break or inject the command.
    subprocess.run(f'ocrmypdf -l {args.language} --title "{title}" --jobs $(nproc) --output-type pdfa "{ocr_in}" "{raw_pdf_file}"', shell=True)
else:
    print('Skipping OCR...')

# Add metadata
print('Adding metadata...')
# file_in must stay open until pdf_merger.write() runs further down.
file_in = open(raw_pdf_file, 'rb')
pdf_merger = PdfMerger()
pdf_merger.append(file_in)
pdf_merger.add_metadata({'/Author': author, '/Title': title, '/Creator': f'ISBN: {args.isbn}'})
# Build the PDF outline (bookmarks) from the scraped table of contents.
if 'toc' in book_info.keys():
    print('Creating TOC...')
    for item in book_info['toc']:
        # 'cfi' carries the 1-based page position; outline entries are 0-based.
        pdf_merger.add_outline_item(item['title'], int(item['cfi'].strip('/')) - 1)
else:
    print('Not creating TOC...')

# BUGFIX: close the mkstemp descriptor (the original leaked it) and close
# the output handle after writing (the original passed a bare open() to
# write() and never closed it).
tmp_fd, tmpfile = tempfile.mkstemp()
os.close(tmp_fd)
with open(tmpfile, 'wb') as tmp_out:
    pdf_merger.write(tmp_out)

# Count the leading non-numeric (roman/named) pages so the PDF page labels
# can be split into a front-matter range and an arabic range.
romans_end = 0
for p in existing_page_files:
    if isinstance(p, str):
        romans_end += 1

if romans_end > 0:
    print('Renumbering pages...')
    reader = pdfrw_reader(tmpfile)
    labels = PageLabels.from_pdf(reader)
    # Page 0 is the cover: unnumbered, labeled "Cover".
    labels.append(PageLabelScheme(startpage=0, style='none', prefix='Cover', firstpagenum=1))
    # Front matter: lowercase roman numerals starting at i.
    labels.append(PageLabelScheme(startpage=1, style='roman lowercase', firstpagenum=1))
    # Main body: arabic numbering restarting at 1.
    labels.append(PageLabelScheme(startpage=romans_end, style='arabic', firstpagenum=1))
    labels.write(reader)
    writer = pdfrw_writer()
    writer.trailer = reader
    writer.write(args.output / f'{title}.pdf')
    # BUGFIX: only this branch still has a temp file to clean up; the
    # original called os.remove(tmpfile) unconditionally and crashed with
    # FileNotFoundError after shutil.move had already consumed it.
    os.remove(tmpfile)
else:
    shutil.move(tmpfile, args.output / f'{title}.pdf')
if args.compress:
    print('Compressing PDF...')
    # https://pypdf2.readthedocs.io/en/latest/user/file-size.html
    source_pdf = PdfReader(args.output / f'{title}.pdf')
    compressed = PdfWriter()
    for pdf_page in source_pdf.pages:
        pdf_page.compress_content_streams()  # This is CPU intensive!
        compressed.add_page(pdf_page)
    with open(args.output / f'{title} compressed.pdf', 'wb') as out_file:
        compressed.write(out_file)