fix path issues, clean up, try to exclude already downloaded pages
This commit is contained in:
parent
fe96f19eca
commit
c1988263d6
|
@ -1,6 +1,7 @@
|
|||
import json
|
||||
import os
|
||||
from http.cookiejar import MozillaCookieJar
|
||||
from pathlib import Path
|
||||
|
||||
import jsonpickle
|
||||
import yaml
|
||||
|
@ -17,7 +18,7 @@ with open("credentials.yaml", 'r') as f:
|
|||
API_URL = credentials["API_URL"]
|
||||
API_KEY = credentials["API_KEY"]
|
||||
USER_ID = credentials["USER_ID"]
|
||||
COOKIES_PATH = credentials["COOKIES_PATH"]
|
||||
# Expand "~" BEFORE resolving: resolve() makes the path absolute against the
# current working directory, after which a leading "~" is no longer the first
# path component and expanduser() would silently do nothing.
COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).expanduser().resolve())
|
||||
COOKIE_JAR = MozillaCookieJar(COOKIES_PATH)
|
||||
COOKIE_JAR.load(ignore_discard=True, ignore_expires=True)
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ def download_course_files(course, course_view):
|
|||
try:
|
||||
files = list(course.get_files())
|
||||
except canvasapi.exceptions.Forbidden:
|
||||
print('Files view disabled for this course.')
|
||||
print('Files view is disabled for this course.')
|
||||
return
|
||||
|
||||
for file in tqdm(files, desc='Downloading Files'):
|
||||
|
|
|
@ -6,6 +6,9 @@ def make_valid_filename(input_str):
|
|||
if not input_str:
|
||||
return input_str
|
||||
|
||||
# Make sure we have a string and not PosixPath
|
||||
input_str = str(input_str)
|
||||
|
||||
# Remove invalid characters
|
||||
valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
|
||||
input_str = input_str.replace("+", " ") # Canvas default for spaces
|
||||
|
@ -23,6 +26,7 @@ def make_valid_filename(input_str):
|
|||
|
||||
|
||||
def make_valid_folder_path(input_str):
|
||||
input_str = str(input_str)
|
||||
# Remove invalid characters
|
||||
valid_chars = "-_.()/ %s%s" % (string.ascii_letters, string.digits)
|
||||
input_str = input_str.replace("+", " ") # Canvas default for spaces
|
||||
|
@ -41,14 +45,15 @@ def make_valid_folder_path(input_str):
|
|||
return input_str
|
||||
|
||||
|
||||
def shorten_file_name(input_string, shorten_by) -> str:
    """Trim ``shorten_by`` characters off the end of a file name.

    One extra character is removed to make room for a trailing "-", which
    marks the name as truncated.

    Args:
        input_string: The file name to shorten; non-string values (for
            example ``pathlib.Path``) are converted with ``str()``.
        shorten_by: Number of characters to remove. Values ``<= 0`` leave
            the input untouched.

    Returns:
        The shortened name ending in "-", or ``input_string`` unchanged when
        it is falsy or ``shorten_by`` is not positive.
    """
    if not input_string or shorten_by <= 0:
        return input_string
    input_string = str(input_string)

    # Shorten string by specified value + 1 for "-" to indicate incomplete
    # file name (trailing periods not allowed). Clamp at zero: a negative
    # stop index would wrap around and keep leading characters instead of
    # removing everything.
    keep = max(len(input_string) - (shorten_by + 1), 0)
    input_string = input_string[:keep]

    # Drop trailing whitespace, periods and dashes so the marker appended
    # below is the only trailing punctuation.
    input_string = input_string.rstrip().rstrip(".").rstrip("-")
    input_string += "-"

    return input_string
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from pathlib import Path
|
||||
from subprocess import run
|
||||
|
||||
SINGLEFILE_BINARY_PATH = "./node_modules/single-file/cli/single-file"
|
||||
|
@ -5,12 +6,16 @@ CHROME_PATH = "/usr/bin/chromium-browser"
|
|||
|
||||
|
||||
def add_quotes(s):
    """Return ``s`` wrapped in exactly one pair of double quotes.

    ``s`` is converted with ``str()`` first so that non-string values such
    as ``pathlib.Path`` are accepted; any double quotes already present at
    either end are stripped before the new pair is added.
    """
    return "\"" + str(s).strip("\"") + "\""
|
||||
|
||||
|
||||
def download_page(url, cookies_path, output_path, output_name_template=""):
|
||||
# TODO: we can probably safely exclude pages that match the regex r'/external_tools/retrieve\?'
|
||||
|
||||
if output_name_template and Path(output_path, output_name_template).exists():
|
||||
print('exists')
|
||||
return
|
||||
|
||||
args = [
|
||||
add_quotes(SINGLEFILE_BINARY_PATH),
|
||||
"--browser-executable-path=" + add_quotes(CHROME_PATH.strip("\"")),
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
requests
|
||||
jsonpickle
|
||||
canvasapi
|
||||
python-dateutil
|
||||
PyYAML
|
||||
tqdm
|
||||
bs4
|
||||
PyYAML==6.0.1
|
||||
beautifulsoup4==4.12.2
|
||||
canvasapi==3.2.0
|
||||
jsonpickle==3.0.2
|
||||
requests==2.31.0
|
||||
tqdm==4.66.1
|
||||
python-dateutil==2.8.2
|
Loading…
Reference in New Issue