fix path issues, clean up, try to exclude already downloaded pages

This commit is contained in:
Cyberes 2023-10-27 18:59:58 -06:00
parent fe96f19eca
commit c1988263d6
5 changed files with 28 additions and 17 deletions

View File

@ -1,6 +1,7 @@
import json
import os
from http.cookiejar import MozillaCookieJar
from pathlib import Path
import jsonpickle
import yaml
@ -17,7 +18,7 @@ with open("credentials.yaml", 'r') as f:
API_URL = credentials["API_URL"]
API_KEY = credentials["API_KEY"]
USER_ID = credentials["USER_ID"]
# Absolute path to the cookies file named in credentials.yaml.
# "~" must be expanded BEFORE resolving: resolve() on "~/cookies.txt" yields
# "<cwd>/~/cookies.txt", after which expanduser() no longer sees a leading "~".
# resolve() already returns an absolute path, so no extra absolute() is needed.
COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).expanduser().resolve())
COOKIE_JAR = MozillaCookieJar(COOKIES_PATH)
COOKIE_JAR.load(ignore_discard=True, ignore_expires=True)

View File

@ -24,7 +24,7 @@ def download_course_files(course, course_view):
try:
files = list(course.get_files())
except canvasapi.exceptions.Forbidden:
print('Files view disabled for this course.')
print('Files view is disabled for this course.')
return
for file in tqdm(files, desc='Downloading Files'):

View File

@ -6,6 +6,9 @@ def make_valid_filename(input_str):
if not input_str:
return input_str
# Make sure we have a string and not PosixPath
input_str = str(input_str)
# Remove invalid characters
valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
input_str = input_str.replace("+", " ") # Canvas default for spaces
@ -23,6 +26,7 @@ def make_valid_filename(input_str):
def make_valid_folder_path(input_str):
input_str = str(input_str)
# Remove invalid characters
valid_chars = "-_.()/ %s%s" % (string.ascii_letters, string.digits)
input_str = input_str.replace("+", " ") # Canvas default for spaces
@ -41,14 +45,15 @@ def make_valid_folder_path(input_str):
return input_str
def shorten_file_name(input_string, shorten_by) -> str:
    """Shorten a file name and mark it as truncated with a trailing "-".

    Args:
        input_string: The file name to shorten. May be a ``str`` or a
            path-like object; it is converted with ``str()`` before slicing.
        shorten_by: Number of characters that must be removed. One extra
            character is removed to make room for the "-" marker.

    Returns:
        The shortened name ending in "-". If ``input_string`` is falsy or
        ``shorten_by`` is not positive, ``input_string`` is returned unchanged.
    """
    if not input_string or shorten_by <= 0:
        return input_string

    # Path objects cannot be sliced; work on the plain string form.
    input_string = str(input_string)

    # Drop shorten_by characters plus one for the "-" marker. Clamp at zero so
    # that asking to remove more than the whole name yields an empty stem
    # instead of a negative slice end that would keep part of the name.
    keep = max(len(input_string) - (shorten_by + 1), 0)
    input_string = input_string[:keep]

    # Trailing whitespace/periods are not allowed in file names, and an
    # existing trailing "-" would otherwise be doubled by the marker.
    input_string = input_string.rstrip().rstrip(".").rstrip("-")
    input_string += "-"

    return input_string

View File

@ -1,3 +1,4 @@
from pathlib import Path
from subprocess import run
SINGLEFILE_BINARY_PATH = "./node_modules/single-file/cli/single-file"
@ -5,12 +6,16 @@ CHROME_PATH = "/usr/bin/chromium-browser"
def add_quotes(s):
    """Return ``s`` wrapped in exactly one pair of double quotes.

    ``s`` is converted with ``str()`` first so that path-like objects are
    accepted. Any double quotes already present at either end are stripped
    before the new pair is added, so quoting an already-quoted value does not
    double the quotes.
    """
    return "\"" + str(s).strip("\"") + "\""
def download_page(url, cookies_path, output_path, output_name_template=""):
# TODO: we can probably safely exclude pages that match the regex r'/external_tools/retrieve\?'
if output_name_template and Path(output_path, output_name_template).exists():
print('exists')
return
args = [
add_quotes(SINGLEFILE_BINARY_PATH),
"--browser-executable-path=" + add_quotes(CHROME_PATH.strip("\"")),

View File

@ -1,7 +1,7 @@
requests
jsonpickle
canvasapi
python-dateutil
PyYAML
tqdm
bs4
PyYAML==6.0.1
beautifulsoup4==4.12.2
canvasapi==3.2.0
jsonpickle==3.0.2
requests==2.31.0
tqdm==4.66.1
python-dateutil==2.8.2