fix path issues, clean up, try to exclude already downloaded pages

2023-10-27 18:59:58 -06:00 · 2023-10-27 18:59:58 -06:00 · c1988263d6
parent fe96f19eca
commit c1988263d6
5 changed files with 28 additions and 17 deletions
--- a/export.py
+++ b/export.py
@ -1,6 +1,7 @@
 import json
 import os
 from http.cookiejar import MozillaCookieJar
+from pathlib import Path

 import jsonpickle
 import yaml
@ -17,7 +18,7 @@ with open("credentials.yaml", 'r') as f:
 API_URL = credentials["API_URL"]
 API_KEY = credentials["API_KEY"]
 USER_ID = credentials["USER_ID"]
-COOKIES_PATH = credentials["COOKIES_PATH"]
+COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).expanduser().resolve())  # expand "~" first; resolve() would otherwise anchor a literal "~" under the CWD
 COOKIE_JAR = MozillaCookieJar(COOKIES_PATH)
 COOKIE_JAR.load(ignore_discard=True, ignore_expires=True)

--- a/module/download_canvas.py
+++ b/module/download_canvas.py
@ -24,7 +24,7 @@ def download_course_files(course, course_view):
    try:
        files = list(course.get_files())
    except canvasapi.exceptions.Forbidden:
-        print('Files view disabled for this course.')
+        print('Files view is disabled for this course.')
        return

    for file in tqdm(files, desc='Downloading Files'):
--- a/module/helpers.py
+++ b/module/helpers.py
@ -6,6 +6,9 @@ def make_valid_filename(input_str):
    if not input_str:
        return input_str

+    # Make sure we have a string and not PosixPath
+    input_str = str(input_str)
+
    # Remove invalid characters
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    input_str = input_str.replace("+", " ")  # Canvas default for spaces
@ -23,6 +26,7 @@ def make_valid_filename(input_str):


 def make_valid_folder_path(input_str):
+    input_str = str(input_str)
    # Remove invalid characters
    valid_chars = "-_.()/ %s%s" % (string.ascii_letters, string.digits)
    input_str = input_str.replace("+", " ")  # Canvas default for spaces
@ -41,14 +45,15 @@ def make_valid_folder_path(input_str):
    return input_str


-def shorten_file_name(string, shorten_by) -> str:
-    if not string or shorten_by <= 0:
-        return string
+def shorten_file_name(input_string, shorten_by) -> str:
+    if not input_string or shorten_by <= 0:
+        return input_string
+    input_string = str(input_string)

    # Shorten string by specified value + 1 for "-" to indicate incomplete file name (trailing periods not allowed)
-    string = string[:len(string) - (shorten_by + 1)]
+    input_string = input_string[:len(input_string) - (shorten_by + 1)]

-    string = string.rstrip().rstrip(".").rstrip("-")
-    string += "-"
+    input_string = input_string.rstrip().rstrip(".").rstrip("-")
+    input_string += "-"

-    return string
+    return input_string
--- a/module/singlefile.py
+++ b/module/singlefile.py
@ -1,3 +1,4 @@
+from pathlib import Path
 from subprocess import run

 SINGLEFILE_BINARY_PATH = "./node_modules/single-file/cli/single-file"
@ -5,12 +6,16 @@ CHROME_PATH = "/usr/bin/chromium-browser"


 def add_quotes(s):
-    return "\"" + s.strip("\"") + "\""
+    return "\"" + str(s).strip("\"") + "\""


 def download_page(url, cookies_path, output_path, output_name_template=""):
    # TODO: we can probably safely exclude pages that match the regex r'/external_tools/retrieve\?'

+    if output_name_template and Path(output_path, output_name_template).exists():
+        print('exists')
+        return
+
    args = [
        add_quotes(SINGLEFILE_BINARY_PATH),
        "--browser-executable-path=" + add_quotes(CHROME_PATH.strip("\"")),
--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +1,7 @@
-requests
-jsonpickle
-canvasapi
-python-dateutil
-PyYAML
-tqdm
-bs4
+PyYAML==6.0.1
+beautifulsoup4==4.12.2
+canvasapi==3.2.0
+jsonpickle==3.0.2
+requests==2.31.0
+tqdm==4.66.1
+python-dateutil==2.8.2