refactor structure

2024-01-26 08:55:17 -07:00 · 2024-01-26 08:55:17 -07:00 · 55c2532075
parent 796500e954
commit 55c2532075
9 changed files with 315 additions and 485 deletions
--- a/export.py
+++ b/export.py
@ -5,22 +5,22 @@ from http.cookiejar import MozillaCookieJar
 from pathlib import Path
 import canvasapi
 import jsonpickle
 import requests
 import yaml
 from canvasapi import Canvas
-from module.const import COURSES_TO_SKIP, OUTPUT_LOCATION
+from module.const import global_consts
-from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_module_pages, download_submission_attachments, download_course_grades_page, download_course_home_page_html, download_course_html
+from module.download_canvas import download_assignments, download_course_modules, download_course_grades_page, download_course_announcement_pages, download_course_home_page_html, download_course_discussion_pages
-from module.get_canvas import find_course_announcements, find_course_assignments, find_course_discussions, find_course_modules, find_course_pages
+from module.get_canvas import find_course_pages, find_course_modules, find_course_assignments, find_course_announcements, find_course_discussions
-from module.items import CanvasCourse
+from module.items import CanvasCourse, jsonify_anything
 from module.singlefile import download_page
 from module.user_files import download_user_files
 SCRIPT_PATH = os.path.abspath(os.path.dirname(__file__))
 def export_all_course_data(c):
-    json_data = json.dumps(json.loads(jsonpickle.encode(c, unpicklable=False)), indent=4)
+    json_data = jsonify_anything(c)
    course_output_dir = os.path.join(OUTPUT_LOCATION, c.term, c.name)
    if not os.path.exists(course_output_dir):
        os.makedirs(course_output_dir)
@ -48,17 +48,17 @@ if __name__ == "__main__":
    with open("credentials.yaml", 'r') as f:
        credentials = yaml.full_load(f)
-    API_URL = credentials["API_URL"]
+    global_consts.API_URL = credentials["API_URL"]
-    API_KEY = credentials["API_KEY"]
+    global_consts.API_KEY = credentials["API_KEY"]
-    USER_ID = credentials["USER_ID"]
+    global_consts.USER_ID = credentials["USER_ID"]
-    COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).resolve().expanduser().absolute())
+    global_consts.COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).resolve().expanduser().absolute())
-    if not Path(COOKIES_PATH).is_file():
+    if not Path(global_consts.COOKIES_PATH).is_file():
-        print('The cookies file does not exist:', COOKIES_PATH)
+        print('The cookies file does not exist:', global_consts.COOKIES_PATH)
        quit(1)
-    COOKIE_JAR = MozillaCookieJar(COOKIES_PATH)
+    global_consts.COOKIE_JAR = MozillaCookieJar(global_consts.COOKIES_PATH)
-    COOKIE_JAR.load(ignore_discard=True, ignore_expires=True)
+    global_consts.COOKIE_JAR.load(ignore_discard=True, ignore_expires=True)
    # ==================================================================================================================
    # Initialization
@ -68,18 +68,18 @@ if __name__ == "__main__":
        print("Creating output directory:", OUTPUT_LOCATION)
        os.makedirs(OUTPUT_LOCATION)
-    if COOKIES_PATH:
+    if global_consts.COOKIES_PATH:
        # Test the cookies.
        print("Authenticating with Canvas frontend...")
        # Requests takes a dict, not the MozillaCookieJar object.
-        request_cookies = {c.name: c.value for c in COOKIE_JAR}
+        request_cookies = {c.name: c.value for c in global_consts.COOKIE_JAR}
-        r = requests.get(f'{API_URL}/profile', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}, cookies=request_cookies)
+        r = requests.get(f'{global_consts.API_URL}/profile', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}, cookies=request_cookies)
        if r.status_code != 200:
            print('Failed to fetch Canvas profile: got status code', r.status_code)
            quit(1)
-        if not r.url.startswith(API_URL):
+        if not r.url.startswith(global_consts.API_URL):
            print('Failed to fetch Canvas profile: client was redirected away from Canvas:')
            print(r.url)
            quit(1)
@ -93,7 +93,7 @@ if __name__ == "__main__":
        print('No cookies file specified! No HTML pages will be saved.')
    print("Authenticating with Canvas API...")
-    canvas = Canvas(API_URL, API_KEY)
+    canvas = Canvas(global_consts.API_URL, global_consts.API_KEY)
    courses = canvas.get_courses(include="term")
    try:
        course_count = len(list(courses))
@ -108,15 +108,17 @@ if __name__ == "__main__":
    print('')
-    skip = set(COURSES_TO_SKIP)
+    skip = set(global_consts.COURSES_TO_SKIP)
    # ==================================================================================================================
    # Exporting
    print("Downloading courses page...")
-    download_course_html(API_URL, COOKIES_PATH)
+    courses_dict = {v['id']: v for v in json.loads(jsonify_anything(courses))['_elements']}
    (global_consts.OUTPUT_LOCATION / 'courses.json').write_text(json.dumps(courses_dict))
    download_page(global_consts.API_URL + "/courses/", global_consts.OUTPUT_LOCATION, "courses.html")
-    if not args.user_files:
+    if args.user_files:
        print('Downloading user files...')
        download_user_files(canvas, OUTPUT_LOCATION / 'User Files')
@ -128,56 +130,55 @@ if __name__ == "__main__":
        if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term"):
            continue
-        course_view = CanvasCourse(course)
+        resolved_canvas_course = CanvasCourse(course)
-        if args.term and args.term != course_view.term:
+        if args.term and args.term != resolved_canvas_course.term:
-            print('Skipping term:', course_view.term, '\n')
+            print('Skipping term:', resolved_canvas_course.term, '\n')
            continue
-        print(f"=== {course_view.term}: {course_view.name} ===")
+        print(f"=== {resolved_canvas_course.term}: {resolved_canvas_course.name} ===")
-        valid, r = course_view.test_course(API_URL, COOKIE_JAR)
+        valid, r = resolved_canvas_course.test_course(global_consts.API_URL, global_consts.COOKIE_JAR)
        if not valid:
-            print(f'Invalid course: {course_view.course_id} - {r} - {r.text}')
+            print(f'Invalid course: {resolved_canvas_course.course_id} - {r} - {r.text}')
            if r.status_code == 401:
                # We can't recover from this error.
                quit(1)
            continue
-        course_view.assignments = find_course_assignments(course, USER_ID)
+        resolved_canvas_course.modules = find_course_modules(course)
-        course_view.announcements = find_course_announcements(course)
+        resolved_canvas_course.assignments = find_course_assignments(course)
-        course_view.discussions = find_course_discussions(course)
+        resolved_canvas_course.announcements = find_course_announcements(course)
-        course_view.pages = find_course_pages(course)
+        resolved_canvas_course.discussions = find_course_discussions(course)
-        course_view.modules = find_course_modules(course, course_view)
+        resolved_canvas_course.pages = find_course_pages(course)
-        all_courses_views.append(course_view)
+        all_courses_views.append(resolved_canvas_course)
        print('Downloading course home page...')
-        download_course_home_page_html(API_URL, course_view, COOKIES_PATH)
+        download_course_home_page_html(resolved_canvas_course)
        print('Downloading grades...')
-        download_course_grades_page(API_URL, course_view, COOKIES_PATH)
+        download_course_grades_page(resolved_canvas_course)
-        download_assignment_pages(API_URL, course_view, COOKIES_PATH, COOKIE_JAR)
+        download_assignments(resolved_canvas_course)
-        download_course_module_pages(API_URL, course_view, COOKIES_PATH)
+        download_course_modules(resolved_canvas_course)
-        download_course_announcement_pages(API_URL, course_view, COOKIES_PATH)
+        download_course_announcement_pages(resolved_canvas_course)
-        download_course_discussion_pages(API_URL, course_view, COOKIES_PATH)
+        download_course_discussion_pages(resolved_canvas_course)
-        download_course_files(course, course_view)
+        # TODO: nothing to test this on
-
+        # download_course_files(course)
        download_submission_attachments(course, course_view)
        print("Exporting course metadata...")
-        export_all_course_data(course_view)
+        export_all_course_data(resolved_canvas_course)
        if course_count > 1:
            print('')
    # Remove elements from the course objects that can't be JSON serialized, then format it.
-    json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4)
+    json_str = jsonify_anything(all_courses_views)
    all_output_path = os.path.join(OUTPUT_LOCATION, "all_output.json")
    with open(all_output_path, "w") as out_file:
--- a/module/api/init.py
+++ b/module/api/init.py
--- a/module/api/file.py
+++ b/module/api/file.py
@ -0,0 +1,21 @@
 import re
 import canvasapi
 from canvasapi.course import Course
 HTML_ITEM_ATTACHED_FILE_RE = re.compile(r'<a .*? data-api-endpoint=\"(.*?)\" .*?>')
 CANVAS_API_FILE_ID_RE = re.compile(r'.*?/api/v1/courses/.*?/files/(.*?)$')
 def get_embedded_files(course: Course, html: str):
     """Fetch the Canvas files linked from an HTML fragment.

     Scans ``html`` for anchors carrying a ``data-api-endpoint`` attribute,
     keeps the endpoints that match the course-file API URL pattern, and
     resolves each distinct file ID through ``course.get_file``.

     :param course: Course whose ``get_file`` is used to resolve file IDs.
     :param html: HTML to scan; ``None`` or an empty string yields no files.
     :return: Set of the objects returned by ``course.get_file``.
     """
     attached_files = set()
     if not html:
         # Nothing to scan; re.findall would raise TypeError on None.
         return attached_files
     # Track IDs already requested so a file linked several times in the same
     # HTML is only fetched from the API once.
     seen_ids = set()
     for endpoint in HTML_ITEM_ATTACHED_FILE_RE.findall(html):
         file_id = CANVAS_API_FILE_ID_RE.match(endpoint)
         if not file_id or file_id.group(1) in seen_ids:
             continue
         seen_ids.add(file_id.group(1))
         try:
             attached_files.add(course.get_file(file_id.group(1)))
         except canvasapi.exceptions.ResourceDoesNotExist:
             # The API reports that this file does not exist; skip the link.
             continue
     return attached_files
--- a/module/const.py
+++ b/module/const.py
@ -1,10 +1,13 @@
 from http.cookiejar import MozillaCookieJar
 from pathlib import Path
 class GlobalConsts:
    # Directory in which to download course information to (will be created if not present)
    OUTPUT_LOCATION = Path("./output").resolve().expanduser().absolute()
    # List of Course IDs that should be skipped (need to be integers)
-COURSES_TO_SKIP = [288290, 512033]
+    COURSES_TO_SKIP = []
    DATE_TEMPLATE = "%B %d, %Y %I:%M %p"
@ -12,3 +15,14 @@ DATE_TEMPLATE = "%B %d, %Y %I:%M %p"
    # Applies to modules, assignments, announcements, and discussions
    # If a folder exceeds this limit, a "-" will be added to the end to indicate it was shortened ("..." not valid)
    MAX_FOLDER_NAME_SIZE = 70
    COOKIES_PATH = ""
    COOKIE_JAR = MozillaCookieJar()
    API_URL = ""
    API_KEY = ""
    USER_ID = ""
 global_consts = GlobalConsts()
--- a/module/download_canvas.py
+++ b/module/download_canvas.py
@ -1,26 +1,20 @@
 import os
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from functools import partial
 from http.cookiejar import MozillaCookieJar
 from pathlib import Path
 import canvasapi
 import requests
 from tqdm import tqdm
-from module.const import OUTPUT_LOCATION, MAX_FOLDER_NAME_SIZE
+from module.api.file import get_embedded_files
 from module.const import global_consts
 from module.helpers import make_valid_filename, make_valid_folder_path, shorten_file_name
 from module.items import CanvasCourse, jsonify_anything
 from module.singlefile import download_page
 from module.threading import download_assignment, download_module_item
 def download_course_files(course, course_view):
-    # file full_name starts with "course files"
+    dl_dir = global_consts.OUTPUT_LOCATION / course_view.term / course_view.name
-    dl_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name)
+    dl_dir.mkdir(parents=True, exist_ok=True)
    # Create directory if not present
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)
    try:
        files = list(course.get_files())
@ -31,205 +25,109 @@ def download_course_files(course, course_view):
    for file in tqdm(files, desc='Downloading Files'):
        try:
            file_folder = course.get_folder(file.folder_id)
-
+            folder_dl_dir = dl_dir / make_valid_folder_path(file_folder.full_name)
-            folder_dl_dir = os.path.join(dl_dir, make_valid_folder_path(file_folder.full_name))
+            folder_dl_dir.mkdir(parents=True, exist_ok=True)
-
+            dl_path = folder_dl_dir / make_valid_filename(str(file.display_name))
            if not os.path.exists(folder_dl_dir):
                os.makedirs(folder_dl_dir)
            dl_path = os.path.join(folder_dl_dir, make_valid_filename(str(file.display_name)))
            # Download file if it doesn't already exist
            if not os.path.exists(dl_path):
                # print('Downloading: {}'.format(dl_path))
            file.download(dl_path)
        except Exception as e:
            tqdm.write(f"Skipping {file.display_name} - {e}")
-def download_course_discussion_pages(api_url, course_view, cookies_path):
+def download_course_discussion_pages(resolved_course: CanvasCourse):
-    if cookies_path == "" or len(course_view.discussions) == 0:
+    if not len(resolved_course.discussions):
        return
-    base_discussion_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "discussions")
+    base_discussion_dir = global_consts.OUTPUT_LOCATION / resolved_course.term / resolved_course.name / 'discussions'
-    if not os.path.exists(base_discussion_dir):
+    base_discussion_dir.mkdir(parents=True, exist_ok=True)
        os.makedirs(base_discussion_dir)
-    discussion_list_dir = os.path.join(base_discussion_dir, "discussion_list.html")
+    # (base_discussion_dir / 'discussions.json').write_text(jsonify_anything(resolved_course.discussions))
    download_page(global_consts.API_URL + "/courses/" + str(resolved_course.course_id) + "/discussion_topics/", base_discussion_dir, "discussions.html")
-    # Download assignment list (theres a chance this might be the course homepage if the course has the assignments page disabled)
+    for discussion in tqdm(list(resolved_course.discussions), desc='Downloading Discussions'):
    if not os.path.exists(discussion_list_dir):
        download_page(api_url + "/courses/" + str(course_view.course_id) + "/discussion_topics/", cookies_path, base_discussion_dir, "discussion_list.html")
    for discussion in tqdm(list(course_view.discussions), desc='Downloading Discussions'):
        discussion_title = make_valid_filename(str(discussion.title))
-        discussion_title = shorten_file_name(discussion_title, len(discussion_title) - MAX_FOLDER_NAME_SIZE)
+        discussion_title = shorten_file_name(discussion_title, len(discussion_title) - global_consts.MAX_FOLDER_NAME_SIZE)
-        discussion_dir = os.path.join(base_discussion_dir, discussion_title)
+        discussion_dir = base_discussion_dir / discussion_title
-        if discussion.url == "":
+        if not discussion.url:
            continue
-        if not os.path.exists(discussion_dir):
+        discussion_dir.mkdir(parents=True, exist_ok=True)
-            os.makedirs(discussion_dir)
+
        for file in get_embedded_files(resolved_course.course, discussion.body):
            file.download(discussion_dir / file.display_name)
        # Downloads each page that a discussion takes.
        for i in range(discussion.amount_pages):
            filename = "discussion_" + str(i + 1) + ".html"
-            discussion_page_dir = os.path.join(discussion_dir, filename)
+            download_page(discussion.url + "/page-" + str(i + 1), discussion_dir, filename)
            # Download assignment page, this usually has instructions and etc.
            if not os.path.exists(discussion_page_dir):
                download_page(discussion.url + "/page-" + str(i + 1), cookies_path, discussion_dir, filename)
-def download_assignment_pages(api_url, course_view, cookies_path, cookie_jar: MozillaCookieJar):
+def download_assignments(course_view: CanvasCourse):
-    if cookies_path == "" or len(course_view.assignments) == 0:
+    if not len(course_view.assignments):
        return
-    base_assign_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "assignments")
+    base_assign_dir = global_consts.OUTPUT_LOCATION / course_view.term / course_view.name / 'assignments'
-    if not os.path.exists(base_assign_dir):
+    base_assign_dir.mkdir(parents=True, exist_ok=True)
        os.makedirs(base_assign_dir)
-    assignment_list_path = os.path.join(base_assign_dir, "assignment_list.html")
+    # (base_assign_dir / 'assignments.json').write_text(jsonify_anything(course_view.assignments))
-
+    download_page(global_consts.API_URL + "/courses/" + str(course_view.course_id) + "/assignments/", base_assign_dir, "assignments.html")
    # Download assignment list (theres a chance this might be the course homepage if the course has the assignments page disabled)
    if not os.path.exists(assignment_list_path):
        download_page(api_url + "/courses/" + str(course_view.course_id) + "/assignments/", cookies_path, base_assign_dir, "assignment_list.html")
    with ThreadPoolExecutor(max_workers=3) as executor:
-        download_func = partial(download_assignment, cookies_path, cookie_jar, base_assign_dir)
+        download_func = partial(download_assignment, base_assign_dir, course_view.course)
        list(tqdm(executor.map(download_func, course_view.assignments), total=len(course_view.assignments), desc='Downloading Assignments'))
-def download_course_announcement_pages(api_url, course_view, cookies_path):
+def download_course_announcement_pages(resolved_course: CanvasCourse):
-    """
+    if not len(resolved_course.announcements):
    Download assignment list.
    There's a chance this might be the course homepage if the course has the assignments page disabled.
    :param api_url:
    :param course_view:
    :param cookies_path:
    :return:
    """
    if cookies_path == "" or len(course_view.announcements) == 0:
        return
-    base_announce_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "announcements")
+    base_announce_dir = global_consts.OUTPUT_LOCATION / resolved_course.term / resolved_course.name / 'announcements'
-    if not os.path.exists(base_announce_dir):
+    base_announce_dir.mkdir(parents=True, exist_ok=True)
        os.makedirs(base_announce_dir)
    announcement_list_dir = os.path.join(base_announce_dir, "announcement_list.html")
    if not os.path.exists(announcement_list_dir):
        download_page(api_url + "/courses/" + str(course_view.course_id) + "/announcements/", cookies_path, base_announce_dir, "announcement_list.html")
-    for announcements in tqdm(list(course_view.announcements), desc='Downloading Announcements'):
+    # (base_announce_dir / 'announcements.json').write_text(jsonify_anything(resolved_course.announcements))
-        announcements_title = make_valid_filename(str(announcements.title))
+    download_page(global_consts.API_URL + "/courses/" + str(resolved_course.course_id) + "/announcements/", base_announce_dir, "announcements.html")
        announcements_title = shorten_file_name(announcements_title, len(announcements_title) - MAX_FOLDER_NAME_SIZE)
        announce_dir = os.path.join(base_announce_dir, announcements_title)
-        if announcements.url == "":
+    for announcement in tqdm(list(resolved_course.announcements), desc='Downloading Announcements'):
        announcements_title = make_valid_filename(str(announcement.title))
        announcements_title = shorten_file_name(announcements_title, len(announcements_title) - global_consts.MAX_FOLDER_NAME_SIZE)
        announce_dir = base_announce_dir / announcements_title
        if not announcement.url:
            continue
-        if not os.path.exists(announce_dir):
+        announce_dir.mkdir(parents=True, exist_ok=True)
            os.makedirs(announce_dir)
-        # Downloads each page that a discussion takes.
+        for file in get_embedded_files(resolved_course.course, announcement.body):
-        for i in range(announcements.amount_pages):
+            file.download(announce_dir / file.display_name)
        for i in range(announcement.amount_pages):
            filename = "announcement_" + str(i + 1) + ".html"
-            announcement_page_dir = os.path.join(announce_dir, filename)
+            download_page(announcement.url + "/page-" + str(i + 1), announce_dir, filename)
            # Download assignment page, this usually has instructions and etc.
            if not os.path.exists(announcement_page_dir):
                download_page(announcements.url + "/page-" + str(i + 1), cookies_path, announce_dir, filename)
-def download_submission_attachments(course, course_view):
+def download_course_home_page_html(course_view):
-    course_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name)
+    dl_dir = global_consts.OUTPUT_LOCATION / course_view.term / course_view.name
-
+    dl_dir.mkdir(parents=True, exist_ok=True)
-    # Create directory if not present
+    download_page(global_consts.API_URL + "/courses/" + str(course_view.course_id), dl_dir, "homepage.html")
    if not os.path.exists(course_dir):
        os.makedirs(course_dir)
    for assignment in tqdm(list(course_view.assignments), desc='Downloading Submissions'):
        for submission in assignment.submissions:
            assignment_title = make_valid_filename(str(assignment.title))
            assignment_title = shorten_file_name(assignment_title, len(assignment_title) - MAX_FOLDER_NAME_SIZE)
            attachment_dir = os.path.join(course_dir, "assignments", assignment_title)
            if len(assignment.submissions) != 1:
                attachment_dir = os.path.join(attachment_dir, str(submission.user_id))
            if not os.path.exists(attachment_dir) and submission.attachments:
                os.makedirs(attachment_dir)
            for attachment in submission.attachments:
                filepath = os.path.join(attachment_dir, make_valid_filename(str(attachment.id) + "_" + attachment.filename))
                if not os.path.exists(filepath):
                    # print('Downloading attachment: {}'.format(filepath))
                    r = requests.get(attachment.url, allow_redirects=True)
                    with open(filepath, 'wb') as f:
                        f.write(r.content)
                # else:
                #     print('File already exists: {}'.format(filepath))
-def download_course_html(api_url, cookies_path):
+def download_course_modules(course_view: CanvasCourse):
-    if cookies_path == "":
+    modules_dir = global_consts.OUTPUT_LOCATION / course_view.term / course_view.name / 'modules'
-        return
+    modules_dir.mkdir(parents=True, exist_ok=True)
-    course_dir = OUTPUT_LOCATION
+    # (modules_dir / 'modules.json').write_text(jsonify_anything(course_view.modules))
-
+    download_page(global_consts.API_URL + "/courses/" + str(course_view.course_id) + "/modules/", modules_dir, "modules.html")
    if not os.path.exists(course_dir):
        os.makedirs(course_dir)
    course_list_path = os.path.join(course_dir, "course_list.html")
    # Downloads the course list.
    if not os.path.exists(course_list_path):
        download_page(api_url + "/courses/", cookies_path, course_dir, "course_list.html")
 def download_course_home_page_html(api_url, course_view, cookies_path):
    if cookies_path == "":
        return
    dl_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name)
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)
    homepage_path = os.path.join(dl_dir, "homepage.html")
    # Downloads the course home page.
    if not os.path.exists(homepage_path):
        download_page(api_url + "/courses/" + str(course_view.course_id), cookies_path, dl_dir, "homepage.html")
 def download_course_module_pages(api_url, course_view, cookies_path):
    if cookies_path == "" or len(course_view.modules) == 0:
        return
    modules_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "modules")
    if not os.path.exists(modules_dir):
        os.makedirs(modules_dir)
    # Downloads the modules page (possible this is disabled by the teacher)
    module_list_dir = Path(str(modules_dir), "modules_list.html")
    if not os.path.exists(module_list_dir):
        download_page(api_url + "/courses/" + str(course_view.course_id) + "/modules/", cookies_path, modules_dir, "modules_list.html")
    with ThreadPoolExecutor(max_workers=3) as executor:
        for module in tqdm(list(course_view.modules), desc='Downloading Modules'):
-            bar = tqdm(list(module.items), leave=False, desc=module.name)
+            bar = tqdm(list(module.items), leave=False, desc=module.module.name)
-            futures = [executor.submit(download_module_item, module, item, modules_dir, cookies_path) for item in module.items]
+            futures = [executor.submit(download_module_item, course_view.course, module, item, modules_dir) for item in module.items]
            for _ in as_completed(futures):
                bar.update()
            bar.close()
-def download_course_grades_page(api_url, course_view, cookies_path):
+def download_course_grades_page(course_view: CanvasCourse):
-    if cookies_path == "":
+    dl_dir = global_consts.OUTPUT_LOCATION / course_view.term / course_view.name
        return
    dl_dir = Path(OUTPUT_LOCATION, course_view.term, course_view.name)
    dl_dir.mkdir(parents=True, exist_ok=True)
-
+    api_target = f'{global_consts.API_URL}/courses/{course_view.course_id}/grades'
-    # TODO: command line arg to prohibit overwrite. Default should overwrite
+    download_page(api_target, dl_dir, "grades.html")
    if not (dl_dir / "grades.html").exists():
        api_target = f'{api_url}/courses/{course_view.course_id}/grades'
        download_page(api_target, cookies_path, dl_dir, "grades.html")
--- a/module/get_canvas.py
+++ b/module/get_canvas.py
@ -1,113 +1,51 @@
 import os
 import re
-from http.cookiejar import MozillaCookieJar
+from typing import List
 import canvasapi
 import dateutil.parser
-import requests
+from canvasapi.discussion_topic import DiscussionTopic
 from bs4 import BeautifulSoup
 from tqdm import tqdm
-from module.const import DATE_TEMPLATE, OUTPUT_LOCATION, MAX_FOLDER_NAME_SIZE
+from module.const import global_consts
-from module.helpers import make_valid_filename, shorten_file_name
+from module.items import CanvasDiscussion, CanvasPage, CanvasTopicEntry, CanvasTopicReply, CanvasModule
 from module.items import AssignmentView, AttachmentView, DiscussionView, CanvasModuleItem, CanvasModule, PageView, SubmissionView, TopicEntryView, TopicReplyView
-MODULE_ITEM_ATTACHED_FILE_RE = re.compile(r'<a .*? data-api-endpoint="(.*?)" .*?>')
+HTML_ITEM_ATTACHED_FILE_RE = re.compile(r'<a .*? data-api-endpoint=\"(.*?)\" .*?>')
 CANVAS_API_FILE_ID_RE = re.compile(r'.*?/api/v1/courses/.*?/files/(.*?)$')
-def find_course_modules(course, course_view):
+def find_course_modules(course) -> List[CanvasModule]:
-    modules_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "modules")
+    # modules_dir = os.path.join(global_consts.OUTPUT_LOCATION, course_view.term, course_view.name, "modules")
-    # Create modules directory if not present
+    results = []
    if not os.path.exists(modules_dir):
        os.makedirs(modules_dir)
    module_views = []
    try:
        modules = list(course.get_modules())
        for module in tqdm(modules, desc='Fetching Modules'):
            module_view = CanvasModule()
            module_view.id = module.id if hasattr(module, "id") else ""
            module_view.name = str(module.name) if hasattr(module, "name") else ""
            try:
-                # Get items for each module
+                resolved_module = CanvasModule(module)
-                for item in module.get_module_items():
+                for item in resolved_module.items:
-                    module_item = CanvasModuleItem()
+                    if item.item.type == 'Page':
-                    module_item.id = item.id if hasattr(item, "id") else 0
+                        page = course.get_page(item.item.page_url)
-                    module_item.title = str(item.title).replace('  ', ' ') if hasattr(item, "title") else ""
+                        item.page = page
                    module_item.content_type = str(item.type) if hasattr(item, "type") else ""
                    module_item.url = str(item.html_url) if hasattr(item, "html_url") else ""
                    module_item.external_url = str(item.external_url) if hasattr(item, "external_url") else ""
                    if module_item.content_type == "File":
                        # If problems arise due to long pathnames, changing module.name to module.id might help
                        # A change would also have to be made in downloadCourseModulePages(api_url, course_view, cookies_path)
                        module_name = make_valid_filename(str(module.name))
                        module_name = shorten_file_name(module_name, len(module_name) - MAX_FOLDER_NAME_SIZE)
                        module_dir = os.path.join(modules_dir, module_name, "files")
                        try:
                            # Create directory for current module if not present
                            if not os.path.exists(module_dir):
                                os.makedirs(module_dir)
                            # Get the file object
                            module_file = course.get_file(str(item.content_id))
                            # Create path for module file download
                            module_file_path = os.path.join(module_dir, make_valid_filename(str(module_file.display_name)))
                            # Download file if it doesn't already exist
                            if not os.path.exists(module_file_path):
                                module_file.download(module_file_path)
                        except Exception as e:
                            tqdm.write(f"Skipping module file download that gave the following error: {e} - {item}")
                    elif item.type == 'Page':
                        page = course.get_page(item.page_url)
                        if hasattr(page, 'body'):
                            # Extract the attached files from the item's HTML.
-                            file_matches = re.findall(MODULE_ITEM_ATTACHED_FILE_RE, page.body)
+                            file_matches = re.findall(HTML_ITEM_ATTACHED_FILE_RE, page.body)
                            for match in file_matches:
                                file_id = re.match(CANVAS_API_FILE_ID_RE, match)
                                if file_id:
                                    try:
                                        # Grab the metadata from the API.
                                        canvas_file = course.get_file(file_id.group(1))
-                                    module_item.attached_files.add(canvas_file)
+                                        item.attached_files.add(canvas_file)
-
+                                    except canvasapi.exceptions.ResourceDoesNotExist:
-                    module_view.items.append(module_item)
+                                        continue
                results.append(resolved_module)
            except Exception as e:
                tqdm.write(f"Skipping module file download that gave the following error: {e}")
    except Exception as e:
        tqdm.write(f"Skipping module file download that gave the following error: {e}")
-            module_views.append(module_view)
+    return results
    except Exception as e:
        print("Skipping entire module that gave the following error:")
        print(e)
    return module_views
 def get_extra_assignment_files(html, cookie_jar: MozillaCookieJar):
    soup = BeautifulSoup(html, 'html.parser')
    urls = [a['data-api-endpoint'] for a in soup.find_all('a', {'data-api-returntype': 'File'})]
    s = requests.Session()
    for cookie in cookie_jar:
        s.cookies.set(cookie.name, cookie.value)
    extra_files = []
    for item in urls:
        r = s.get(item)
        if r.status_code != 200:
            continue
        j = r.json()
        extra_files.append((j['display_name'], j['url']))
    return extra_files
 def get_course_page_urls(course):
@ -132,18 +70,18 @@ def find_course_pages(course):
        for url in tqdm(page_urls, desc='Fetching Pages'):
            page = course.get_page(url)
-            page_view = PageView()
+            page_view = CanvasPage()
            page_view.id = page.id if hasattr(page, "id") else 0
            page_view.title = str(page.title).replace('  ', ' ') if hasattr(page, "title") else ""
            page_view.body = str(page.body) if hasattr(page, "body") else ""
            if hasattr(page, "created_at"):
-                page_view.created_date = dateutil.parser.parse(page.created_at).strftime(DATE_TEMPLATE)
+                page_view.created_date = dateutil.parser.parse(page.created_at).strftime(global_consts.DATE_TEMPLATE)
            else:
                page_view.created_date = ''
            if hasattr(page, "updated_at"):
-                page_view.last_updated_date = dateutil.parser.parse(page.updated_at).strftime(DATE_TEMPLATE)
+                page_view.last_updated_date = dateutil.parser.parse(page.updated_at).strftime(global_consts.DATE_TEMPLATE)
            else:
                page_view.last_updated_date = ''
@ -154,83 +92,31 @@ def find_course_pages(course):
    return page_views
-def find_course_assignments(course, user_id):
+def find_course_assignments(course):
-    assignment_views = []
+    results = []
    # Get all assignments
    assignments = list(course.get_assignments())
    for assignment in tqdm(assignments, desc='Fetching Assignments'):
-        assignment_view = AssignmentView()
+        # Have to re-define the object because the `/api/v1/courses/:course_id/assignments` endpoint is sometimes outdated.
-        assignment_view.id = assignment.id if hasattr(assignment, "id") else ""
+        # The endpoint `/api/v1/courses/:course_id/assignments/:id` has the most up to date data.
-        assignment_view.title = make_valid_filename(str(assignment.name).replace('  ', ' ')) if hasattr(assignment, "name") else ""
+        assignment = course.get_assignment(assignment.id)
-        assignment_view.description = str(assignment.description) if hasattr(assignment, "description") else ""
+        results.append(assignment)
-        assignment_view.assigned_date = assignment.created_at_date.strftime(DATE_TEMPLATE) if hasattr(assignment, "created_at_date") else ""
+    return results
        assignment_view.due_date = assignment.due_at_date.strftime(DATE_TEMPLATE) if hasattr(assignment, "due_at_date") else ""
        assignment_view.html_url = assignment.html_url if hasattr(assignment, "html_url") else ""
        assignment_view.ext_url = str(assignment.url) if hasattr(assignment, "url") else ""
        assignment_view.updated_url = str(assignment.submissions_download_url).split("submissions?")[0] if hasattr(assignment, "submissions_download_url") else ""
        # Download submission for this user only
        submissions = [assignment.get_submission(user_id)]
        if not len(submissions):
            raise IndexError(f'No submissions found for assignment: {vars(assignment)}')
        try:
            for submission in submissions:
                sub_view = SubmissionView()
                sub_view.id = submission.id if hasattr(submission, "id") else 0
                sub_view.grade = str(submission.grade) if hasattr(submission, "grade") else ""
                sub_view.raw_score = str(submission.score) if hasattr(submission, "score") else ""
                sub_view.total_possible_points = str(assignment.points_possible) if hasattr(assignment, "points_possible") else ""
                sub_view.submission_comments = str(submission.submission_comments) if hasattr(submission, "submission_comments") else ""
                sub_view.attempt = submission.attempt if hasattr(submission, "attempt") and submission.attempt is not None else 0
                sub_view.user_id = str(submission.user_id) if hasattr(submission, "user_id") else ""
                sub_view.preview_url = str(submission.preview_url) if hasattr(submission, "preview_url") else ""
                sub_view.ext_url = str(submission.url) if hasattr(submission, "url") else ""
                try:
                    submission.attachments
                except AttributeError:
                    print('No attachments')
                else:
                    for attachment in submission.attachments:
                        attach_view = AttachmentView()
                        attach_view.url = attachment.url
                        attach_view.id = attachment.id
                        attach_view.filename = attachment.filename
                        sub_view.attachments.append(attach_view)
                assignment_view.submissions.append(sub_view)
        except Exception as e:
            raise
            # print("Skipping submission that gave the following error:")
            # print(e)
        assignment_views.append(assignment_view)
    return assignment_views
 def find_course_announcements(course):
    announcement_views = []
-
+    announcements: List[DiscussionTopic] = list(course.get_discussion_topics(only_announcements=True))
    # try:
    announcements = list(course.get_discussion_topics(only_announcements=True))
    for announcement in tqdm(announcements, desc='Fetching Announcements'):
        discussion_view = get_discussion_view(announcement)
        announcement_views.append(discussion_view)
    # except Exception as e:
    #     print("Skipping announcement that gave the following error:")
    #     print(e)
    return announcement_views
 def get_discussion_view(discussion_topic):
    # Create discussion view
-    discussion_view = DiscussionView()
+    discussion_view = CanvasDiscussion(discussion_topic)
    discussion_view.id = discussion_topic.id if hasattr(discussion_topic, "id") else 0
    discussion_view.title = str(discussion_topic.title).replace('  ', ' ') if hasattr(discussion_topic, "title") else ""
    discussion_view.author = str(discussion_topic.user_name) if hasattr(discussion_topic, "user_name") else ""
@ -250,7 +136,7 @@ def get_discussion_view(discussion_topic):
                topic_entries_counter += 1
                # Create new discussion view for the topic_entry
-                topic_entry_view = TopicEntryView()
+                topic_entry_view = CanvasTopicEntry()
                topic_entry_view.id = topic_entry.id if hasattr(topic_entry, "id") else 0
                topic_entry_view.author = str(topic_entry.user_name) if hasattr(topic_entry, "user_name") else ""
                topic_entry_view.posted_date = topic_entry.created_at_date.strftime("%B %d, %Y %I:%M %p") if hasattr(topic_entry, "created_at_date") else ""
@ -262,7 +148,7 @@ def get_discussion_view(discussion_topic):
                try:
                    for topic_reply in topic_entry_replies:
                        # Create new topic reply view
-                        topic_reply_view = TopicReplyView()
+                        topic_reply_view = CanvasTopicReply()
                        topic_reply_view.id = topic_reply.id if hasattr(topic_reply, "id") else 0
                        topic_reply_view.author = str(topic_reply.user_name) if hasattr(topic_reply, "user_name") else ""
                        topic_reply_view.posted_date = topic_reply.created_at_date.strftime("%B %d, %Y %I:%M %p") if hasattr(topic_reply, "created_at_date") else ""
@ -286,15 +172,8 @@ def get_discussion_view(discussion_topic):
 def find_course_discussions(course):
    """
    Build a discussion view for every discussion topic of ``course``.

    :param course: object exposing ``get_discussion_topics()``.
    :return: list holding one ``get_discussion_view`` result per topic, in
        listing order.
    """
    # The listing is fully materialised before the progress bar starts.
    topics = list(course.get_discussion_topics())
    return [get_discussion_view(topic) for topic in tqdm(topics, desc='Fetching Discussions')]
--- a/module/items.py
+++ b/module/items.py
@ -1,29 +1,64 @@
 import json
 from http.cookiejar import MozillaCookieJar
 from typing import List, Any
 import requests
 from canvasapi.assignment import Assignment
 from canvasapi.course import Course
 from canvasapi.file import File
 from canvasapi.module import ModuleItem, Module
 from canvasapi.page import Page
 from module.helpers import make_valid_filename
 def varsify(item) -> Any:
    """
    Recursively reduce ``item`` to plain values, lists and dicts.

    * ``None``, ``str``, ``int``, ``float`` and ``bool`` are returned unchanged.
    * Lists and sets become lists of converted elements.
    * Dicts keep their keys and have their values converted.
    * Any other object is replaced by a dict of its ``vars()``, leaving out
      every attribute whose name starts with an underscore.

    Conversion is best effort: a value that exposes no ``__dict__``, or a
    structure nested deeper than the recursion limit, is returned as-is.

    :param item: the value to convert.
    :return: the converted structure, or ``item`` itself when it cannot be
        converted.
    """
    try:
        if item is None or isinstance(item, (str, int, float, bool)):
            return item
        if isinstance(item, (list, set)):
            return [varsify(x) for x in item]
        if isinstance(item, dict):
            # Dicts have no __dict__, so they must be walked explicitly for
            # objects stored inside them to be converted as well.
            return {k: varsify(v) for k, v in item.items()}
        # Private attributes are dropped whatever the type of their value.
        return {k: varsify(v) for k, v in vars(item).items() if not str(k).startswith('_')}
    except (TypeError, RecursionError):
        # TypeError: vars() on something without __dict__ (e.g. a tuple).
        # RecursionError: cyclic or extremely deep object graph.
        return item
 def jsonify_anything(item):
    """Serialize ``item`` to a JSON string with 4-space indent and sorted keys."""
    plain = varsify(item)
    # default=str stringifies whatever varsify() left in a non-JSON form.
    return json.dumps(plain, default=str, sort_keys=True, indent=4)
 class CanvasModuleItem:
-    def __init__(self):
+    def __init__(self, module_item: ModuleItem):
-        self.id = 0
+        self.item = module_item
        self.title = ""
        self.content_type = ""
        self.url = ""
        self.external_url = ""
        self.attached_files: set[File] = set()
        self.page: Page
 class CanvasModule:
-    def __init__(self):
+    def __init__(self, module: Module):
-        self.id = 0
+        self.module = module
-        self.name = ""
+        self.items: List[CanvasModuleItem] = []
-        self.items = []
+        for item in module.get_module_items():
            i = self.module.get_module_item(item.id)
            self.items.append(CanvasModuleItem(i))
-class PageView:
+class CanvasPage:
    def __init__(self):
        self.id = 0
        self.title = ""
@ -32,7 +67,7 @@ class PageView:
        self.last_updated_date = ""
-class TopicReplyView:
+class CanvasTopicReply:
    def __init__(self):
        self.id = 0
        self.author = ""
@ -40,7 +75,7 @@ class TopicReplyView:
        self.body = ""
-class TopicEntryView:
+class CanvasTopicEntry:
    def __init__(self):
        self.id = 0
        self.author = ""
@ -49,8 +84,9 @@ class TopicEntryView:
        self.topic_replies = []
-class DiscussionView:
+class CanvasDiscussion:
-    def __init__(self):
+    def __init__(self, discussion):
        self.discussion = discussion
        self.id = 0
        self.title = ""
        self.author = ""
@ -61,7 +97,7 @@ class DiscussionView:
        self.amount_pages = 0
-class SubmissionView:
+class CanvasSubmission:
    def __init__(self):
        self.id = 0
        self.attachments = []
@ -75,41 +111,25 @@ class SubmissionView:
        self.ext_url = ""
 class AttachmentView:
    """Plain record for one attachment: its id, file name and URL."""

    def __init__(self) -> None:
        # Every field starts at a neutral default; callers fill them in afterwards.
        self.id, self.filename, self.url = 0, "", ""
 class AssignmentView:
    """Plain record for one assignment; all fields start empty."""

    def __init__(self) -> None:
        self.id = 0
        # Text fields default to the empty string.
        self.title = self.description = self.assigned_date = self.due_date = ""
        # A fresh list per instance, so views never share submissions.
        self.submissions = []
        # URL fields default to the empty string.
        self.html_url = self.ext_url = self.updated_url = ""
 class CanvasCourse:
    def __init__(self, course):
        self.course: Course = course
        self.course_id = course.id if hasattr(course, "id") else 0
        self.term = make_valid_filename(course.term["name"] if hasattr(course, "term") and "name" in course.term.keys() else "")
        self.course_code = make_valid_filename(course.course_code if hasattr(course, "course_code") else "")
        if hasattr(course, 'original_name'):
            self.name = course.original_name
        else:
            self.name = course.name if hasattr(course, "name") else ""
        self.course_code = self.course_code.replace('  ', ' ')
        self.name = self.name.replace('  ', ' ')
-        self.assignments = []
+        self.assignments: List[Assignment] = []
-        self.announcements = []
+        self.announcements: List[CanvasDiscussion] = []
-        self.discussions = []
+        self.discussions: List[CanvasDiscussion] = []
-        self.modules = []
+        self.modules: List[CanvasModule] = []
    def test_course(self, base_url: str, cookie_jar: MozillaCookieJar):
        s = requests.Session()
--- a/module/singlefile.py
+++ b/module/singlefile.py
@ -1,6 +1,8 @@
 from pathlib import Path
 from subprocess import run
 from .const import global_consts
 SINGLEFILE_BINARY_PATH = "./node_modules/single-file/cli/single-file"
 # TODO: have this be specified by a required arg.
@ -11,7 +13,7 @@ def add_quotes(s):
    return "\"" + str(s).strip("\"") + "\""
-def download_page(url, cookies_path, output_path, output_name_template=""):
+def download_page(url, output_path, output_name_template=""):
    # TODO: we can probably safely exclude pages that match the regex r'/external_tools/retrieve\?'
    if output_name_template and Path(output_path, output_name_template).exists():
@ -21,7 +23,7 @@ def download_page(url, cookies_path, output_path, output_name_template=""):
    args = [
        add_quotes(SINGLEFILE_BINARY_PATH),
        "--browser-executable-path=" + add_quotes(CHROME_PATH.strip("\"")),
-        "--browser-cookies-file=" + add_quotes(cookies_path),
+        "--browser-cookies-file=" + add_quotes(global_consts.COOKIES_PATH),
        "--output-directory=" + add_quotes(output_path),
        add_quotes(url)
    ]
--- a/module/threading.py
+++ b/module/threading.py
@ -1,85 +1,80 @@
 import os
 import traceback
 from pathlib import Path
-from module.const import MAX_FOLDER_NAME_SIZE
+from canvasapi.assignment import Assignment
-from module.download import download_file
+from canvasapi.course import Course
-from module.get_canvas import get_extra_assignment_files
+from canvasapi.submission import Submission
 from module.api.file import get_embedded_files
 from module.const import global_consts
 from module.helpers import make_valid_filename, shorten_file_name
-from module.items import CanvasModuleItem, CanvasModule
+from module.items import CanvasModuleItem, jsonify_anything, CanvasModule
 from module.singlefile import download_page
-def download_module_item(module: CanvasModule, item: CanvasModuleItem, modules_dir, cookies_path):
+def download_module_item(course: Course, module: CanvasModule, item: CanvasModuleItem, modules_dir: Path):
    try:
-        module_name = make_valid_filename(str(module.name))
+        module_name = make_valid_filename(str(module.module.name))
-        module_name = shorten_file_name(module_name, len(module_name) - MAX_FOLDER_NAME_SIZE)
+        module_name = shorten_file_name(module_name, len(module_name) - global_consts.MAX_FOLDER_NAME_SIZE)
-        output_dir = Path(modules_dir, module_name)
+        module_dir = modules_dir / module_name
        output_dir.mkdir(parents=True, exist_ok=True)
-        if not item.url:
+        if not hasattr(item.item, 'url') or not item.item.url:
            return
-        # Download attached files
+        module_dir.mkdir(parents=True, exist_ok=True)
        if item.item.type == "File":
            file = course.get_file(item.item.content_id)
            module_file_path = module_dir / make_valid_filename(str(file.display_name))
            file.download(module_file_path)
        else:
            # It's a page, so download the attached files.
            for file in item.attached_files:
-            file.download(output_dir / file.filename)
+                file.download(module_dir / file.filename)
        # Download the module page.
-        html_filename = make_valid_filename(str(item.title)) + ".html"
+        html_filename = make_valid_filename(str(item.item.title)) + ".html"
-        if not (output_dir / html_filename).exists():
+        download_page(item.item.html_url, module_dir, html_filename)
            download_page(item.url, cookies_path, output_dir, html_filename)
    except:
        # TODO: wrap all threaded funcs in this try/catch
        traceback.print_exc()
-def download_assignment(cookies_path, cookie_jar, base_assign_dir, assignment):
+def download_assignment(base_assign_dir: Path, course: Course, assignment: Assignment):
-    assignment_title = make_valid_filename(str(assignment.title))
+    try:
-    assignment_title = shorten_file_name(assignment_title, len(assignment_title) - MAX_FOLDER_NAME_SIZE)
+        assignment_title = make_valid_filename(str(assignment.name))
-    assign_dir = os.path.join(base_assign_dir, assignment_title)
+        assignment_title = shorten_file_name(assignment_title, len(assignment_title) - global_consts.MAX_FOLDER_NAME_SIZE)
        assign_dir = Path(base_assign_dir, assignment_title)
        assign_dir.mkdir(parents=True, exist_ok=True)
-    if assignment.html_url != "":
+        if assignment.html_url:
-        if not os.path.exists(assign_dir):
+            download_page(assignment.html_url, assign_dir, "assignment.html")
            os.makedirs(assign_dir)
-        assignment_page_path = os.path.join(assign_dir, "assignment.html")
+            # Download attached files.
            if assignment.description:
                for file in get_embedded_files(course, assignment.description):
                    file.download(assign_dir / file.display_name)
-        if not os.path.exists(assignment_page_path):
+        # Students cannot view their past attempts, but this logic is left if that's ever implemented in Canvas.
-            download_page(assignment.html_url, cookies_path, assign_dir, "assignment.html")
+        submissions = [assignment.get_submission(global_consts.USER_ID)]
-
+        for submission in submissions:
-        extra_files = get_extra_assignment_files(assignment.description, cookie_jar)
+            download_attempt(submission, assign_dir)
-        for name, url in extra_files:
+            submission_dir = assign_dir / 'submission' / str(submission.id)
-            download_file(url, Path(assign_dir, name), cookie_jar)
+            for attachment in submission.attachments:
-
+                filepath = submission_dir / attachment.display_name
-    for submission in assignment.submissions:
+                if not filepath.exists():
-        download_submission(assignment, submission, assign_dir, cookies_path)
+                    attachment.download(filepath)
    except:
        traceback.print_exc()
-def download_submission(assignment, submission, assign_dir, cookies_path):
+def download_attempt(submission: Submission, assign_dir: Path):
-    submission_dir = assign_dir
+    try:
-
+        submission_dir = assign_dir / 'submission' / str(submission.id)
-    if len(assignment.submissions) != 1:
+        submission_dir.mkdir(parents=True, exist_ok=True)
-        submission_dir = os.path.join(assign_dir, str(submission.user_id))
+        for file in submission.attachments:
-
+            file.download(submission_dir / file.display_name)
-    if submission.preview_url != "":
+        if submission.preview_url:
-        if not os.path.exists(submission_dir):
+            download_page(submission.preview_url, submission_dir, f'{submission.id}.html')
-            os.makedirs(submission_dir)
+    except:
-
+        traceback.print_exc()
        submission_page_dir = os.path.join(submission_dir, "submission.html")
        if not os.path.exists(submission_page_dir):
            download_page(submission.preview_url, cookies_path, submission_dir, "submission.html")
    if (submission.attempt != 1 and assignment.updated_url != "" and assignment.html_url != ""
            and assignment.html_url.rstrip("/") != assignment.updated_url.rstrip("/")):
        submission_dir = os.path.join(assign_dir, "attempts")
        if not os.path.exists(submission_dir):
            os.makedirs(submission_dir)
        for i in range(submission.attempt):
            filename = "attempt_" + str(i + 1) + ".html"
            submission_page_attempt_dir = os.path.join(submission_dir, filename)
            if not os.path.exists(submission_page_attempt_dir):
                download_page(assignment.updated_url + "/history?version=" + str(i + 1), cookies_path, submission_dir, filename)