From 796500e954757632f187ddc89c72d95712900b15 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Thu, 25 Jan 2024 15:43:35 -0700 Subject: [PATCH] download files embedded in modules, add argparse for term and downloading user files --- export.py | 50 +++++++++++++++++++++++-------------- module/const.py | 2 +- module/download_canvas.py | 25 +++++++++---------- module/get_canvas.py | 52 +++++++++++++++++++++++++-------------- module/items.py | 8 +++--- module/threading.py | 34 ++++++++++++++----------- module/user_files.py | 3 +-- 7 files changed, 104 insertions(+), 70 deletions(-) diff --git a/export.py b/export.py index a0e96aa..bdb031b 100644 --- a/export.py +++ b/export.py @@ -1,3 +1,4 @@ +import argparse import json import os from http.cookiejar import MozillaCookieJar @@ -9,10 +10,10 @@ import requests import yaml from canvasapi import Canvas -from module.const import COURSES_TO_SKIP, DL_LOCATION -from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_html, download_course_module_pages, download_submission_attachments, download_course_grades_page, download_course_home_page_html +from module.const import COURSES_TO_SKIP, OUTPUT_LOCATION +from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_module_pages, download_submission_attachments, download_course_grades_page, download_course_home_page_html, download_course_html from module.get_canvas import find_course_announcements, find_course_assignments, find_course_discussions, find_course_modules, find_course_pages -from module.items import CourseView +from module.items import CanvasCourse from module.user_files import download_user_files SCRIPT_PATH = os.path.abspath(os.path.dirname(__file__)) @@ -20,7 +21,7 @@ SCRIPT_PATH = os.path.abspath(os.path.dirname(__file__)) def 
export_all_course_data(c): json_data = json.dumps(json.loads(jsonpickle.encode(c, unpicklable=False)), indent=4) - course_output_dir = os.path.join(DL_LOCATION, c.term, c.name) + course_output_dir = os.path.join(OUTPUT_LOCATION, c.term, c.name) if not os.path.exists(course_output_dir): os.makedirs(course_output_dir) course_output_path = os.path.join(course_output_dir, c.name + ".json") @@ -29,6 +30,15 @@ def export_all_course_data(c): if __name__ == "__main__": + parser = argparse.ArgumentParser(description='') + parser.add_argument('--output', default='./output', help='Output location. If it does not exist, it will be created.') + parser.add_argument('--term', default=None, help='Only download this term.') + parser.add_argument('--user-files', action='store_true', help="Download the user files.") + args = parser.parse_args() + + OUTPUT_LOCATION = Path(args.output).expanduser().resolve() + OUTPUT_LOCATION.mkdir(parents=True, exist_ok=True) + # Startup checks. creds_file = Path(SCRIPT_PATH, 'credentials.yaml') if not creds_file.is_file(): @@ -54,21 +64,16 @@ if __name__ == "__main__": # Initialization print("Welcome to the Canvas Student Data Export Tool") - if not os.path.exists(DL_LOCATION): - print("Creating output directory:", DL_LOCATION) - os.makedirs(DL_LOCATION) + if not os.path.exists(OUTPUT_LOCATION): + print("Creating output directory:", OUTPUT_LOCATION) + os.makedirs(OUTPUT_LOCATION) if COOKIES_PATH: + # Test the cookies. print("Authenticating with Canvas frontend...") - # Test the cookies. - cookies = MozillaCookieJar(COOKIES_PATH) - cookies.load(ignore_discard=True, ignore_expires=True) - # Requests takes a dict, not the MozillaCookieJar object.
- request_cookies = {} - for cookie in cookies: - request_cookies[cookie.name] = cookie.value + request_cookies = {c.name: c.value for c in COOKIE_JAR} r = requests.get(f'{API_URL}/profile', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}, cookies=request_cookies) if r.status_code != 200: @@ -79,6 +84,7 @@ if __name__ == "__main__": print(r.url) quit(1) if 'profileContent__Block' not in r.text: + # TODO: add an arg to skip this check. print('Failed to test Canvas profile: could not find an element with the class "profileContent__Block". This could mean that your authentication is incorrect.') quit(1) @@ -110,8 +116,10 @@ if __name__ == "__main__": print("Downloading courses page...") download_course_html(API_URL, COOKIES_PATH) - print('Downloading user files...') - download_user_files(canvas, DL_LOCATION / 'User Files') + if args.user_files: + print('Downloading user files...') + download_user_files(canvas, OUTPUT_LOCATION / 'User Files') + print('') all_courses_views = [] @@ -120,7 +128,12 @@ if __name__ == "__main__": if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term"): continue - course_view = CourseView(course) + course_view = CanvasCourse(course) + + if args.term and args.term != course_view.term: + print('Skipping term:', course_view.term, '\n') + continue + print(f"=== {course_view.term}: {course_view.name} ===") valid, r = course_view.test_course(API_URL, COOKIE_JAR) @@ -136,6 +149,7 @@ if __name__ == "__main__": course_view.discussions = find_course_discussions(course) course_view.pages = find_course_pages(course) course_view.modules = find_course_modules(course, course_view) + all_courses_views.append(course_view) print('Downloading course home page...') @@ -165,7 +179,7 @@ if __name__ == "__main__": # Remove elements from the course objects that can't be JSON serialized, then format it.
json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4) - all_output_path = os.path.join(DL_LOCATION, "all_output.json") + all_output_path = os.path.join(OUTPUT_LOCATION, "all_output.json") with open(all_output_path, "w") as out_file: out_file.write(json_str) diff --git a/module/const.py b/module/const.py index 6a3f175..a05a220 100644 --- a/module/const.py +++ b/module/const.py @@ -1,7 +1,7 @@ from pathlib import Path # Directory in which to download course information to (will be created if not present) -DL_LOCATION = Path("./output").resolve().expanduser().absolute() +OUTPUT_LOCATION = Path("./output").resolve().expanduser().absolute() # List of Course IDs that should be skipped (need to be integers) COURSES_TO_SKIP = [288290, 512033] diff --git a/module/download_canvas.py b/module/download_canvas.py index 8a127cd..c7b3b31 100644 --- a/module/download_canvas.py +++ b/module/download_canvas.py @@ -8,7 +8,7 @@ import canvasapi import requests from tqdm import tqdm -from module.const import DL_LOCATION, MAX_FOLDER_NAME_SIZE +from module.const import OUTPUT_LOCATION, MAX_FOLDER_NAME_SIZE from module.helpers import make_valid_filename, make_valid_folder_path, shorten_file_name from module.singlefile import download_page from module.threading import download_assignment, download_module_item @@ -16,7 +16,7 @@ from module.threading import download_assignment, download_module_item def download_course_files(course, course_view): # file full_name starts with "course files" - dl_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name) + dl_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name) # Create directory if not present if not os.path.exists(dl_dir): @@ -51,7 +51,7 @@ def download_course_discussion_pages(api_url, course_view, cookies_path): if cookies_path == "" or len(course_view.discussions) == 0: return - base_discussion_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name, 
"discussions") + base_discussion_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "discussions") if not os.path.exists(base_discussion_dir): os.makedirs(base_discussion_dir) @@ -86,7 +86,7 @@ def download_assignment_pages(api_url, course_view, cookies_path, cookie_jar: Mo if cookies_path == "" or len(course_view.assignments) == 0: return - base_assign_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name, "assignments") + base_assign_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "assignments") if not os.path.exists(base_assign_dir): os.makedirs(base_assign_dir) @@ -114,7 +114,7 @@ def download_course_announcement_pages(api_url, course_view, cookies_path): if cookies_path == "" or len(course_view.announcements) == 0: return - base_announce_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name, "announcements") + base_announce_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "announcements") if not os.path.exists(base_announce_dir): os.makedirs(base_announce_dir) announcement_list_dir = os.path.join(base_announce_dir, "announcement_list.html") @@ -143,7 +143,7 @@ def download_course_announcement_pages(api_url, course_view, cookies_path): def download_submission_attachments(course, course_view): - course_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name) + course_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name) # Create directory if not present if not os.path.exists(course_dir): @@ -173,7 +173,7 @@ def download_course_html(api_url, cookies_path): if cookies_path == "": return - course_dir = DL_LOCATION + course_dir = OUTPUT_LOCATION if not os.path.exists(course_dir): os.makedirs(course_dir) @@ -189,7 +189,7 @@ def download_course_home_page_html(api_url, course_view, cookies_path): if cookies_path == "": return - dl_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name) + dl_dir = os.path.join(OUTPUT_LOCATION, 
course_view.term, course_view.name) if not os.path.exists(dl_dir): os.makedirs(dl_dir) @@ -204,18 +204,17 @@ def download_course_module_pages(api_url, course_view, cookies_path): if cookies_path == "" or len(course_view.modules) == 0: return - modules_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name, "modules") + modules_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "modules") if not os.path.exists(modules_dir): os.makedirs(modules_dir) - module_list_dir = os.path.join(modules_dir, "modules_list.html") - # Downloads the modules page (possible this is disabled by the teacher) + module_list_dir = Path(str(modules_dir), "modules_list.html") if not os.path.exists(module_list_dir): download_page(api_url + "/courses/" + str(course_view.course_id) + "/modules/", cookies_path, modules_dir, "modules_list.html") with ThreadPoolExecutor(max_workers=3) as executor: - for module in tqdm(list(course_view.modules), desc='Downloading Module Pages'): + for module in tqdm(list(course_view.modules), desc='Downloading Modules'): bar = tqdm(list(module.items), leave=False, desc=module.name) futures = [executor.submit(download_module_item, module, item, modules_dir, cookies_path) for item in module.items] for _ in as_completed(futures): @@ -227,7 +226,7 @@ def download_course_grades_page(api_url, course_view, cookies_path): if cookies_path == "": return - dl_dir = Path(DL_LOCATION, course_view.term, course_view.name) + dl_dir = Path(OUTPUT_LOCATION, course_view.term, course_view.name) dl_dir.mkdir(parents=True, exist_ok=True) # TODO: command line arg to prohibit overwrite. 
Default should overwrite diff --git a/module/get_canvas.py b/module/get_canvas.py index 22985e8..15c3d7f 100644 --- a/module/get_canvas.py +++ b/module/get_canvas.py @@ -1,4 +1,5 @@ import os +import re from http.cookiejar import MozillaCookieJar import dateutil.parser @@ -6,13 +7,16 @@ import requests from bs4 import BeautifulSoup from tqdm import tqdm -from module.const import DATE_TEMPLATE, DL_LOCATION, MAX_FOLDER_NAME_SIZE +from module.const import DATE_TEMPLATE, OUTPUT_LOCATION, MAX_FOLDER_NAME_SIZE from module.helpers import make_valid_filename, shorten_file_name -from module.items import AssignmentView, AttachmentView, DiscussionView, ModuleItemView, ModuleView, PageView, SubmissionView, TopicEntryView, TopicReplyView +from module.items import AssignmentView, AttachmentView, DiscussionView, CanvasModuleItem, CanvasModule, PageView, SubmissionView, TopicEntryView, TopicReplyView + +MODULE_ITEM_ATTACHED_FILE_RE = re.compile(r'') +CANVAS_API_FILE_ID_RE = re.compile(r'.*?/api/v1/courses/.*?/files/(.*?)$') def find_course_modules(course, course_view): - modules_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name, "modules") + modules_dir = os.path.join(OUTPUT_LOCATION, course_view.term, course_view.name, "modules") # Create modules directory if not present if not os.path.exists(modules_dir): @@ -23,24 +27,22 @@ def find_course_modules(course, course_view): try: modules = list(course.get_modules()) - for module in tqdm(modules, desc='Downloading Module Files'): - module_view = ModuleView() + for module in tqdm(modules, desc='Fetching Modules'): + module_view = CanvasModule() module_view.id = module.id if hasattr(module, "id") else "" module_view.name = str(module.name) if hasattr(module, "name") else "" try: - # Get module items - module_items = module.get_module_items() + # Get items for each module + for item in module.get_module_items(): + module_item = CanvasModuleItem() + module_item.id = item.id if hasattr(item, "id") else 0 + 
module_item.title = str(item.title).replace(' ', ' ') if hasattr(item, "title") else "" + module_item.content_type = str(item.type) if hasattr(item, "type") else "" + module_item.url = str(item.html_url) if hasattr(item, "html_url") else "" + module_item.external_url = str(item.external_url) if hasattr(item, "external_url") else "" - for module_item in module_items: - module_item_view = ModuleItemView() - module_item_view.id = module_item.id if hasattr(module_item, "id") else 0 - module_item_view.title = str(module_item.title).replace(' ', ' ') if hasattr(module_item, "title") else "" - module_item_view.content_type = str(module_item.type) if hasattr(module_item, "type") else "" - module_item_view.url = str(module_item.html_url) if hasattr(module_item, "html_url") else "" - module_item_view.external_url = str(module_item.external_url) if hasattr(module_item, "external_url") else "" - - if module_item_view.content_type == "File": + if module_item.content_type == "File": # If problems arise due to long pathnames, changing module.name to module.id might help # A change would also have to be made in downloadCourseModulePages(api_url, course_view, cookies_path) module_name = make_valid_filename(str(module.name)) @@ -53,7 +55,7 @@ def find_course_modules(course, course_view): os.makedirs(module_dir) # Get the file object - module_file = course.get_file(str(module_item.content_id)) + module_file = course.get_file(str(item.content_id)) # Create path for module file download module_file_path = os.path.join(module_dir, make_valid_filename(str(module_file.display_name))) @@ -62,9 +64,21 @@ def find_course_modules(course, course_view): if not os.path.exists(module_file_path): module_file.download(module_file_path) except Exception as e: - tqdm.write(f"Skipping module file download that gave the following error: {e} - {module_item}") + tqdm.write(f"Skipping module file download that gave the following error: {e} - {item}") - module_view.items.append(module_item_view) + elif 
item.type == 'Page': + page = course.get_page(item.page_url) + if hasattr(page, 'body'): + # Extract the attached files from the item's HTML. + file_matches = re.findall(MODULE_ITEM_ATTACHED_FILE_RE, page.body) + for match in file_matches: + file_id = re.match(CANVAS_API_FILE_ID_RE, match) + if file_id: + # Grab the metadata from the API. + canvas_file = course.get_file(file_id.group(1)) + module_item.attached_files.add(canvas_file) + + module_view.items.append(module_item) except Exception as e: tqdm.write(f"Skipping module file download that gave the following error: {e}") diff --git a/module/items.py b/module/items.py index e608fcf..16f5a84 100644 --- a/module/items.py +++ b/module/items.py @@ -1,20 +1,22 @@ from http.cookiejar import MozillaCookieJar import requests +from canvasapi.file import File from module.helpers import make_valid_filename -class ModuleItemView: +class CanvasModuleItem: def __init__(self): self.id = 0 self.title = "" self.content_type = "" self.url = "" self.external_url = "" + self.attached_files: set[File] = set() -class ModuleView: +class CanvasModule: def __init__(self): self.id = 0 self.name = "" @@ -94,7 +96,7 @@ class AssignmentView: self.updated_url = "" -class CourseView: +class CanvasCourse: def __init__(self, course): self.course_id = course.id if hasattr(course, "id") else 0 self.term = make_valid_filename(course.term["name"] if hasattr(course, "term") and "name" in course.term.keys() else "") diff --git a/module/threading.py b/module/threading.py index ce2c5dd..683eab5 100644 --- a/module/threading.py +++ b/module/threading.py @@ -1,30 +1,36 @@ import os +import traceback from pathlib import Path -from module.singlefile import download_page from module.const import MAX_FOLDER_NAME_SIZE from module.download import download_file from module.get_canvas import get_extra_assignment_files from module.helpers import make_valid_filename, shorten_file_name +from module.items import CanvasModuleItem, CanvasModule +from module.singlefile 
import download_page -def download_module_item(module, item, modules_dir, cookies_path): - # If problems arise due to long pathnames, changing module.name to module.id might help, this can also be done with item.title - # A change would also have to be made in findCourseModules(course, course_view) - module_name = make_valid_filename(str(module.name)) - module_name = shorten_file_name(module_name, len(module_name) - MAX_FOLDER_NAME_SIZE) - items_dir = os.path.join(modules_dir, module_name) +def download_module_item(module: CanvasModule, item: CanvasModuleItem, modules_dir, cookies_path): + try: + module_name = make_valid_filename(str(module.name)) + module_name = shorten_file_name(module_name, len(module_name) - MAX_FOLDER_NAME_SIZE) + output_dir = Path(modules_dir, module_name) + output_dir.mkdir(parents=True, exist_ok=True) - if item.url != "": - if not os.path.exists(items_dir): - os.makedirs(items_dir) + if not item.url: + return - filename = make_valid_filename(str(item.title)) + ".html" - module_item_dir = os.path.join(items_dir, filename) + # Download attached files + for file in item.attached_files: + file.download(output_dir / file.filename) # Download the module page. 
- if not os.path.exists(module_item_dir): - download_page(item.url, cookies_path, items_dir, filename) + html_filename = make_valid_filename(str(item.title)) + ".html" + if not (output_dir / html_filename).exists(): + download_page(item.url, cookies_path, output_dir, html_filename) + except Exception: + # TODO: wrap all threaded funcs in this try/catch + traceback.print_exc() def download_assignment(cookies_path, cookie_jar, base_assign_dir, assignment): diff --git a/module/user_files.py b/module/user_files.py index 4c3ab3a..312ef8c 100644 --- a/module/user_files.py +++ b/module/user_files.py @@ -12,8 +12,7 @@ def do_download(task): task[0].download(task[1]) -def download_user_files(canvas: canvasapi.Canvas, base_path: str): - base_path = Path(base_path) +def download_user_files(canvas: canvasapi.Canvas, base_path: Path): user = canvas.get_current_user() folders = [] for folder in user.get_folders():