From a27aa18f404aa9cd65c8d034ee3c6a1255da6a50 Mon Sep 17 00:00:00 2001 From: Cyberes Date: Fri, 27 Oct 2023 18:04:07 -0600 Subject: [PATCH] download user files, clean up code --- README.md | 15 +++++- export.py | 23 ++++++--- module/const.py | 10 ++-- module/download_canvas.py | 100 +++++--------------------------------- module/get_canvas.py | 7 ++- module/items.py | 16 ++++++ module/user_files.py | 39 +++++++++++++++ 7 files changed, 103 insertions(+), 107 deletions(-) create mode 100644 module/user_files.py diff --git a/README.md b/README.md index be8515e..61a3868 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,19 @@ Forked from https://github.com/davekats/canvas-student-data-export -Major rewrite and improvement. +Major changes: + +- Reorganized the project structure. +- Refactored the code to make it more Pythonic. +- Added progress bars. +- Use threading where possible. +- Save assignment attachments. +- Download all user files (as seen in the file manager at `/files` on your Canvas platform.) --- The Canvas Student Data Export Tool can export nearly all of a student's data from Instructure Canvas Learning Management System (Canvas LMS). -This is useful when you are graduating or leaving your college or university, and would like to have a backup of all the data you had in canvas. +This is useful when you are graduating or leaving your college or university, and would like to have a backup of all the data you had in canvas. Also, some instructors disable the built-in export tool. The tool exports all of the following data for each course: @@ -28,6 +35,8 @@ pip install -r requirements.txt npm install ``` +Make sure you have Chromium or Chrome installed. Currently, the executable path is hardcoded to `/usr/bin/chromium-browser` in `module/singlefile.py`. If you are not on Linux or do not use Chromium, you will need to change the path. + ## Run 1. 
Get your Canvas API key by going to Canvas and navigating to `Account` > `Settings` > `Approved Integrations` > `New Access Token` @@ -52,3 +61,5 @@ Now, run the program: ```shell python export.py ``` + +The folder `./output` will be created and your data downloaded to this path. diff --git a/export.py b/export.py index 3bd56d1..b96919a 100644 --- a/export.py +++ b/export.py @@ -10,6 +10,7 @@ from module.const import COURSES_TO_SKIP, DL_LOCATION from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_home_page_html, download_course_html, download_course_module_pages, download_submission_attachments from module.get_canvas import find_course_announcements, find_course_assignments, find_course_discussions, find_course_modules, find_course_pages from module.items import CourseView +from module.user_files import download_user_files with open("credentials.yaml", 'r') as f: credentials = yaml.full_load(f) @@ -37,13 +38,18 @@ if __name__ == "__main__": if not os.path.exists(DL_LOCATION): os.makedirs(DL_LOCATION) - print("\nConnecting to Canvas...") + print("Connecting to Canvas...") canvas = Canvas(API_URL, API_KEY) + print('\nDownloading user files...') + download_user_files(canvas, DL_LOCATION / 'User Files') + print('') + all_courses_views = [] print("Getting list of all courses...") courses = canvas.get_courses(include="term") + course_count = len(list(courses)) skip = set(COURSES_TO_SKIP) @@ -59,12 +65,17 @@ if __name__ == "__main__": course_view = CourseView(course) print(f"=== {course_view.term}: {course_view.name} ===") + + valid, r = course_view.test_course(API_URL, COOKIE_JAR) + if not valid: + print(f'Invalid course: {course_view.course_id} - {r}') + continue + course_view.assignments = find_course_assignments(course, USER_ID) course_view.announcements = find_course_announcements(course) course_view.discussions = find_course_discussions(course) 
course_view.pages = find_course_pages(course) course_view.modules = find_course_modules(course, course_view) - all_courses_views.append(course_view) download_course_files(course, course_view) @@ -85,17 +96,13 @@ if __name__ == "__main__": print("Exporting all course data...") export_all_course_data(course_view) - if len(courses) > 1: + if course_count > 1: print('') - print("Exporting data from all courses combined as all_output.json") - - # Awful hack to make the JSON pretty. Decode it with Python stdlib json - # module then re-encode with indentation + # Remove elements from the course objects that can't be JSON serialized, then format it. json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4) all_output_path = os.path.join(DL_LOCATION, "all_output.json") - with open(all_output_path, "w") as out_file: out_file.write(json_str) diff --git a/module/const.py b/module/const.py index 4ddd900..6a3f175 100644 --- a/module/const.py +++ b/module/const.py @@ -1,6 +1,8 @@ -# Directory in which to download course information to (will be created if not -# present) -DL_LOCATION = "./output" +from pathlib import Path + +# Directory in which to download course information to (will be created if not present) +DL_LOCATION = Path("./output").resolve().expanduser().absolute() + # List of Course IDs that should be skipped (need to be integers) COURSES_TO_SKIP = [288290, 512033] @@ -9,4 +11,4 @@ DATE_TEMPLATE = "%B %d, %Y %I:%M %p" # Max PATH length is 260 characters on Windows. 70 is just an estimate for a reasonable max folder name to prevent the chance of reaching the limit # Applies to modules, assignments, announcements, and discussions # If a folder exceeds this limit, a "-" will be added to the end to indicate it was shortened ("..." 
not valid) -MAX_FOLDER_NAME_SIZE = 70 \ No newline at end of file +MAX_FOLDER_NAME_SIZE = 70 diff --git a/module/download_canvas.py b/module/download_canvas.py index 02d70af..b7a3dcd 100644 --- a/module/download_canvas.py +++ b/module/download_canvas.py @@ -3,12 +3,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial from http.cookiejar import MozillaCookieJar +import canvasapi import requests from tqdm import tqdm -from module.singlefile import download_page from module.const import DL_LOCATION, MAX_FOLDER_NAME_SIZE from module.helpers import make_valid_filename, make_valid_folder_path, shorten_file_name +from module.singlefile import download_page from module.threading import download_assignment, download_module_item @@ -22,8 +23,12 @@ def download_course_files(course, course_view): try: files = list(course.get_files()) + except canvasapi.exceptions.Forbidden: + print('Files view disabled for this course.') + return - for file in tqdm(files, desc='Downloading Files'): + for file in tqdm(files, desc='Downloading Files'): + try: file_folder = course.get_folder(file.folder_id) folder_dl_dir = os.path.join(dl_dir, make_valid_folder_path(file_folder.full_name)) @@ -35,10 +40,10 @@ def download_course_files(course, course_view): # Download file if it doesn't already exist if not os.path.exists(dl_path): - print('Downloading: {}'.format(dl_path)) + # print('Downloading: {}'.format(dl_path)) file.download(dl_path) - except Exception as e: - tqdm.write(f"Skipping file download that gave the following error: {e}") + except Exception as e: + tqdm.write(f"Skipping {file.display_name} - {e}") def download_course_discussion_pages(api_url, course_view, cookies_path): @@ -55,7 +60,7 @@ def download_course_discussion_pages(api_url, course_view, cookies_path): if not os.path.exists(discussion_list_dir): download_page(api_url + "/courses/" + str(course_view.course_id) + "/discussion_topics/", cookies_path, base_discussion_dir, 
"discussion_list.html") - for discussion in tqdm(list(course_view.discussions), desc='Downloading Discussion Pages'): + for discussion in tqdm(list(course_view.discussions), desc='Downloading Discussions'): discussion_title = make_valid_filename(str(discussion.title)) discussion_title = shorten_file_name(discussion_title, len(discussion_title) - MAX_FOLDER_NAME_SIZE) discussion_dir = os.path.join(base_discussion_dir, discussion_title) @@ -90,65 +95,6 @@ def download_assignment_pages(api_url, course_view, cookies_path, cookie_jar: Mo if not os.path.exists(assignment_list_path): download_page(api_url + "/courses/" + str(course_view.course_id) + "/assignments/", cookies_path, base_assign_dir, "assignment_list.html") - # for assignment in tqdm(course_view.assignments, desc='Downloading Assignments'): - # assignment_title = make_valid_filename(str(assignment.title)) - # assignment_title = shorten_file_name(assignment_title, len(assignment_title) - MAX_FOLDER_NAME_SIZE) - # assign_dir = os.path.join(base_assign_dir, assignment_title) - # - # # Download an html image of each assignment (includes assignment instructions and other stuff). - # # Currently, this will only download the main assignment page and not external pages, this is - # # because these external pages are given in a json format. Saving these would require a lot - # # more work then normal. - # if assignment.html_url != "": - # if not os.path.exists(assign_dir): - # os.makedirs(assign_dir) - # - # assignment_page_path = os.path.join(assign_dir, "assignment.html") - # - # # Download assignment page, this usually has instructions and etc. - # if not os.path.exists(assignment_page_path): - # download_page(assignment.html_url, cookies_path, assign_dir, "assignment.html") - # - # extra_files = get_extra_assignment_files(assignment.description, cookie_jar) - # if extra_files: # in an if statement so that we only show the bar when there's things to do. 
- # for name, url in tqdm(extra_files, desc='Downloading Additional Files', leave=False): - # download_file(url, Path(assign_dir, name), cookie_jar) - # - # for submission in assignment.submissions: - # submission_dir = assign_dir - # - # # If theres more then 1 submission, add unique id to download dir - # if len(assignment.submissions) != 1: - # submission_dir = os.path.join(assign_dir, str(submission.user_id)) - # - # if submission.preview_url != "": - # if not os.path.exists(submission_dir): - # os.makedirs(submission_dir) - # - # submission_page_dir = os.path.join(submission_dir, "submission.html") - # - # # Download submission url, this is typically a more focused page - # if not os.path.exists(submission_page_dir): - # download_page(submission.preview_url, cookies_path, submission_dir, "submission.html") - # - # # If theres more then 1 attempt, save each attempt in attempts folder - # if (submission.attempt != 1 and assignment.updated_url != "" and assignment.html_url != "" - # and assignment.html_url.rstrip("/") != assignment.updated_url.rstrip("/")): - # submission_dir = os.path.join(assign_dir, "attempts") - # - # if not os.path.exists(submission_dir): - # os.makedirs(submission_dir) - # - # # Saves the attempts if multiple were taken, doesn't account for - # # different ID's however, as I wasnt able to find out what the url - # # for the specific id's attempts would be. 
- # for i in range(submission.attempt): - # filename = "attempt_" + str(i + 1) + ".html" - # submission_page_attempt_dir = os.path.join(submission_dir, filename) - # - # if not os.path.exists(submission_page_attempt_dir): - # download_page(assignment.updated_url + "/history?version=" + str(i + 1), cookies_path, submission_dir, filename) - with ThreadPoolExecutor(max_workers=3) as executor: download_func = partial(download_assignment, cookies_path, cookie_jar, base_assign_dir) list(tqdm(executor.map(download_func, course_view.assignments), total=len(course_view.assignments), desc='Downloading Assignments')) @@ -267,30 +213,6 @@ def download_course_module_pages(api_url, course_view, cookies_path): if not os.path.exists(module_list_dir): download_page(api_url + "/courses/" + str(course_view.course_id) + "/modules/", cookies_path, modules_dir, "modules_list.html") - # for module in tqdm(list(course_view.modules), desc='Downloading Module Pages'): - # bar = tqdm(list(module.items), leave=False, desc=module.name) - # for item in module.items: - # # bar.set_postfix({'title': item.title}) - # - # # If problems arise due to long pathnames, changing module.name to module.id might help, this can also be done with item.title - # # A change would also have to be made in findCourseModules(course, course_view) - # module_name = make_valid_filename(str(module.name)) - # module_name = shorten_file_name(module_name, len(module_name) - MAX_FOLDER_NAME_SIZE) - # items_dir = os.path.join(modules_dir, module_name) - # - # if item.url != "": - # if not os.path.exists(items_dir): - # os.makedirs(items_dir) - # - # filename = make_valid_filename(str(item.title)) + ".html" - # module_item_dir = os.path.join(items_dir, filename) - # - # # Download the module page. 
- # if not os.path.exists(module_item_dir): - # download_page(item.url, cookies_path, items_dir, filename) - # bar.update() - # bar.close() - with ThreadPoolExecutor(max_workers=3) as executor: for module in tqdm(list(course_view.modules), desc='Downloading Module Pages'): bar = tqdm(list(module.items), leave=False, desc=module.name) diff --git a/module/get_canvas.py b/module/get_canvas.py index d1bbfc4..22985e8 100644 --- a/module/get_canvas.py +++ b/module/get_canvas.py @@ -23,7 +23,7 @@ def find_course_modules(course, course_view): try: modules = list(course.get_modules()) - for module in tqdm(modules, desc='Fetching Modules and Downloading Files'): + for module in tqdm(modules, desc='Downloading Module Files'): module_view = ModuleView() module_view.id = module.id if hasattr(module, "id") else "" module_view.name = str(module.name) if hasattr(module, "name") else "" @@ -62,7 +62,7 @@ def find_course_modules(course, course_view): if not os.path.exists(module_file_path): module_file.download(module_file_path) except Exception as e: - tqdm.write(f"Skipping module file download that gave the following error: {e}") + tqdm.write(f"Skipping module file download that gave the following error: {e} - {module_item}") module_view.items.append(module_item_view) except Exception as e: @@ -88,9 +88,8 @@ def get_extra_assignment_files(html, cookie_jar: MozillaCookieJar): extra_files = [] for item in urls: r = s.get(item) - if r.status_code == 404: + if r.status_code != 200: continue - r.raise_for_status() j = r.json() extra_files.append((j['display_name'], j['url'])) diff --git a/module/items.py b/module/items.py index 5dcc06d..e608fcf 100644 --- a/module/items.py +++ b/module/items.py @@ -1,3 +1,7 @@ +from http.cookiejar import MozillaCookieJar + +import requests + from module.helpers import make_valid_filename @@ -104,3 +108,15 @@ class CourseView: self.announcements = [] self.discussions = [] self.modules = [] + + def test_course(self, base_url: str, cookie_jar: 
MozillaCookieJar): + s = requests.Session() + for cookie in cookie_jar: + s.cookies.set(cookie.name, cookie.value) + try: + r = s.get(f'{base_url}/api/v1/courses/{self.course_id}') + if not r.status_code == 200: + return False, r + return True, r + except Exception as e: + return False, e diff --git a/module/user_files.py b/module/user_files.py new file mode 100644 index 0000000..abbd0fc --- /dev/null +++ b/module/user_files.py @@ -0,0 +1,39 @@ +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +import canvasapi +from tqdm import tqdm + +from module.helpers import make_valid_folder_path + + +def do_download(task): + task[1].parent.mkdir(parents=True, exist_ok=True) + task[0].download(task[1]) + + +def download_user_files(canvas: canvasapi.Canvas, base_path: str): + base_path = Path(base_path) + user = canvas.get_current_user() + folders = [] + for folder in user.get_folders(): + n = folder.full_name.lstrip('my files/') + if n: + c_n = make_valid_folder_path(n) + folders.append((folder, c_n)) + + files = [] + for folder, folder_name in tqdm(folders, desc='Fetching User Files'): + for file in folder.get_files(): + out_path = base_path / folder_name / file.display_name + files.append((file, out_path)) + + with ThreadPoolExecutor(max_workers=10) as executor: + bar = tqdm(files, desc='Downloading User Files') + futures = [executor.submit(do_download, task) for task in files] + for future in as_completed(futures): + bar.update() + + # for file, out_path in tqdm(files, desc='Downloading User Files'): + # if not out_path.exists(): + # file.download(out_path)