diff --git a/README.md b/README.md index 3ace696..5042297 100644 --- a/README.md +++ b/README.md @@ -24,10 +24,20 @@ The tool exports all of the following data for each course: - Pages - Files - Modules -- Single file webpage of the Canvas page for assignments, announcements, discussions, and modules +- Single file webpage of the Canvas page for grades, assignments, announcements, discussions, and modules Additionally, all your files stored on Canvas (such as historic submissions and attachments) will be downloaded. +**TO DO LIST** +- [x] Export grades. +- [x] Detect when the cookies and API token are not valid. +- [ ] Use argparse. +- [ ] Add an argument to prohibit overwriting existing files. +- [ ] Have the path to the Chrome binary be specified by a required argument. +- [ ] Use logging. +- [ ] Refactor `download_canvas.py`. +- [ ] Refactor `export.py`. + ## Install ```shell diff --git a/export.py b/export.py index df296d8..a0e96aa 100644 --- a/export.py +++ b/export.py @@ -3,24 +3,19 @@ import os from http.cookiejar import MozillaCookieJar from pathlib import Path +import canvasapi import jsonpickle +import requests import yaml from canvasapi import Canvas from module.const import COURSES_TO_SKIP, DL_LOCATION -from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_home_page_html, download_course_html, download_course_module_pages, download_submission_attachments +from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_html, download_course_module_pages, download_submission_attachments, download_course_grades_page, download_course_home_page_html from module.get_canvas import find_course_announcements, find_course_assignments, find_course_discussions, find_course_modules, find_course_pages from module.items import CourseView from 
module.user_files import download_user_files -with open("credentials.yaml", 'r') as f: - credentials = yaml.full_load(f) -API_URL = credentials["API_URL"] -API_KEY = credentials["API_KEY"] -USER_ID = credentials["USER_ID"] -COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).resolve().expanduser().absolute()) -COOKIE_JAR = MozillaCookieJar(COOKIES_PATH) -COOKIE_JAR.load(ignore_discard=True, ignore_expires=True) +SCRIPT_PATH = os.path.abspath(os.path.dirname(__file__)) def export_all_course_data(c): @@ -34,32 +29,93 @@ def export_all_course_data(c): if __name__ == "__main__": + # Startup checks: credentials.yaml is looked up next to this script, and the cookies file must exist. + creds_file = Path(SCRIPT_PATH, 'credentials.yaml') + if not creds_file.is_file(): + print('The credentials.yaml file does not exist:', creds_file) + quit(1) + + with open(creds_file, 'r') as f: + credentials = yaml.full_load(f) + + API_URL = credentials["API_URL"] + API_KEY = credentials["API_KEY"] + USER_ID = credentials["USER_ID"] + COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).expanduser().resolve()) + + if not Path(COOKIES_PATH).is_file(): + print('The cookies file does not exist:', COOKIES_PATH) + quit(1) + + COOKIE_JAR = MozillaCookieJar(COOKIES_PATH) + COOKIE_JAR.load(ignore_discard=True, ignore_expires=True) + + # ================================================================================================================== + # Initialization + print("Welcome to the Canvas Student Data Export Tool") - print("Creating output directory:", DL_LOCATION) if not os.path.exists(DL_LOCATION): + print("Creating output directory:", DL_LOCATION) os.makedirs(DL_LOCATION) - print("Connecting to Canvas...") - canvas = Canvas(API_URL, API_KEY) + if COOKIES_PATH: + print("Authenticating with Canvas frontend...") - print('\nDownloading user files...') + # Test the cookies. + cookies = MozillaCookieJar(COOKIES_PATH) + cookies.load(ignore_discard=True, ignore_expires=True) + + # Flatten the jar into a plain name -> value dict to pass to requests. 
+ request_cookies = {} + for cookie in cookies: + request_cookies[cookie.name] = cookie.value + + r = requests.get(f'{API_URL}/profile', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}, cookies=request_cookies) + if r.status_code != 200: + print('Failed to fetch Canvas profile: got status code', r.status_code) + quit(1) + if not r.url.startswith(API_URL): + print('Failed to fetch Canvas profile: client was redirected away from Canvas:') + print(r.url) + quit(1) + if 'profileContent__Block' not in r.text: + print('Failed to test Canvas profile: could not find an element with the class "profileContent__Block". This could mean that your authentication is incorrect.') + quit(1) + + # TODO: log debug status success here + else: + print('No cookies file specified! No HTML pages will be saved.') + + print("Authenticating with Canvas API...") + canvas = Canvas(API_URL, API_KEY) + courses = canvas.get_courses(include="term") + try: + course_count = len(list(courses)) + except canvasapi.exceptions.InvalidAccessToken as e: + try: + msg = e.message[0]['message'] + except (AttributeError, IndexError, KeyError, TypeError): + # The error payload did not have the expected shape; report an empty message. 
+ msg = '' + print('Failed to fetch courses from the Canvas API:', msg) + quit(1) + + print('') + + skip = set(COURSES_TO_SKIP) + + # ================================================================================================================== + # Exporting + + print("Downloading courses page...") + download_course_html(API_URL, COOKIES_PATH) + + print('Downloading user files...') download_user_files(canvas, DL_LOCATION / 'User Files') print('') all_courses_views = [] - print("Getting list of all courses...") - courses = canvas.get_courses(include="term") - course_count = len(list(courses)) - - skip = set(COURSES_TO_SKIP) - - if COOKIES_PATH: - print("Fetching Courses...") - download_course_html(API_URL, COOKIES_PATH) - - print('') - for course in courses: if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term"): continue @@ -69,12 +125,10 @@ if __name__ == "__main__": valid, r = course_view.test_course(API_URL, COOKIE_JAR) if not valid: - print(f'Invalid course: {course_view.course_id} - {r}') - + print(f'Invalid course: {course_view.course_id} - {r} - {r.text}') if r.status_code == 401: - print('Got a bad status code:', r.status_code) + # We can't recover from this error. 
quit(1) - continue course_view.assignments = find_course_assignments(course, USER_ID) @@ -84,13 +138,12 @@ if __name__ == "__main__": course_view.modules = find_course_modules(course, course_view) all_courses_views.append(course_view) - download_course_files(course, course_view) - - download_submission_attachments(course, course_view) - print('Downloading course home page...') download_course_home_page_html(API_URL, course_view, COOKIES_PATH) + print('Downloading grades...') + download_course_grades_page(API_URL, course_view, COOKIES_PATH) + download_assignment_pages(API_URL, course_view, COOKIES_PATH, COOKIE_JAR) download_course_module_pages(API_URL, course_view, COOKIES_PATH) @@ -99,13 +152,17 @@ if __name__ == "__main__": download_course_discussion_pages(API_URL, course_view, COOKIES_PATH) - print("Exporting all course data...") + download_course_files(course, course_view) + + download_submission_attachments(course, course_view) + + print("Exporting course metadata...") export_all_course_data(course_view) if course_count > 1: print('') - # Remove elemnts from the course objects that can't be JSON serialized, then format it. + # Remove elements from the course objects that can't be JSON serialized, then format it. 
json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4) all_output_path = os.path.join(DL_LOCATION, "all_output.json") diff --git a/module/download_canvas.py b/module/download_canvas.py index 5a00c8b..8a127cd 100644 --- a/module/download_canvas.py +++ b/module/download_canvas.py @@ -2,6 +2,7 @@ import os from concurrent.futures import ThreadPoolExecutor, as_completed from functools import partial from http.cookiejar import MozillaCookieJar +from pathlib import Path import canvasapi import requests @@ -220,3 +221,17 @@ def download_course_module_pages(api_url, course_view, cookies_path): for _ in as_completed(futures): bar.update() bar.close() + + +def download_course_grades_page(api_url, course_view, cookies_path): + """Save the course's Canvas grades page as grades.html under DL_LOCATION/<term>/<course name>; a no-op when cookies_path is empty or the file already exists.""" + if not cookies_path: + return + + dl_dir = Path(DL_LOCATION, course_view.term, course_view.name) + dl_dir.mkdir(parents=True, exist_ok=True) + + # TODO: command line arg to prohibit overwrite. Default should overwrite + if not (dl_dir / "grades.html").exists(): + api_target = f'{api_url}/courses/{course_view.course_id}/grades' + download_page(api_target, cookies_path, dl_dir, "grades.html") diff --git a/module/singlefile.py b/module/singlefile.py index a1a2f42..dac1e11 100644 --- a/module/singlefile.py +++ b/module/singlefile.py @@ -2,7 +2,9 @@ from pathlib import Path from subprocess import run SINGLEFILE_BINARY_PATH = "./node_modules/single-file/cli/single-file" -CHROME_PATH = "/usr/bin/chromium-browser" + +# TODO: have this be specified by a required arg. +CHROME_PATH = "/usr/bin/google-chrome" def add_quotes(s):