# canvas-student-data-export/export.py

import argparse
import json
import os
from http.cookiejar import MozillaCookieJar
from pathlib import Path

import canvasapi
import jsonpickle
import requests
import yaml
from canvasapi import Canvas

from module.const import COURSES_TO_SKIP, OUTPUT_LOCATION
from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_module_pages, download_submission_attachments, download_course_grades_page, download_course_home_page_html, download_course_html
from module.get_canvas import find_course_announcements, find_course_assignments, find_course_discussions, find_course_modules, find_course_pages
from module.items import CanvasCourse
from module.user_files import download_user_files

# Absolute path of the directory containing this script; used to locate credentials.yaml.
SCRIPT_PATH = os.path.abspath(os.path.dirname(__file__))


def export_all_course_data(c):
    """Write one course's metadata to ``<OUTPUT_LOCATION>/<term>/<name>/<name>.json``.

    :param c: course view object. Its ``term`` and ``name`` attributes select the
        output directory and file name; the whole object is serialized.
    """
    # jsonpickle (unpicklable=False) flattens the object to plain JSON; the
    # json round-trip only re-emits that JSON with indentation.
    json_data = json.dumps(json.loads(jsonpickle.encode(c, unpicklable=False)), indent=4)
    course_output_dir = os.path.join(OUTPUT_LOCATION, c.term, c.name)
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(course_output_dir, exist_ok=True)
    course_output_path = os.path.join(course_output_dir, c.name + ".json")
    # Explicit encoding so the output does not depend on the platform default.
    with open(course_output_path, "w", encoding="utf-8") as file:
        file.write(json_data)


def resolve_user_path(raw_path):
    """Return ``raw_path`` as an absolute, normalized ``Path`` with ``~`` expanded.

    ``expanduser()`` must run before ``resolve()``: resolving first anchors a
    leading ``~`` to the current working directory (``<cwd>/~/...``), after which
    ``expanduser()`` has nothing left to expand.

    :param raw_path: path string (or path-like) as given by the user.
    :return: absolute ``pathlib.Path``.
    """
    return Path(raw_path).expanduser().resolve()


if __name__ == "__main__":
    # Script entry point: parse arguments, validate credentials/cookies, then
    # export every accessible course plus a combined all_output.json.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--output', default='./output', help='Output location. If it does not exist, it will be created.')
    parser.add_argument('--term', default=None, help='Only download this term.')
    parser.add_argument('--user-files', action='store_true', help="Download the user files.")
    args = parser.parse_args()

    # NOTE(review): this rebinds OUTPUT_LOCATION in this module only. Other modules
    # that import it from module.const presumably still see the original value - confirm.
    OUTPUT_LOCATION = resolve_user_path(args.output)
    OUTPUT_LOCATION.mkdir(parents=True, exist_ok=True)

    # Startup checks.
    creds_file = Path(SCRIPT_PATH, 'credentials.yaml')
    if not creds_file.is_file():
        print('The credentials.yaml file does not exist:', creds_file)
        raise SystemExit(1)

    # Open the same file that was just checked (next to the script), not whatever
    # "credentials.yaml" happens to be in the current working directory.
    # NOTE(review): yaml.full_load can construct more than plain data types;
    # yaml.safe_load is sufficient if this file only holds simple key/value pairs.
    with open(creds_file, 'r') as f:
        credentials = yaml.full_load(f)

    API_URL = credentials["API_URL"]
    API_KEY = credentials["API_KEY"]
    USER_ID = credentials["USER_ID"]
    COOKIES_PATH = str(resolve_user_path(credentials["COOKIES_PATH"]))

    if not Path(COOKIES_PATH).is_file():
        print('The cookies file does not exist:', COOKIES_PATH)
        raise SystemExit(1)

    COOKIE_JAR = MozillaCookieJar(COOKIES_PATH)
    COOKIE_JAR.load(ignore_discard=True, ignore_expires=True)

    # ==================================================================================================================
    # Initialization

    # The output directory was already created right after argument parsing.
    print("Welcome to the Canvas Student Data Export Tool")

    # COOKIES_PATH is always a non-empty string at this point, so the else branch
    # below is not reached; it is kept for when cookies become optional.
    if COOKIES_PATH:
        # Test the cookies.
        print("Authenticating with Canvas frontend...")

        # Requests takes a dict, not the MozillaCookieJar object.
        request_cookies = {c.name: c.value for c in COOKIE_JAR}

        r = requests.get(f'{API_URL}/profile', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'}, cookies=request_cookies)
        if r.status_code != 200:
            print('Failed to fetch Canvas profile: got status code', r.status_code)
            raise SystemExit(1)
        if not r.url.startswith(API_URL):
            print('Failed to fetch Canvas profile: client was redirected away from Canvas:')
            print(r.url)
            raise SystemExit(1)
        if 'profileContent__Block' not in r.text:
            # TODO: add an arg to skip this check.
            print('Failed to test Canvas profile: could not find an element with the class "profileContent__Block". This could mean that your authentication is incorrect.')
            raise SystemExit(1)

        # TODO: log debug status success here
    else:
        print('No cookies file specified! No HTML pages will be saved.')

    print("Authenticating with Canvas API...")
    canvas = Canvas(API_URL, API_KEY)
    courses = canvas.get_courses(include="term")
    try:
        # Materializing the course list is what surfaces an invalid token here.
        course_count = len(list(courses))
    except canvasapi.exceptions.InvalidAccessToken as e:
        try:
            msg = e.message[0]['message']
        except Exception:
            # Something went very wrong: the error payload did not have the expected shape.
            msg = ''
        print('Failed to fetch courses from the Canvas API:', msg)
        raise SystemExit(1)

    print('')

    skip = set(COURSES_TO_SKIP)

    # ==================================================================================================================
    # Exporting

    print("Downloading courses page...")
    download_course_html(API_URL, COOKIES_PATH)

    # --user-files is an opt-in flag: only download when it was passed.
    if args.user_files:
        print('Downloading user files...')
        download_user_files(canvas, OUTPUT_LOCATION / 'User Files')

    print('')

    all_courses_views = []

    for course in courses:
        # Skip excluded courses and courses missing the attributes needed to build a view.
        if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term"):
            continue

        course_view = CanvasCourse(course)

        if args.term and args.term != course_view.term:
            print('Skipping term:', course_view.term, '\n')
            continue

        print(f"=== {course_view.term}: {course_view.name} ===")

        valid, r = course_view.test_course(API_URL, COOKIE_JAR)
        if not valid:
            print(f'Invalid course: {course_view.course_id} - {r} - {r.text}')
            if r.status_code == 401:
                # We can't recover from this error.
                raise SystemExit(1)
            continue

        course_view.assignments = find_course_assignments(course, USER_ID)
        course_view.announcements = find_course_announcements(course)
        course_view.discussions = find_course_discussions(course)
        course_view.pages = find_course_pages(course)
        course_view.modules = find_course_modules(course, course_view)

        all_courses_views.append(course_view)

        print('Downloading course home page...')
        download_course_home_page_html(API_URL, course_view, COOKIES_PATH)

        print('Downloading grades...')
        download_course_grades_page(API_URL, course_view, COOKIES_PATH)

        download_assignment_pages(API_URL, course_view, COOKIES_PATH, COOKIE_JAR)

        download_course_module_pages(API_URL, course_view, COOKIES_PATH)

        download_course_announcement_pages(API_URL, course_view, COOKIES_PATH)

        download_course_discussion_pages(API_URL, course_view, COOKIES_PATH)

        download_course_files(course, course_view)

        download_submission_attachments(course, course_view)

        print("Exporting course metadata...")
        export_all_course_data(course_view)

        # Blank separator line between courses when more than one was fetched.
        if course_count > 1:
            print('')

    # Remove elements from the course objects that can't be JSON serialized, then format it.
    json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4)

    all_output_path = os.path.join(OUTPUT_LOCATION, "all_output.json")
    with open(all_output_path, "w") as out_file:
        out_file.write(json_str)

    print("\nProcess complete. All canvas data exported!")