2024-01-25 15:43:35 -07:00
import argparse
2019-08-15 23:38:16 -06:00
import json
import os
2023-10-27 16:24:52 -06:00
from http . cookiejar import MozillaCookieJar
2023-10-27 18:59:58 -06:00
from pathlib import Path
2020-07-08 10:50:20 -06:00
2024-01-11 21:57:36 -07:00
import canvasapi
2020-07-08 10:50:20 -06:00
import jsonpickle
2024-01-11 21:57:36 -07:00
import requests
2020-07-08 14:51:04 -06:00
import yaml
2023-10-27 16:24:52 -06:00
from canvasapi import Canvas
2020-07-08 14:51:04 -06:00
2024-01-25 15:43:35 -07:00
from module . const import COURSES_TO_SKIP , OUTPUT_LOCATION
from module . download_canvas import download_assignment_pages , download_course_announcement_pages , download_course_discussion_pages , download_course_files , download_course_module_pages , download_submission_attachments , download_course_grades_page , download_course_home_page_html , download_course_html
2023-10-27 16:30:38 -06:00
from module . get_canvas import find_course_announcements , find_course_assignments , find_course_discussions , find_course_modules , find_course_pages
2024-01-25 15:43:35 -07:00
from module . items import CanvasCourse
2023-10-27 18:04:07 -06:00
from module . user_files import download_user_files
2020-07-08 14:51:04 -06:00
2024-01-11 21:57:36 -07:00
SCRIPT_PATH = os . path . abspath ( os . path . dirname ( __file__ ) )
2019-08-15 23:38:16 -06:00
2023-10-27 16:30:16 -06:00
def export_all_course_data(c):
    """Write one course's metadata to ``<OUTPUT_LOCATION>/<term>/<name>/<name>.json``.

    Args:
        c: Course view object. Its ``term`` and ``name`` attributes pick the
            output directory and file name; the whole object is serialised
            through ``jsonpickle``.
    """
    # Encode with jsonpickle (unpicklable=False) and round-trip through json
    # so the result can be re-dumped with 4-space indentation.
    json_data = json.dumps(json.loads(jsonpickle.encode(c, unpicklable=False)), indent=4)

    course_output_dir = os.path.join(OUTPUT_LOCATION, c.term, c.name)
    # exist_ok=True replaces the racy "check, then create" sequence.
    os.makedirs(course_output_dir, exist_ok=True)

    course_output_path = os.path.join(course_output_dir, c.name + ".json")
    with open(course_output_path, "w", encoding="utf-8") as file:
        file.write(json_data)
2022-01-22 10:21:05 -07:00
2019-08-15 23:38:16 -06:00
2020-07-08 14:51:04 -06:00
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Command line
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--output', default='./output', help='Output location. If it does not exist, it will be created.')
    parser.add_argument('--term', default=None, help='Only download this term.')
    parser.add_argument('--user-files', action='store_true', help="Download the user files.")
    args = parser.parse_args()

    # expanduser() must run before resolve(): once the path has been made
    # absolute, a leading "~" is no longer at the start and is never expanded.
    # Rebinding the module-level OUTPUT_LOCATION here is what
    # export_all_course_data() reads when it is called below.
    OUTPUT_LOCATION = Path(args.output).expanduser().resolve()
    OUTPUT_LOCATION.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # Startup checks.
    creds_file = Path(SCRIPT_PATH, 'credentials.yaml')
    if not creds_file.is_file():
        print('The credentials.yaml file does not exist:', creds_file)
        raise SystemExit(1)

    # Open the same file that was just checked (next to this script), not a
    # "credentials.yaml" relative to the current working directory.
    # NOTE(review): yaml.full_load is kept as-is; yaml.safe_load would be the
    # stricter choice if the credentials file only holds plain scalars.
    with open(creds_file, 'r') as f:
        credentials = yaml.full_load(f)

    API_URL = credentials["API_URL"]
    API_KEY = credentials["API_KEY"]
    USER_ID = credentials["USER_ID"]
    COOKIES_PATH = str(Path(credentials["COOKIES_PATH"]).expanduser().resolve())

    if not Path(COOKIES_PATH).is_file():
        print('The cookies file does not exist:', COOKIES_PATH)
        raise SystemExit(1)

    COOKIE_JAR = MozillaCookieJar(COOKIES_PATH)
    COOKIE_JAR.load(ignore_discard=True, ignore_expires=True)

    # ==================================================================================================================
    # Initialization

    print("Welcome to the Canvas Student Data Export Tool")

    # NOTE(review): COOKIES_PATH is always a non-empty string at this point
    # (a missing file already aborted above), so the else branch is currently
    # unreachable. It is kept so behaviour stays unchanged.
    if COOKIES_PATH:
        # Test the cookies.
        print("Authenticating with Canvas frontend...")

        # Flatten the cookie jar into a plain name -> value dict for requests.
        request_cookies = {c.name: c.value for c in COOKIE_JAR}
        r = requests.get(
            f'{API_URL}/profile',
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'},
            cookies=request_cookies,
        )
        if r.status_code != 200:
            print('Failed to fetch Canvas profile: got status code', r.status_code)
            raise SystemExit(1)
        if not r.url.startswith(API_URL):
            print('Failed to fetch Canvas profile: client was redirected away from Canvas:')
            print(r.url)
            raise SystemExit(1)
        if 'profileContent__Block' not in r.text:
            # TODO: add an arg to skip this check.
            print('Failed to test Canvas profile: could not find an element with the class "profileContent__Block". This could mean that your authentication is incorrect.')
            raise SystemExit(1)
        # TODO: log debug status success here
    else:
        print('No cookies file specified! No HTML pages will be saved.')

    print("Authenticating with Canvas API...")
    canvas = Canvas(API_URL, API_KEY)
    courses = canvas.get_courses(include="term")
    try:
        # Listing the courses here is the first point at which an invalid
        # token is reported via InvalidAccessToken.
        course_count = len(list(courses))
    except canvasapi.exceptions.InvalidAccessToken as e:
        try:
            msg = e.message[0]['message']
        except Exception:
            # Something went very wrong: the error payload is not shaped as expected.
            msg = ''
        print('Failed to fetch courses from the Canvas API:', msg)
        raise SystemExit(1)

    print('')

    skip = set(COURSES_TO_SKIP)

    # ==================================================================================================================
    # Exporting

    print("Downloading courses page...")
    download_course_html(API_URL, COOKIES_PATH)

    # --user-files is an opt-in (store_true) flag, so only download the user
    # files when it was actually passed.
    if args.user_files:
        print('Downloading user files...')
        download_user_files(canvas, OUTPUT_LOCATION / 'User Files')
        print('')

    all_courses_views = []

    for course in courses:
        # Skip explicitly excluded courses and ones missing a name or term.
        if course.id in skip or not hasattr(course, "name") or not hasattr(course, "term"):
            continue

        course_view = CanvasCourse(course)

        if args.term and args.term != course_view.term:
            print('Skipping term:', course_view.term, '\n')
            continue

        print(f"=== {course_view.term}: {course_view.name} ===")

        valid, r = course_view.test_course(API_URL, COOKIE_JAR)
        if not valid:
            print(f'Invalid course: {course_view.course_id} - {r} - {r.text}')
            if r.status_code == 401:
                # We can't recover from this error.
                raise SystemExit(1)
            continue

        # Collect the course metadata through the API.
        course_view.assignments = find_course_assignments(course, USER_ID)
        course_view.announcements = find_course_announcements(course)
        course_view.discussions = find_course_discussions(course)
        course_view.pages = find_course_pages(course)
        course_view.modules = find_course_modules(course, course_view)

        all_courses_views.append(course_view)

        # Download the HTML pages and files for this course.
        print('Downloading course home page...')
        download_course_home_page_html(API_URL, course_view, COOKIES_PATH)

        print('Downloading grades...')
        download_course_grades_page(API_URL, course_view, COOKIES_PATH)

        download_assignment_pages(API_URL, course_view, COOKIES_PATH, COOKIE_JAR)

        download_course_module_pages(API_URL, course_view, COOKIES_PATH)

        download_course_announcement_pages(API_URL, course_view, COOKIES_PATH)

        download_course_discussion_pages(API_URL, course_view, COOKIES_PATH)

        download_course_files(course, course_view)

        download_submission_attachments(course, course_view)

        print("Exporting course metadata...")
        export_all_course_data(course_view)

        # Blank line between courses when more than one is being processed.
        if course_count > 1:
            print('')

    # Remove elements from the course objects that can't be JSON serialized, then format it.
    json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4)

    all_output_path = os.path.join(OUTPUT_LOCATION, "all_output.json")
    with open(all_output_path, "w", encoding="utf-8") as out_file:
        out_file.write(json_str)

    print("\nProcess complete. All canvas data exported!")