download user files, clean up code

This commit is contained in:
Cyberes 2023-10-27 18:04:07 -06:00
parent 76b2b6604e
commit a27aa18f40
7 changed files with 103 additions and 107 deletions

View File

@ -2,12 +2,19 @@
Forked from https://github.com/davekats/canvas-student-data-export Forked from https://github.com/davekats/canvas-student-data-export
Major rewrite and improvement. Major changes:
- Reorganized the project structure.
- Refactored the code to make it more Pythonic.
- Added progress bars.
- Use threading where possible.
- Save assignment attachments.
- Download all user files (as seen in the file manager at `/files` on your Canvas platform.)
--- ---
The Canvas Student Data Export Tool can export nearly all of a student's data from Instructure Canvas Learning Management System (Canvas LMS). The Canvas Student Data Export Tool can export nearly all of a student's data from Instructure Canvas Learning Management System (Canvas LMS).
This is useful when you are graduating or leaving your college or university, and would like to have a backup of all the data you had in canvas. This is useful when you are graduating or leaving your college or university, and would like to have a backup of all the data you had in canvas. Also, some instructors disable the built-in export tool.
The tool exports all of the following data for each course: The tool exports all of the following data for each course:
@ -28,6 +35,8 @@ pip install -r requirements.txt
npm install npm install
``` ```
Make sure you have Chromium or Chrome installed. Currently, the executable path is hardcoded to `/usr/bin/chromium-browser` in `module/singlefile.py`. If you are not on Linux or do not use Chromium, you will need to change the path.
## Run ## Run
1. Get your Canvas API key by going to Canvas and navigating to `Account` > `Settings` > `Approved Integrations` > `New Access Token` 1. Get your Canvas API key by going to Canvas and navigating to `Account` > `Settings` > `Approved Integrations` > `New Access Token`
@ -52,3 +61,5 @@ Now, run the program:
```shell ```shell
python export.py python export.py
``` ```
The folder `./output` will be created and your data downloaded to this path.

View File

@ -10,6 +10,7 @@ from module.const import COURSES_TO_SKIP, DL_LOCATION
from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_home_page_html, download_course_html, download_course_module_pages, download_submission_attachments from module.download_canvas import download_assignment_pages, download_course_announcement_pages, download_course_discussion_pages, download_course_files, download_course_home_page_html, download_course_html, download_course_module_pages, download_submission_attachments
from module.get_canvas import find_course_announcements, find_course_assignments, find_course_discussions, find_course_modules, find_course_pages from module.get_canvas import find_course_announcements, find_course_assignments, find_course_discussions, find_course_modules, find_course_pages
from module.items import CourseView from module.items import CourseView
from module.user_files import download_user_files
with open("credentials.yaml", 'r') as f: with open("credentials.yaml", 'r') as f:
credentials = yaml.full_load(f) credentials = yaml.full_load(f)
@ -37,13 +38,18 @@ if __name__ == "__main__":
if not os.path.exists(DL_LOCATION): if not os.path.exists(DL_LOCATION):
os.makedirs(DL_LOCATION) os.makedirs(DL_LOCATION)
print("\nConnecting to Canvas...") print("Connecting to Canvas...")
canvas = Canvas(API_URL, API_KEY) canvas = Canvas(API_URL, API_KEY)
print('\nDownloading user files...')
download_user_files(canvas, DL_LOCATION / 'User Files')
print('')
all_courses_views = [] all_courses_views = []
print("Getting list of all courses...") print("Getting list of all courses...")
courses = canvas.get_courses(include="term") courses = canvas.get_courses(include="term")
course_count = len(list(courses))
skip = set(COURSES_TO_SKIP) skip = set(COURSES_TO_SKIP)
@ -59,12 +65,17 @@ if __name__ == "__main__":
course_view = CourseView(course) course_view = CourseView(course)
print(f"=== {course_view.term}: {course_view.name} ===") print(f"=== {course_view.term}: {course_view.name} ===")
valid, r = course_view.test_course(API_URL, COOKIE_JAR)
if not valid:
print(f'Invalid course: {course_view.course_id} - {r}')
continue
course_view.assignments = find_course_assignments(course, USER_ID) course_view.assignments = find_course_assignments(course, USER_ID)
course_view.announcements = find_course_announcements(course) course_view.announcements = find_course_announcements(course)
course_view.discussions = find_course_discussions(course) course_view.discussions = find_course_discussions(course)
course_view.pages = find_course_pages(course) course_view.pages = find_course_pages(course)
course_view.modules = find_course_modules(course, course_view) course_view.modules = find_course_modules(course, course_view)
all_courses_views.append(course_view) all_courses_views.append(course_view)
download_course_files(course, course_view) download_course_files(course, course_view)
@ -85,17 +96,13 @@ if __name__ == "__main__":
print("Exporting all course data...") print("Exporting all course data...")
export_all_course_data(course_view) export_all_course_data(course_view)
if len(courses) > 1: if course_count > 1:
print('') print('')
print("Exporting data from all courses combined as all_output.json") # Remove elements from the course objects that can't be JSON serialized, then format it.
# Awful hack to make the JSON pretty. Decode it with Python stdlib json
# module then re-encode with indentation
json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4) json_str = json.dumps(json.loads(jsonpickle.encode(all_courses_views, unpicklable=False)), indent=4)
all_output_path = os.path.join(DL_LOCATION, "all_output.json") all_output_path = os.path.join(DL_LOCATION, "all_output.json")
with open(all_output_path, "w") as out_file: with open(all_output_path, "w") as out_file:
out_file.write(json_str) out_file.write(json_str)

View File

@ -1,6 +1,8 @@
# Directory in which to download course information to (will be created if not from pathlib import Path
# present)
DL_LOCATION = "./output" # Directory in which to download course information to (will be created if not present)
DL_LOCATION = Path("./output").resolve().expanduser().absolute()
# List of Course IDs that should be skipped (need to be integers) # List of Course IDs that should be skipped (need to be integers)
COURSES_TO_SKIP = [288290, 512033] COURSES_TO_SKIP = [288290, 512033]
@ -9,4 +11,4 @@ DATE_TEMPLATE = "%B %d, %Y %I:%M %p"
# Max PATH length is 260 characters on Windows. 70 is just an estimate for a reasonable max folder name to prevent the chance of reaching the limit # Max PATH length is 260 characters on Windows. 70 is just an estimate for a reasonable max folder name to prevent the chance of reaching the limit
# Applies to modules, assignments, announcements, and discussions # Applies to modules, assignments, announcements, and discussions
# If a folder exceeds this limit, a "-" will be added to the end to indicate it was shortened ("..." not valid) # If a folder exceeds this limit, a "-" will be added to the end to indicate it was shortened ("..." not valid)
MAX_FOLDER_NAME_SIZE = 70 MAX_FOLDER_NAME_SIZE = 70

View File

@ -3,12 +3,13 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial from functools import partial
from http.cookiejar import MozillaCookieJar from http.cookiejar import MozillaCookieJar
import canvasapi
import requests import requests
from tqdm import tqdm from tqdm import tqdm
from module.singlefile import download_page
from module.const import DL_LOCATION, MAX_FOLDER_NAME_SIZE from module.const import DL_LOCATION, MAX_FOLDER_NAME_SIZE
from module.helpers import make_valid_filename, make_valid_folder_path, shorten_file_name from module.helpers import make_valid_filename, make_valid_folder_path, shorten_file_name
from module.singlefile import download_page
from module.threading import download_assignment, download_module_item from module.threading import download_assignment, download_module_item
@ -22,8 +23,12 @@ def download_course_files(course, course_view):
try: try:
files = list(course.get_files()) files = list(course.get_files())
except canvasapi.exceptions.Forbidden:
print('Files view disabled for this course.')
return
for file in tqdm(files, desc='Downloading Files'): for file in tqdm(files, desc='Downloading Files'):
try:
file_folder = course.get_folder(file.folder_id) file_folder = course.get_folder(file.folder_id)
folder_dl_dir = os.path.join(dl_dir, make_valid_folder_path(file_folder.full_name)) folder_dl_dir = os.path.join(dl_dir, make_valid_folder_path(file_folder.full_name))
@ -35,10 +40,10 @@ def download_course_files(course, course_view):
# Download file if it doesn't already exist # Download file if it doesn't already exist
if not os.path.exists(dl_path): if not os.path.exists(dl_path):
print('Downloading: {}'.format(dl_path)) # print('Downloading: {}'.format(dl_path))
file.download(dl_path) file.download(dl_path)
except Exception as e: except Exception as e:
tqdm.write(f"Skipping file download that gave the following error: {e}") tqdm.write(f"Skipping {file.display_name} - {e}")
def download_course_discussion_pages(api_url, course_view, cookies_path): def download_course_discussion_pages(api_url, course_view, cookies_path):
@ -55,7 +60,7 @@ def download_course_discussion_pages(api_url, course_view, cookies_path):
if not os.path.exists(discussion_list_dir): if not os.path.exists(discussion_list_dir):
download_page(api_url + "/courses/" + str(course_view.course_id) + "/discussion_topics/", cookies_path, base_discussion_dir, "discussion_list.html") download_page(api_url + "/courses/" + str(course_view.course_id) + "/discussion_topics/", cookies_path, base_discussion_dir, "discussion_list.html")
for discussion in tqdm(list(course_view.discussions), desc='Downloading Discussion Pages'): for discussion in tqdm(list(course_view.discussions), desc='Downloading Discussions'):
discussion_title = make_valid_filename(str(discussion.title)) discussion_title = make_valid_filename(str(discussion.title))
discussion_title = shorten_file_name(discussion_title, len(discussion_title) - MAX_FOLDER_NAME_SIZE) discussion_title = shorten_file_name(discussion_title, len(discussion_title) - MAX_FOLDER_NAME_SIZE)
discussion_dir = os.path.join(base_discussion_dir, discussion_title) discussion_dir = os.path.join(base_discussion_dir, discussion_title)
@ -90,65 +95,6 @@ def download_assignment_pages(api_url, course_view, cookies_path, cookie_jar: Mo
if not os.path.exists(assignment_list_path): if not os.path.exists(assignment_list_path):
download_page(api_url + "/courses/" + str(course_view.course_id) + "/assignments/", cookies_path, base_assign_dir, "assignment_list.html") download_page(api_url + "/courses/" + str(course_view.course_id) + "/assignments/", cookies_path, base_assign_dir, "assignment_list.html")
# for assignment in tqdm(course_view.assignments, desc='Downloading Assignments'):
# assignment_title = make_valid_filename(str(assignment.title))
# assignment_title = shorten_file_name(assignment_title, len(assignment_title) - MAX_FOLDER_NAME_SIZE)
# assign_dir = os.path.join(base_assign_dir, assignment_title)
#
# # Download an html image of each assignment (includes assignment instructions and other stuff).
# # Currently, this will only download the main assignment page and not external pages, this is
# # because these external pages are given in a json format. Saving these would require a lot
# # more work then normal.
# if assignment.html_url != "":
# if not os.path.exists(assign_dir):
# os.makedirs(assign_dir)
#
# assignment_page_path = os.path.join(assign_dir, "assignment.html")
#
# # Download assignment page, this usually has instructions and etc.
# if not os.path.exists(assignment_page_path):
# download_page(assignment.html_url, cookies_path, assign_dir, "assignment.html")
#
# extra_files = get_extra_assignment_files(assignment.description, cookie_jar)
# if extra_files: # in an if statement so that we only show the bar when there's things to do.
# for name, url in tqdm(extra_files, desc='Downloading Additional Files', leave=False):
# download_file(url, Path(assign_dir, name), cookie_jar)
#
# for submission in assignment.submissions:
# submission_dir = assign_dir
#
# # If theres more then 1 submission, add unique id to download dir
# if len(assignment.submissions) != 1:
# submission_dir = os.path.join(assign_dir, str(submission.user_id))
#
# if submission.preview_url != "":
# if not os.path.exists(submission_dir):
# os.makedirs(submission_dir)
#
# submission_page_dir = os.path.join(submission_dir, "submission.html")
#
# # Download submission url, this is typically a more focused page
# if not os.path.exists(submission_page_dir):
# download_page(submission.preview_url, cookies_path, submission_dir, "submission.html")
#
# # If theres more then 1 attempt, save each attempt in attempts folder
# if (submission.attempt != 1 and assignment.updated_url != "" and assignment.html_url != ""
# and assignment.html_url.rstrip("/") != assignment.updated_url.rstrip("/")):
# submission_dir = os.path.join(assign_dir, "attempts")
#
# if not os.path.exists(submission_dir):
# os.makedirs(submission_dir)
#
# # Saves the attempts if multiple were taken, doesn't account for
# # different ID's however, as I wasnt able to find out what the url
# # for the specific id's attempts would be.
# for i in range(submission.attempt):
# filename = "attempt_" + str(i + 1) + ".html"
# submission_page_attempt_dir = os.path.join(submission_dir, filename)
#
# if not os.path.exists(submission_page_attempt_dir):
# download_page(assignment.updated_url + "/history?version=" + str(i + 1), cookies_path, submission_dir, filename)
with ThreadPoolExecutor(max_workers=3) as executor: with ThreadPoolExecutor(max_workers=3) as executor:
download_func = partial(download_assignment, cookies_path, cookie_jar, base_assign_dir) download_func = partial(download_assignment, cookies_path, cookie_jar, base_assign_dir)
list(tqdm(executor.map(download_func, course_view.assignments), total=len(course_view.assignments), desc='Downloading Assignments')) list(tqdm(executor.map(download_func, course_view.assignments), total=len(course_view.assignments), desc='Downloading Assignments'))
@ -267,30 +213,6 @@ def download_course_module_pages(api_url, course_view, cookies_path):
if not os.path.exists(module_list_dir): if not os.path.exists(module_list_dir):
download_page(api_url + "/courses/" + str(course_view.course_id) + "/modules/", cookies_path, modules_dir, "modules_list.html") download_page(api_url + "/courses/" + str(course_view.course_id) + "/modules/", cookies_path, modules_dir, "modules_list.html")
# for module in tqdm(list(course_view.modules), desc='Downloading Module Pages'):
# bar = tqdm(list(module.items), leave=False, desc=module.name)
# for item in module.items:
# # bar.set_postfix({'title': item.title})
#
# # If problems arise due to long pathnames, changing module.name to module.id might help, this can also be done with item.title
# # A change would also have to be made in findCourseModules(course, course_view)
# module_name = make_valid_filename(str(module.name))
# module_name = shorten_file_name(module_name, len(module_name) - MAX_FOLDER_NAME_SIZE)
# items_dir = os.path.join(modules_dir, module_name)
#
# if item.url != "":
# if not os.path.exists(items_dir):
# os.makedirs(items_dir)
#
# filename = make_valid_filename(str(item.title)) + ".html"
# module_item_dir = os.path.join(items_dir, filename)
#
# # Download the module page.
# if not os.path.exists(module_item_dir):
# download_page(item.url, cookies_path, items_dir, filename)
# bar.update()
# bar.close()
with ThreadPoolExecutor(max_workers=3) as executor: with ThreadPoolExecutor(max_workers=3) as executor:
for module in tqdm(list(course_view.modules), desc='Downloading Module Pages'): for module in tqdm(list(course_view.modules), desc='Downloading Module Pages'):
bar = tqdm(list(module.items), leave=False, desc=module.name) bar = tqdm(list(module.items), leave=False, desc=module.name)

View File

@ -23,7 +23,7 @@ def find_course_modules(course, course_view):
try: try:
modules = list(course.get_modules()) modules = list(course.get_modules())
for module in tqdm(modules, desc='Fetching Modules and Downloading Files'): for module in tqdm(modules, desc='Downloading Module Files'):
module_view = ModuleView() module_view = ModuleView()
module_view.id = module.id if hasattr(module, "id") else "" module_view.id = module.id if hasattr(module, "id") else ""
module_view.name = str(module.name) if hasattr(module, "name") else "" module_view.name = str(module.name) if hasattr(module, "name") else ""
@ -62,7 +62,7 @@ def find_course_modules(course, course_view):
if not os.path.exists(module_file_path): if not os.path.exists(module_file_path):
module_file.download(module_file_path) module_file.download(module_file_path)
except Exception as e: except Exception as e:
tqdm.write(f"Skipping module file download that gave the following error: {e}") tqdm.write(f"Skipping module file download that gave the following error: {e} - {module_item}")
module_view.items.append(module_item_view) module_view.items.append(module_item_view)
except Exception as e: except Exception as e:
@ -88,9 +88,8 @@ def get_extra_assignment_files(html, cookie_jar: MozillaCookieJar):
extra_files = [] extra_files = []
for item in urls: for item in urls:
r = s.get(item) r = s.get(item)
if r.status_code == 404: if r.status_code != 200:
continue continue
r.raise_for_status()
j = r.json() j = r.json()
extra_files.append((j['display_name'], j['url'])) extra_files.append((j['display_name'], j['url']))

View File

@ -1,3 +1,7 @@
from http.cookiejar import MozillaCookieJar
import requests
from module.helpers import make_valid_filename from module.helpers import make_valid_filename
@ -104,3 +108,15 @@ class CourseView:
self.announcements = [] self.announcements = []
self.discussions = [] self.discussions = []
self.modules = [] self.modules = []
def test_course(self, base_url: str, cookie_jar: MozillaCookieJar):
    """Check that this course is reachable through the Canvas API.

    Loads the browser cookies into a requests session and hits the
    course's API endpoint.

    :param base_url: Base URL of the Canvas instance (no trailing slash).
    :param cookie_jar: Cookies exported from an authenticated browser session.
    :return: ``(True, response)`` on HTTP 200, otherwise
        ``(False, response_or_exception)`` so the caller can report why.
    """
    # Use the session as a context manager so the underlying connection
    # pool is released even when the request raises.
    with requests.Session() as s:
        for cookie in cookie_jar:
            s.cookies.set(cookie.name, cookie.value)
        try:
            r = s.get(f'{base_url}/api/v1/courses/{self.course_id}')
            if r.status_code != 200:
                return False, r
            return True, r
        except Exception as e:
            # Network-level failures (DNS, timeout, etc.) are reported to
            # the caller rather than crashing the export loop.
            return False, e

39
module/user_files.py Normal file
View File

@ -0,0 +1,39 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
import canvasapi
from tqdm import tqdm
from module.helpers import make_valid_folder_path
def do_download(task):
    """Download one queued user file.

    :param task: A ``(file, destination)`` pair where ``file`` is a Canvas
        file object and ``destination`` is a ``pathlib.Path`` to write to.
    """
    file, destination = task
    # Canvas won't create intermediate directories for us.
    destination.parent.mkdir(parents=True, exist_ok=True)
    file.download(destination)
def download_user_files(canvas: canvasapi.Canvas, base_path: str):
    """Download all of the current user's personal files.

    Mirrors the user's folder hierarchy (as seen in the file manager at
    ``/files`` on the Canvas platform) underneath ``base_path`` and
    downloads the files concurrently.

    :param canvas: Authenticated ``canvasapi.Canvas`` client.
    :param base_path: Directory to download into (created as needed).
    """
    base_path = Path(base_path)
    user = canvas.get_current_user()

    # Canvas reports every personal folder as "my files/...". Remove that
    # prefix so paths are rooted at base_path. NOTE: str.lstrip() strips a
    # *character set*, not a prefix, and would mangle folder names (e.g.
    # "my files/semester1" -> "ter1"), so the prefix is removed explicitly.
    # The root "my files" folder itself reduces to '' and is skipped.
    folders = []
    for folder in user.get_folders():
        name = folder.full_name
        if name.startswith('my files'):
            name = name[len('my files'):]
        name = name.lstrip('/')
        if name:
            folders.append((folder, make_valid_folder_path(name)))

    # Enumerate every file first so the download bar has an accurate total.
    tasks = []
    for folder, folder_name in tqdm(folders, desc='Fetching User Files'):
        for file in folder.get_files():
            tasks.append((file, base_path / folder_name / file.display_name))

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(do_download, task) for task in tasks]
        with tqdm(total=len(tasks), desc='Downloading User Files') as bar:
            for future in as_completed(futures):
                # Surface worker failures instead of silently dropping them,
                # matching the error style used by the course downloaders.
                exc = future.exception()
                if exc is not None:
                    tqdm.write(f'Skipping user file download that gave the following error: {exc}')
                bar.update()