canvas-student-data-export/module/download_canvas.py

import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from http.cookiejar import MozillaCookieJar

import requests
from tqdm import tqdm

from module.singlefile import download_page
from module.const import DL_LOCATION, MAX_FOLDER_NAME_SIZE
from module.helpers import make_valid_filename, make_valid_folder_path, shorten_file_name
from module.threading import download_assignment, download_module_item


def download_course_files(course, course_view):
    """
    Download every file of a course into ``DL_LOCATION/<term>/<course name>``.

    Each file is placed in a sub-directory derived from the ``full_name`` of the
    folder it belongs to (per the original note, that name starts with
    "course files"). Files that already exist on disk are not downloaded again.

    Errors are reported through ``tqdm.write`` instead of being raised: a failure
    while listing the files skips the whole step, while a failure on a single
    file only skips that file so the remaining files are still processed.

    :param course: object providing ``get_files()`` and ``get_folder(folder_id)``
    :param course_view: object providing the ``term`` and ``name`` used for the path
    :return: None
    """
    dl_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name)

    # Create directory if not present
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)

    try:
        files = list(course.get_files())
    except Exception as e:
        tqdm.write(f"Skipping file download that gave the following error: {e}")
        return

    # folder_id -> sanitized relative folder path, so each folder is looked up
    # once instead of once per file.
    folder_paths = {}

    for course_file in tqdm(files, desc='Downloading Files'):
        # Isolate failures per file: one broken/locked file must not abort the rest.
        try:
            folder_id = course_file.folder_id
            if folder_id not in folder_paths:
                folder = course.get_folder(folder_id)
                folder_paths[folder_id] = make_valid_folder_path(folder.full_name)

            folder_dl_dir = os.path.join(dl_dir, folder_paths[folder_id])
            os.makedirs(folder_dl_dir, exist_ok=True)

            dl_path = os.path.join(folder_dl_dir, make_valid_filename(str(course_file.display_name)))

            # Download file if it doesn't already exist
            if not os.path.exists(dl_path):
                # tqdm.write keeps the progress bar intact, unlike print().
                tqdm.write('Downloading: {}'.format(dl_path))
                course_file.download(dl_path)
        except Exception as e:
            tqdm.write(f"Skipping file download that gave the following error: {e}")


def download_course_discussion_pages(api_url, course_view, cookies_path):
    """
    Save the discussion list page and every page of each discussion as HTML.

    Output goes under ``DL_LOCATION/<term>/<course name>/discussions``. Nothing is
    done when ``cookies_path`` is empty or the course has no discussions, and
    pages that already exist on disk are not downloaded again.

    :param api_url: base URL that ``/courses/<course_id>/discussion_topics/`` is appended to
    :param course_view: object providing ``term``, ``name``, ``course_id`` and ``discussions``
    :param cookies_path: cookies file path handed to ``download_page``; "" disables the step
    :return: None
    """
    if cookies_path == "":
        return
    if len(course_view.discussions) == 0:
        return

    discussions_root = os.path.join(DL_LOCATION, course_view.term, course_view.name, "discussions")
    if not os.path.exists(discussions_root):
        os.makedirs(discussions_root)

    # Discussion index page for the course.
    list_filename = "discussion_list.html"
    if not os.path.exists(os.path.join(discussions_root, list_filename)):
        list_url = api_url + f"/courses/{course_view.course_id}/discussion_topics/"
        download_page(list_url, cookies_path, discussions_root, list_filename)

    for topic in tqdm(list(course_view.discussions), desc='Downloading Discussion Pages'):
        safe_title = make_valid_filename(str(topic.title))
        folder_name = shorten_file_name(safe_title, len(safe_title) - MAX_FOLDER_NAME_SIZE)
        topic_dir = os.path.join(discussions_root, folder_name)

        # A discussion without a URL has nothing to fetch.
        if topic.url == "":
            continue

        if not os.path.exists(topic_dir):
            os.makedirs(topic_dir)

        # One HTML file per page of the discussion, numbered from 1.
        for page_number in range(1, topic.amount_pages + 1):
            page_filename = f"discussion_{page_number}.html"
            if os.path.exists(os.path.join(topic_dir, page_filename)):
                continue
            download_page(topic.url + "/page-" + str(page_number), cookies_path, topic_dir, page_filename)


def download_assignment_pages(api_url, course_view, cookies_path, cookie_jar: MozillaCookieJar):
    """
    Save the assignment list page, then process every assignment on a thread pool.

    Output goes under ``DL_LOCATION/<term>/<course name>/assignments``. Nothing is
    done when ``cookies_path`` is empty or the course has no assignments. The
    per-assignment work is delegated to ``download_assignment``, which is called as
    ``download_assignment(cookies_path, cookie_jar, base_assign_dir, assignment)``
    on up to three worker threads.

    :param api_url: base URL that ``/courses/<course_id>/assignments/`` is appended to
    :param course_view: object providing ``term``, ``name``, ``course_id`` and ``assignments``
    :param cookies_path: cookies file path handed to the downloaders; "" disables the step
    :param cookie_jar: cookie jar forwarded unchanged to ``download_assignment``
    :return: None
    """
    if cookies_path == "":
        return
    if len(course_view.assignments) == 0:
        return

    assignments_root = os.path.join(DL_LOCATION, course_view.term, course_view.name, "assignments")
    if not os.path.exists(assignments_root):
        os.makedirs(assignments_root)

    # Assignment list (there's a chance this might be the course homepage if the
    # course has the assignments page disabled).
    list_filename = "assignment_list.html"
    if not os.path.exists(os.path.join(assignments_root, list_filename)):
        list_url = api_url + f"/courses/{course_view.course_id}/assignments/"
        download_page(list_url, cookies_path, assignments_root, list_filename)

    worker = partial(download_assignment, cookies_path, cookie_jar, assignments_root)
    with ThreadPoolExecutor(max_workers=3) as pool:
        outcomes = pool.map(worker, course_view.assignments)
        progress = tqdm(outcomes, total=len(course_view.assignments), desc='Downloading Assignments')
        # Draining the iterator advances the bar and re-raises any worker exception.
        for _ in progress:
            pass


def download_course_announcement_pages(api_url, course_view, cookies_path):
    """
    Save the announcement list page and every page of each announcement as HTML.

    Output goes under ``DL_LOCATION/<term>/<course name>/announcements``. Nothing is
    done when ``cookies_path`` is empty or the course has no announcements, and
    pages that already exist on disk are not downloaded again.

    :param api_url: base URL that ``/courses/<course_id>/announcements/`` is appended to
    :param course_view: object providing ``term``, ``name``, ``course_id`` and ``announcements``
    :param cookies_path: cookies file path handed to ``download_page``; "" disables the step
    :return: None
    """
    if cookies_path == "":
        return
    if len(course_view.announcements) == 0:
        return

    announcements_root = os.path.join(DL_LOCATION, course_view.term, course_view.name, "announcements")
    if not os.path.exists(announcements_root):
        os.makedirs(announcements_root)

    # Announcement index page for the course.
    list_filename = "announcement_list.html"
    if not os.path.exists(os.path.join(announcements_root, list_filename)):
        list_url = api_url + f"/courses/{course_view.course_id}/announcements/"
        download_page(list_url, cookies_path, announcements_root, list_filename)

    for announcement in tqdm(list(course_view.announcements), desc='Downloading Announcements'):
        safe_title = make_valid_filename(str(announcement.title))
        folder_name = shorten_file_name(safe_title, len(safe_title) - MAX_FOLDER_NAME_SIZE)
        announcement_dir = os.path.join(announcements_root, folder_name)

        # An announcement without a URL has nothing to fetch.
        if announcement.url == "":
            continue

        if not os.path.exists(announcement_dir):
            os.makedirs(announcement_dir)

        # One HTML file per page of the announcement, numbered from 1.
        for page_number in range(1, announcement.amount_pages + 1):
            page_filename = f"announcement_{page_number}.html"
            if os.path.exists(os.path.join(announcement_dir, page_filename)):
                continue
            download_page(announcement.url + "/page-" + str(page_number), cookies_path, announcement_dir, page_filename)


def download_submission_attachments(course, course_view):
    """
    Download the attachments of every submission of every assignment.

    Attachments are written to
    ``DL_LOCATION/<term>/<course name>/assignments/<assignment title>/`` and, when an
    assignment does not have exactly one submission, into a further sub-directory
    named after the submission's ``user_id``. Each file is named
    ``<attachment id>_<attachment filename>``; files already on disk are skipped.

    A request that fails or returns an HTTP error status is reported through
    ``tqdm.write`` and nothing is written for it, so an error page is never saved
    as if it were the attachment and the download is retried on the next run.

    :param course: unused; kept so existing callers keep working
    :param course_view: object providing ``term``, ``name`` and ``assignments``
    :return: None
    """
    course_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name)

    # Create directory if not present
    if not os.path.exists(course_dir):
        os.makedirs(course_dir)

    for assignment in tqdm(list(course_view.assignments), desc='Downloading Submissions'):
        # The folder name depends only on the assignment, so compute it once.
        assignment_title = make_valid_filename(str(assignment.title))
        assignment_title = shorten_file_name(assignment_title, len(assignment_title) - MAX_FOLDER_NAME_SIZE)
        assignment_dir = os.path.join(course_dir, "assignments", assignment_title)

        for submission in assignment.submissions:
            attachment_dir = assignment_dir
            # If there isn't exactly 1 submission, separate them by user id.
            if len(assignment.submissions) != 1:
                attachment_dir = os.path.join(attachment_dir, str(submission.user_id))

            # Only create the directory when there is something to put in it.
            if not os.path.exists(attachment_dir) and submission.attachments:
                os.makedirs(attachment_dir)

            for attachment in submission.attachments:
                filepath = os.path.join(attachment_dir, make_valid_filename(str(attachment.id) + "_" + attachment.filename))
                if os.path.exists(filepath):
                    continue

                try:
                    response = requests.get(attachment.url, allow_redirects=True)
                    # Treat 4xx/5xx as failures instead of saving the error body.
                    response.raise_for_status()
                except requests.RequestException as e:
                    tqdm.write(f"Skipping attachment download that gave the following error: {e}")
                    continue

                with open(filepath, 'wb') as f:
                    f.write(response.content)


def download_course_html(api_url, cookies_path):
    """
    Save the course list page as ``course_list.html`` directly in ``DL_LOCATION``.

    Nothing is done when ``cookies_path`` is empty, and the page is not downloaded
    again if the file already exists.

    :param api_url: base URL that ``/courses/`` is appended to
    :param cookies_path: cookies file path handed to ``download_page``; "" disables the step
    :return: None
    """
    if cookies_path == "":
        return

    target_dir = DL_LOCATION
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    list_filename = "course_list.html"
    if os.path.exists(os.path.join(target_dir, list_filename)):
        return

    # Downloads the course list.
    download_page(api_url + "/courses/", cookies_path, target_dir, list_filename)


def download_course_home_page_html(api_url, course_view, cookies_path):
    """
    Save the course home page as ``homepage.html`` in ``DL_LOCATION/<term>/<course name>``.

    Nothing is done when ``cookies_path`` is empty, and the page is not downloaded
    again if the file already exists.

    :param api_url: base URL that ``/courses/<course_id>`` is appended to
    :param course_view: object providing ``term``, ``name`` and ``course_id``
    :param cookies_path: cookies file path handed to ``download_page``; "" disables the step
    :return: None
    """
    if cookies_path == "":
        return

    course_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name)
    if not os.path.exists(course_dir):
        os.makedirs(course_dir)

    page_filename = "homepage.html"
    if os.path.exists(os.path.join(course_dir, page_filename)):
        return

    # Downloads the course home page.
    home_url = api_url + "/courses/" + str(course_view.course_id)
    download_page(home_url, cookies_path, course_dir, page_filename)


def download_course_module_pages(api_url, course_view, cookies_path):
    """
    Save the modules list page, then download every module item on a thread pool.

    Output goes under ``DL_LOCATION/<term>/<course name>/modules``. Nothing is done
    when ``cookies_path`` is empty or the course has no modules. Each item is handed
    to ``download_module_item(module, item, modules_dir, cookies_path)`` on up to
    three worker threads, one module at a time.

    An exception raised by a worker is reported through ``tqdm.write`` rather than
    being silently dropped; the remaining items are still processed.

    :param api_url: base URL that ``/courses/<course_id>/modules/`` is appended to
    :param course_view: object providing ``term``, ``name``, ``course_id`` and ``modules``
    :param cookies_path: cookies file path handed to the downloaders; "" disables the step
    :return: None
    """
    if cookies_path == "" or len(course_view.modules) == 0:
        return

    modules_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name, "modules")
    if not os.path.exists(modules_dir):
        os.makedirs(modules_dir)

    module_list_path = os.path.join(modules_dir, "modules_list.html")

    # Downloads the modules page (possible this is disabled by the teacher)
    if not os.path.exists(module_list_path):
        download_page(api_url + "/courses/" + str(course_view.course_id) + "/modules/", cookies_path, modules_dir, "modules_list.html")

    with ThreadPoolExecutor(max_workers=3) as executor:
        for module in tqdm(list(course_view.modules), desc='Downloading Module Pages'):
            items = list(module.items)
            # The context manager closes the per-module bar even if something raises.
            with tqdm(total=len(items), leave=False, desc=module.name) as bar:
                futures = [executor.submit(download_module_item, module, item, modules_dir, cookies_path) for item in items]
                for future in as_completed(futures):
                    # A future's exception stays hidden unless it is asked for.
                    error = future.exception()
                    if error is not None:
                        tqdm.write(f"Skipping module item download that gave the following error: {error}")
                    bar.update()