canvas-student-data-export/stuff/get_canvas.py

import os
from http.cookiejar import MozillaCookieJar

import dateutil.parser
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from stuff.const import DATE_TEMPLATE, DL_LOCATION, MAX_FOLDER_NAME_SIZE
from stuff.helpers import make_valid_filename, shorten_file_name
from stuff.items import AssignmentView, AttachmentView, DiscussionView, ModuleItemView, ModuleView, PageView, SubmissionView, TopicEntryView, TopicReplyView


def find_course_modules(course, course_view):
    modules_dir = os.path.join(DL_LOCATION, course_view.term, course_view.name, "modules")

    # Create modules directory if not present
    if not os.path.exists(modules_dir):
        os.makedirs(modules_dir)

    module_views = []

    try:
        modules = list(course.get_modules())

        for module in tqdm(modules, desc='Fetching Modules and Downloading Files'):
            module_view = ModuleView()
            module_view.id = module.id if hasattr(module, "id") else ""
            module_view.name = str(module.name) if hasattr(module, "name") else ""

            try:
                # Get module items
                module_items = module.get_module_items()

                for module_item in module_items:
                    module_item_view = ModuleItemView()
                    module_item_view.id = module_item.id if hasattr(module_item, "id") else 0
                    module_item_view.title = str(module_item.title).replace('  ', ' ') if hasattr(module_item, "title") else ""
                    module_item_view.content_type = str(module_item.type) if hasattr(module_item, "type") else ""
                    module_item_view.url = str(module_item.html_url) if hasattr(module_item, "html_url") else ""
                    module_item_view.external_url = str(module_item.external_url) if hasattr(module_item, "external_url") else ""

                    if module_item_view.content_type == "File":
                        # If problems arise due to long pathnames, changing module.name to module.id might help
                        # A change would also have to be made in downloadCourseModulePages(api_url, course_view, cookies_path)
                        module_name = make_valid_filename(str(module.name))
                        module_name = shorten_file_name(module_name, len(module_name) - MAX_FOLDER_NAME_SIZE)
                        module_dir = os.path.join(modules_dir, module_name, "files")

                        try:
                            # Create directory for current module if not present
                            if not os.path.exists(module_dir):
                                os.makedirs(module_dir)

                            # Get the file object
                            module_file = course.get_file(str(module_item.content_id))

                            # Create path for module file download
                            module_file_path = os.path.join(module_dir, make_valid_filename(str(module_file.display_name)))

                            # Download file if it doesn't already exist
                            if not os.path.exists(module_file_path):
                                module_file.download(module_file_path)
                        except Exception as e:
                            tqdm.write(f"Skipping module file download that gave the following error: {e}")

                    module_view.items.append(module_item_view)
            except Exception as e:
                tqdm.write(f"Skipping module file download that gave the following error: {e}")

            module_views.append(module_view)

    except Exception as e:
        print("Skipping entire module that gave the following error:")
        print(e)

    return module_views


def get_extra_assignment_files(html, cookie_jar: MozillaCookieJar):
    soup = BeautifulSoup(html, 'html.parser')
    urls = [a['data-api-endpoint'] for a in soup.find_all('a', {'data-api-returntype': 'File'})]

    s = requests.Session()
    for cookie in cookie_jar:
        s.cookies.set(cookie.name, cookie.value)

    extra_files = []
    for item in urls:
        r = s.get(item)
        if r.status_code == 404:
            continue
        r.raise_for_status()
        j = r.json()
        extra_files.append((j['display_name'], j['url']))

    return extra_files


def get_course_page_urls(course):
    page_urls = []
    try:
        pages = list(course.get_pages())
        for page in pages:
            if hasattr(page, "url"):
                page_urls.append(str(page.url))
    except Exception as e:
        if e.message != "Not Found":
            print(f"Skipping page: {e}")
    return page_urls


def find_course_pages(course):
    page_views = []
    try:
        page_urls = get_course_page_urls(course)
        if not len(page_urls):
            return

        for url in tqdm(page_urls, desc='Fetching Pages'):
            page = course.get_page(url)
            page_view = PageView()
            page_view.id = page.id if hasattr(page, "id") else 0
            page_view.title = str(page.title).replace('  ', ' ') if hasattr(page, "title") else ""
            page_view.body = str(page.body) if hasattr(page, "body") else ""

            if hasattr(page, "created_at"):
                page_view.created_date = dateutil.parser.parse(page.created_at).strftime(DATE_TEMPLATE)
            else:
                page_view.created_date = ''

            if hasattr(page, "updated_at"):
                page_view.last_updated_date = dateutil.parser.parse(page.updated_at).strftime(DATE_TEMPLATE)
            else:
                page_view.last_updated_date = ''

            page_views.append(page_view)
    except Exception as e:
        print("Skipping page download that gave the following error:")
        print(e)
    return page_views


def find_course_assignments(course, user_id):
    assignment_views = []

    # Get all assignments
    assignments = list(course.get_assignments())

    for assignment in tqdm(assignments, desc='Fetching Assignments'):
        assignment_view = AssignmentView()
        assignment_view.id = assignment.id if hasattr(assignment, "id") else ""
        assignment_view.title = make_valid_filename(str(assignment.name).replace('  ', ' ')) if hasattr(assignment, "name") else ""
        assignment_view.description = str(assignment.description) if hasattr(assignment, "description") else ""
        assignment_view.assigned_date = assignment.created_at_date.strftime(DATE_TEMPLATE) if hasattr(assignment, "created_at_date") else ""
        assignment_view.due_date = assignment.due_at_date.strftime(DATE_TEMPLATE) if hasattr(assignment, "due_at_date") else ""
        assignment_view.html_url = assignment.html_url if hasattr(assignment, "html_url") else ""
        assignment_view.ext_url = str(assignment.url) if hasattr(assignment, "url") else ""
        assignment_view.updated_url = str(assignment.submissions_download_url).split("submissions?")[0] if hasattr(assignment, "submissions_download_url") else ""

        # Download submission for this user only
        submissions = [assignment.get_submission(user_id)]
        if not len(submissions):
            raise IndexError(f'No submissions found for assignment: {vars(assignment)}')

        try:
            for submission in submissions:
                sub_view = SubmissionView()
                sub_view.id = submission.id if hasattr(submission, "id") else 0
                sub_view.grade = str(submission.grade) if hasattr(submission, "grade") else ""
                sub_view.raw_score = str(submission.score) if hasattr(submission, "score") else ""
                sub_view.total_possible_points = str(assignment.points_possible) if hasattr(assignment, "points_possible") else ""
                sub_view.submission_comments = str(submission.submission_comments) if hasattr(submission, "submission_comments") else ""
                sub_view.attempt = submission.attempt if hasattr(submission, "attempt") and submission.attempt is not None else 0
                sub_view.user_id = str(submission.user_id) if hasattr(submission, "user_id") else ""
                sub_view.preview_url = str(submission.preview_url) if hasattr(submission, "preview_url") else ""
                sub_view.ext_url = str(submission.url) if hasattr(submission, "url") else ""

                try:
                    submission.attachments
                except AttributeError:
                    print('No attachments')
                else:
                    for attachment in submission.attachments:
                        attach_view = AttachmentView()
                        attach_view.url = attachment.url
                        attach_view.id = attachment.id
                        attach_view.filename = attachment.filename
                        sub_view.attachments.append(attach_view)
                assignment_view.submissions.append(sub_view)
        except Exception as e:
            raise
            # print("Skipping submission that gave the following error:")
            # print(e)

        assignment_views.append(assignment_view)

    return assignment_views


def find_course_announcements(course):
    announcement_views = []

    # try:
    announcements = list(course.get_discussion_topics(only_announcements=True))

    for announcement in tqdm(announcements, desc='Fetching Announcements'):
        discussion_view = get_discussion_view(announcement)

        announcement_views.append(discussion_view)
    # except Exception as e:
    #     print("Skipping announcement that gave the following error:")
    #     print(e)

    return announcement_views


def get_discussion_view(discussion_topic):
    # Create discussion view
    discussion_view = DiscussionView()
    discussion_view.id = discussion_topic.id if hasattr(discussion_topic, "id") else 0
    discussion_view.title = str(discussion_topic.title).replace('  ', ' ') if hasattr(discussion_topic, "title") else ""
    discussion_view.author = str(discussion_topic.user_name) if hasattr(discussion_topic, "user_name") else ""
    discussion_view.posted_date = discussion_topic.created_at_date.strftime("%B %d, %Y %I:%M %p") if hasattr(discussion_topic, "created_at_date") else ""
    discussion_view.body = str(discussion_topic.message) if hasattr(discussion_topic, "message") else ""
    discussion_view.url = str(discussion_topic.html_url) if hasattr(discussion_topic, "html_url") else ""

    # Keeps track of how many topic_entries there are.
    topic_entries_counter = 0

    # Topic entries
    if hasattr(discussion_topic, "discussion_subentry_count") and discussion_topic.discussion_subentry_count > 0:
        # Need to get replies to entries recursively?
        discussion_topic_entries = discussion_topic.get_topic_entries()
        try:
            for topic_entry in discussion_topic_entries:
                topic_entries_counter += 1

                # Create new discussion view for the topic_entry
                topic_entry_view = TopicEntryView()
                topic_entry_view.id = topic_entry.id if hasattr(topic_entry, "id") else 0
                topic_entry_view.author = str(topic_entry.user_name) if hasattr(topic_entry, "user_name") else ""
                topic_entry_view.posted_date = topic_entry.created_at_date.strftime("%B %d, %Y %I:%M %p") if hasattr(topic_entry, "created_at_date") else ""
                topic_entry_view.body = str(topic_entry.message) if hasattr(topic_entry, "message") else ""

                # Get this topic's replies
                topic_entry_replies = topic_entry.get_replies()

                try:
                    for topic_reply in topic_entry_replies:
                        # Create new topic reply view
                        topic_reply_view = TopicReplyView()
                        topic_reply_view.id = topic_reply.id if hasattr(topic_reply, "id") else 0
                        topic_reply_view.author = str(topic_reply.user_name) if hasattr(topic_reply, "user_name") else ""
                        topic_reply_view.posted_date = topic_reply.created_at_date.strftime("%B %d, %Y %I:%M %p") if hasattr(topic_reply, "created_at_date") else ""
                        topic_reply_view.message = str(topic_reply.message) if hasattr(topic_reply, "message") else ""
                        topic_entry_view.topic_replies.append(topic_reply_view)
                except Exception as e:
                    print("Tried to enumerate discussion topic entry replies but received the following error:")
                    print(e)

                discussion_view.topic_entries.append(topic_entry_view)
        except Exception as e:
            print("Tried to enumerate discussion topic entries but received the following error:")
            print(e)

    # Amount of pages.
    # Typically 50 topic entries are stored on a page before it creates another page.
    discussion_view.amount_pages = int(topic_entries_counter / 50) + 1

    return discussion_view


def find_course_discussions(course):
    discussion_views = []

    # try:
    discussion_topics = list(course.get_discussion_topics())

    for discussion_topic in tqdm(discussion_topics, desc='Fetching Discussions'):
        discussion_view = get_discussion_view(discussion_topic)
        discussion_views.append(discussion_view)
    # except Exception as e:
    #     print("Skipping discussion that gave the following error:")
    #     print(e)

    return discussion_views