diff --git a/.all-contributorsrc b/.all-contributorsrc index 5778a7a..ab853a6 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -7,6 +7,16 @@ "commitType": "docs", "commitConvention": "angular", "contributors": [ + { + "login": "OleBialas", + "name": "Ole Bialas", + "avatar_url": "https://avatars.githubusercontent.com/u/38684453?v=4", + "profile": "https://github.com/OleBialas", + "contributions": [ + "code", + "maintenance" + ] + }, { "login": "iamzoltan", "name": "Zoltan", diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 16cb667..5ed6393 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -19,15 +19,15 @@ jobs: - name: Checkout uses: actions/checkout@v3 - - name: Setup Python - uses: actions/setup-python@v4 + - name: Setup uv + uses: astral-sh/setup-uv@v5 with: - python-version: 3.9 + python-version: "3.9" - name: Install dependencies run: | - pip install pytest -r requirements.txt - jupyter kernelspec list + uv sync --extra dev + uv run jupyter kernelspec list - name: Execute tests - run: pytest tests/ + run: uv run pytest tests/ src/nmaci/ diff --git a/README.md b/README.md index 5c7aa1a..2a2a125 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,45 @@ # nmaci -[![CI](https://github.com/neuromatch/nmaci/actions/workflows/ci.yaml/badge.svg)](https://github.com/neuromatch/nmaci/actions/workflows/ci.yaml) +[![CI](https://github.com/OleBialas/nmaci/actions/workflows/ci.yaml/badge.svg)](https://github.com/OleBialas/nmaci/actions/workflows/ci.yaml) -[![All Contributors](https://img.shields.io/badge/all_contributors-1-orange.svg?style=flat-square)](#contributors-) +[![All Contributors](https://img.shields.io/badge/all_contributors-2-orange.svg?style=flat-square)](#contributors-) -Automated tools for building and verifying NMA tutorial materials +Automated tools for building and verifying NMA tutorial materials. 
+ +## Installation + +```bash +pip install git+https://github.com/OleBialas/nmaci@main +``` + +## Usage + +``` +nmaci [args] +``` + +| Command | Description | +|---|---| +| `process-notebooks` | Execute notebooks, extract solutions, create student/instructor versions | +| `verify-exercises` | Check exercise cells match solution cells | +| `lint-tutorial` | Run flake8/pyflakes over notebook code cells | +| `generate-readmes` | Auto-generate tutorial `README.md` files | +| `generate-book` | Build Jupyter Book from `materials.yml` | +| `generate-book-dl` | Build Jupyter Book (Deep Learning variant) | +| `generate-book-precourse` | Build Jupyter Book (Precourse variant) | +| `select-notebooks` | Filter which notebooks to process | +| `make-pr-comment` | Generate PR comment with Colab badges and lint report | +| `find-unreferenced` | Identify unused solution images/scripts | +| `extract-links` | Extract video/slide links from notebooks | +| `parse-html` | Check HTML build output for errors | + +## Development + +```bash +git clone https://github.com/OleBialas/nmaci +cd nmaci +uv sync --extra dev +uv run pytest tests/ +``` ## Contributors ✨ @@ -16,6 +52,7 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d Zoltan
Zoltan

💻 ⚠️ 🚧 + Ole Bialas
Ole Bialas

💻 🚧 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9a677a1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,30 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "nmaci" +version = "0.1.0" +requires-python = ">=3.9" +dependencies = [ + "nbformat", + "nbconvert", + "notebook", + "pillow", + "flake8", + "fuzzywuzzy[speedup]", + "pyyaml", + "beautifulsoup4", + "decorator==5.0.9", + "Jinja2==3.0.0", + "jupyter-client", +] + +[project.optional-dependencies] +dev = ["pytest"] + +[project.scripts] +nmaci = "nmaci.cli:main" + +[tool.hatch.build.targets.wheel] +packages = ["src/nmaci"] diff --git a/scripts/extract_links.py b/scripts/extract_links.py index e56a40e..fef60c1 100644 --- a/scripts/extract_links.py +++ b/scripts/extract_links.py @@ -1,138 +1,4 @@ -""" -Neuromatch Academy - -Extract slide and video links from notebooks -""" -import argparse -import ast -import collections -import json -import os -from urllib.request import urlopen, Request -from urllib.error import HTTPError +from nmaci.extract_links import main import sys - -import nbformat - - -def bilibili_url(video_id): - return f"https://www.bilibili.com/video/{video_id}" - - -def youtube_url(video_id): - return f"https://youtube.com/watch?v={video_id}" - - -def osf_url(link_id): - return f"https://osf.io/download/{link_id}" - -def tutorial_order(fname): - fname = os.path.basename(fname) - try: - first, last = fname.split("_") - except ValueError: - return (99, 99, fname) - if first.startswith("Bonus"): - week, day = 9, 9 - else: - try: - week, day = int(first[1]), int(first[3]) - except ValueError: - week, day = 9, 9 - if last.startswith("Intro"): - order = 0 - elif last.startswith("Tutorial"): - order = int(last[8]) - elif last.startswith("Outro"): - order = 10 - elif last.startswith("DaySummary"): - order = 20 - else: - order = 30 - return (week, day, order) - -def main(arglist): - """Process IPython notebooks from a list of 
files.""" - args = parse_args(arglist) - - nb_paths = [arg for arg in args.files - if arg.endswith(".ipynb") and - 'student' not in arg and - 'instructor' not in arg] - if not nb_paths: - print("No notebook files found") - sys.exit(0) - - videos = collections.defaultdict(list) - slides = collections.defaultdict(list) - - for nb_path in sorted(nb_paths, key=tutorial_order): - # Load the notebook structure - with open(nb_path) as f: - nb = nbformat.read(f, nbformat.NO_CONVERT) - - # Extract components of the notebook path - nb_dir, nb_fname = os.path.split(nb_path) - nb_name, _ = os.path.splitext(nb_fname) - - # Loop through the cells and find video and slide ids - for cell in nb.get("cells", []): - for line in cell.get("source", "").split("\n"): - l = line.strip() - if l.startswith("video_ids = "): - rhs = l.split("=")[1].strip() - video_dict = dict(ast.literal_eval(rhs)) - try: - if args.noyoutube: - url = bilibili_url(video_dict["Bilibili"]) - else: - url = youtube_url(video_dict["Youtube"]) - except KeyError: - print(f"Malformed video id in {nb_name}? 
'{rhs}'") - continue - if url not in videos[nb_name]: - videos[nb_name].append(url) - elif l.startswith("link_id = "): - rhs = l.split("=")[1].strip() - url = osf_url(ast.literal_eval(rhs)) - # Slides are sometimes used in multiple notebooks, so we - # just store the filename and the link - if url not in slides: - api_request = f"https://api.osf.io/v2/files/{ast.literal_eval(rhs)}/" - httprequest = Request(api_request, - headers={"Accept": "application/json"}) - try: - with urlopen(httprequest) as response: - data = json.load(response) - filename = data["data"]["attributes"]["name"] - except HTTPError as e: - sys.stderr.write(str(e) + "\n") - sys.stderr.write(f"Skipping slide {url}\n") - continue - if 'DaySummary' in nb_name: - filename = os.path.splitext(filename.replace("_", ""))[0] + '_DaySummary.pdf' - slides[url] = filename - - print(json.dumps({"videos": videos, "slides": slides}, indent=4)) - - -def parse_args(arglist): - """Handle the command-line arguments.""" - parser = argparse.ArgumentParser( - description="Process neuromatch tutorial notebooks" - ) - parser.add_argument( - "--noyoutube", - action="store_true", - help="Extract Bilibili links instead of youtube", - ) - parser.add_argument( - "files", - nargs="+", - help="File name(s) to process. 
Will filter for .ipynb extension.", - ) - return parser.parse_args(arglist) - - if __name__ == "__main__": main(sys.argv[1:]) diff --git a/scripts/find_unreferenced_content.py b/scripts/find_unreferenced_content.py index 43f0691..98df3d2 100644 --- a/scripts/find_unreferenced_content.py +++ b/scripts/find_unreferenced_content.py @@ -1,24 +1,4 @@ -"""Print names of derivative files that are no longer used in the notebooks.""" -from glob import glob - +from nmaci.find_unreferenced_content import main +import sys if __name__ == "__main__": - - day_paths = glob("tutorials/W?D?_*") - for day_path in sorted(day_paths): - - # Read all of the text for this day's student notebooks into one string - student_notebooks = glob(f"{day_path}/student/*.ipynb") - notebook_text = "" - for nb_path in student_notebooks: - with open(nb_path) as f: - notebook_text += f.read() - - # Find solution images and scripts - solution_pattern = "W?D?_*_Solution*" - static_paths = glob(f"{day_path}/static/{solution_pattern}") - script_paths = glob(f"{day_path}/solutions/{solution_pattern}") - - # Print paths that are not referenced in the notebooks - for path in sorted(static_paths + script_paths): - if path not in notebook_text: - print(path) + main(sys.argv[1:]) diff --git a/scripts/generate_book.py b/scripts/generate_book.py index 37ea4dc..075abed 100644 --- a/scripts/generate_book.py +++ b/scripts/generate_book.py @@ -1,339 +1,4 @@ -import os +from nmaci.generate_book import main import sys -import yaml -from jinja2 import Template -import traceback -import json -from bs4 import BeautifulSoup - -ORG = os.environ.get("ORG", "neuromatch") -REPO = os.environ.get("NMA_REPO", "course-content-template") -PREREQ_REPOR = os.environ.get("PREREQ_REPO", "precourse") -PREREQ_INTRO = os.environ.get("PREREQ_INTRO", "ComputationalNeuroscience") -ARG = sys.argv[1] - - -def main(): - with open('tutorials/materials.yml') as fh: - materials = yaml.load(fh, Loader=yaml.FullLoader) - - # Make the dictionary that 
contains the chapters - toc = {} - for m in materials: - if m['category'] not in toc.keys(): - toc[m['category']] = {'part': m['category'], 'chapters': []} - # Add the project booklet - toc['Project Booklet'] = {'part': 'Project Booklet', 'chapters': []} - - art_file_list = os.listdir('tutorials/Art/') - - for m in materials: - directory = f"{m['day']}_{''.join(m['name'].split())}" - - # Make temporary chapter title file - with open(f"tutorials/{directory}/chapter_title.md", - "w+") as title_file: - title_page = f"# {m['name']}" - art_file = [fname for fname in art_file_list if m['day'] in fname] - if len(art_file) == 1: - artist = art_file[0].split('-')[1].split('.')[0] - artist = artist.replace('_', ' ') - title_page += f"\n\n ````{{div}} full-width \n art relevant to chapter contents \n```` \n\n*Artwork by {artist}*" - title_file.write(title_page) - - chapter = {'file': f"tutorials/{directory}/chapter_title.md", - 'title': f"{m['name']} ({m['day']})", - 'sections': []} - print(m['day']) - part = m['category'] - directory = f"tutorials/{m['day']}_{''.join(m['name'].split())}" - - # Make list of notebook sections - notebook_list = [] - notebook_list += [f"{directory}/{ARG}/{m['day']}_Intro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Intro.ipynb") else [] - notebook_list += [f"{directory}/{ARG}/{m['day']}_Tutorial{i + 1}.ipynb" for i in range(m['tutorials'])] - notebook_list += [f"{directory}/{ARG}/{m['day']}_Outro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Outro.ipynb") else [] - - # Add and process all notebooks - for notebook_file_path in notebook_list: - chapter['sections'].append({'file': notebook_file_path}) - pre_process_notebook(notebook_file_path) - - # Add further reading page - chapter['sections'].append({'file': f"{directory}/further_reading.md"}) - - # Add day summary page - notebook_file_path = f"{directory}/{ARG}/{m['day']}_DaySummary.ipynb" - if os.path.exists(notebook_file_path): - chapter['sections'].append({'file': 
notebook_file_path}) - pre_process_notebook(notebook_file_path) - - # Add chapter - toc[part]['chapters'].append(chapter) - - # Project chapter -- based on the repo - # TODO: get this from the project_materials.yml - # with open('projects/project_materials.yml') as fh: - # project_materials = yaml.load(fh, Loader=yaml.FullLoader) - # print(project_materials) -# -# part = 'Project Booklet' -# toc[part]['chapters'].append({'file': 'projects/README.md', 'title': 'Introduction'}) -# toc[part]['chapters'].append({'file': 'projects/docs/project_guidance.md'}) - - -# # Add the project booklet - toc["Project Booklet"] = {"part": "Project Booklet", "chapters": []} - toc["Professional Development"] = { - "part": "Professional Development", "chapters": []} - - with open("projects/professional_development/prof_dev_materials.yml") as fh: - prof_dev_materials = yaml.load(fh, Loader=yaml.FullLoader) - - part = "Professional Development" - toc[part]['chapters'] = prof_dev_materials - - # Project chapter -- based on the repo - with open("projects/project_materials.yml") as fh: - project_materials = yaml.load(fh, Loader=yaml.FullLoader) - - part = "Project Booklet" - toc[part]['chapters'] = project_materials - - # Process Project Notebooks - for m in project_materials: - if m["title"] == "Project materials": - for project in m["sections"]: - pre_process_notebook(project["file"]) - - # Loop over dataset types - # project_datasets = {"file": "projects/docs/datasets_overview.md", "sections": []} - - - - # toc[part]['chapters'].append({'file': 'projects/docs/past_projects_overview.md'}) - # toc[part]['chapters'].append({'file': 'projects/docs/datasets_overview.md'}) - # toc[part]['chapters'].append({'file': 'projects/docs/continuing_your_project_after_the_course.md'}) -# -# # Add Modeling Steps -# toc[part]['chapters'].append({'file': 'projects/modelingsteps/intro.md', -# 'sections': [{'file': 'projects/modelingsteps/ModelingSteps_1through4.ipynb'}, -# {'file': 
'projects/modelingsteps/ModelingSteps_5through10.ipynb'}, -# {'file': 'projects/modelingsteps/TrainIllusionModel.ipynb'}, -# {'file': 'projects/modelingsteps/TrainIllusionDataProject.ipynb'} -# ]}) -# pre_process_notebook('projects/modelingsteps/ModelingSteps_1through4.ipynb') -# pre_process_notebook('projects/modelingsteps/ModelingSteps_5through10.ipynb') -# pre_process_notebook('projects/modelingsteps/TrainIllusionModel.ipynb') -# pre_process_notebook('projects/modelingsteps/TrainIllusionDataProject.ipynb') -# - # Loop over dataset types - # project_datasets = {'file': 'projects/docs/datasets_overview.md', 'sections': []} - -# for category in project_materials[0]['categories']: -# # this_section = {'file': f'projects/docs/{category}.md', 'sections': []} - -# # Add README guide -# this_section['sections'].append({'file': f"projects/{category}/README.md", 'title': 'Guide'}) - -# # Add and process all notebooks -# try: -# this_section['sections'].append({'file': f"projects/{category}/{category}_videos.ipynb"}) -# pre_process_notebook(f"projects/{category}/{category}_videos.ipynb") -# except: -# pass -# ## dataset_loaders = [entry for entry in project_materials if entry['category'] == category] -# ## for notebook in dataset_loaders: -# ## this_section['sections'].append({'file': notebook['link'], 'title': notebook['title']}) -# ## pre_process_notebook(notebook['link']) -# project_datasets['sections'].append(this_section) -# toc[part]['chapters'].append(project_datasets) -# toc[part]['chapters'].append({'file': 'projects/docs/project_templates.md'}) - -# # Past Projects -# p_sections = [] -# year = project_materials[1]['year'] -# for past_categories in project_materials[1]['past_categories']: -# p_sections.append( -# {'file': f'projects/docs/projects_{year}/{past_categories}.md'} -# ) -# toc[part]['chapters'].append( -# { -# 'file': f'projects/docs/project_{year}_highlights.md', -# 'sections': p_sections -# } -# ) - # toc[part]['chapters'].append({'file': 
'projects/docs/project_2020_highlights.md', - # 'sections': [{'file': 'projects/docs/projects_2020/neurons.md'}, - # {'file': 'projects/docs/projects_2020/theory.md'}, - # {'file': 'projects/docs/projects_2020/behavior.md'}, - # {'file': 'projects/docs/projects_2020/fMRI.md'}, - # {'file': 'projects/docs/projects_2020/eeg.md'} - # ]}) - - # TODO: Fix TOC for new format - # Turn toc into list - toc_list = [{'file': f"tutorials/intro.ipynb"}] - if os.path.exists(f"tutorials/intro.ipynb"): - pre_process_notebook(f"tutorials/intro.ipynb") - - # TODO: fix this for the generic case - # TA training file - if ARG == "instructor" and ("climate" not in REPO and "neuroai" not in REPO): - chapter = {'chapters': [{'file': 'tatraining/TA_Training_CN.ipynb'}]} - pre_process_notebook('tatraining/TA_Training_CN.ipynb') - toc_list += [chapter] - # Schedule chapter - chapter = {'chapters': [{'file': 'tutorials/Schedule/schedule_intro.md', - 'sections': [{'file': 'tutorials/Schedule/daily_schedules.md'}, - {'file': 'tutorials/Schedule/shared_calendars.md'}, - {'file': 'tutorials/Schedule/timezone_widget.md'} - ]}]} - toc_list += [chapter] - - # Technical help chapter - chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/tech_intro.md', - 'sections': [{'file': 'tutorials/TechnicalHelp/Jupyterbook.md', - 'sections': [{'file': 'tutorials/TechnicalHelp/Tutorial_colab.md'}, - {'file': 'tutorials/TechnicalHelp/Tutorial_kaggle.md'} - ] - }, - {'file': 'tutorials/TechnicalHelp/Discord.md'} - ]}]} - toc_list += [chapter] - - # Links and Policy file - chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/Links_Policy.md'}]} - toc_list += [chapter] - - # Pre-reqs file - if "climate" in REPO: - chapter = {'chapters': [{'file': f'tutorials/prereqs/{PREREQ_INTRO}.md'}]} - else: - chapter = {'chapters': [{'file': f'prereqs/{PREREQ_INTRO}.md'}]} - toc_list += [chapter] - - for key in toc.keys(): - - # Add wrap-up if it exists - wrapup_name = f'tutorials/Module_WrapUps/{key.replace(" ", 
"")}.ipynb' - if os.path.exists(wrapup_name): - toc[key]['chapters'].append({'file': wrapup_name}) - - toc_list.append(toc[key]) - - with open('book/_toc.yml', 'w') as fh: - yaml.dump(toc_list, fh) - - -def pre_process_notebook(file_path): - - with open(file_path, encoding="utf-8") as read_notebook: - content = json.load(read_notebook) - pre_processed_content = open_in_colab_new_tab(content) - pre_processed_content = change_video_widths(pre_processed_content) - pre_processed_content = link_hidden_cells(pre_processed_content) - with open(file_path, "w", encoding="utf-8") as write_notebook: - json.dump(pre_processed_content, write_notebook, indent=1, ensure_ascii=False) - - -def open_in_colab_new_tab(content): - cells = content['cells'] - parsed_html = BeautifulSoup(cells[0]['source'][0], "html.parser") - for anchor in parsed_html.findAll('a'): - # Open in new tab - anchor['target'] = '_blank' - cells[0]['source'][0] = str(parsed_html) - return content - -def link_hidden_cells(content): - cells = content['cells'] - updated_cells = cells.copy() - - i_updated_cell = 0 - for i_cell, cell in enumerate(cells): - updated_cell = updated_cells[i_updated_cell] - if "source" not in cell: - continue - source = cell['source'][0] - - if source.startswith("#") and cell['cell_type'] == 'markdown': - header_level = source.count('#') - elif source.startswith("---") and cell['cell_type'] == 'markdown': - if len(cell['source']) > 1 and cell['source'][1].startswith("#") and cell['cell_type'] == 'markdown': - header_level = cell['source'][1].count('#') - - if '@title' in source or '@markdown' in source: - if 'metadata' not in cell: - updated_cell['metadata'] = {} - if 'tags' not in cell['metadata']: - updated_cell['metadata']['tags'] = [] - - # Check if cell is video one - if 'YouTubeVideo' in ''.join(cell['source']) or 'IFrame' in ''.join(cell['source']): - if "remove-input" not in cell['metadata']['tags']: - updated_cell['metadata']['tags'].append("remove-input") - else: - if 
"hide-input" not in cell['metadata']['tags']: - updated_cell['metadata']['tags'].append("hide-input") - - # If header is lost, create one in markdown - if '@title' in source: - - if source.split('@title')[1] != '': - header_cell = { - 'cell_type': 'markdown', - 'metadata': {}, - 'source': ['#'*(header_level + 1) + ' ' + source.split('@title')[1]]} - updated_cells.insert(i_updated_cell, header_cell) - i_updated_cell += 1 - - strings_with_markdown = [(i, string) for i, string in enumerate(cell['source']) if '@markdown' in string] - if len(strings_with_markdown) == 1: - i = strings_with_markdown[0][0] - if cell['source'][i].split('@markdown')[1] != '': - header_cell = { - 'cell_type': 'markdown', - 'metadata': {}, - 'source': [cell['source'][i].split('@markdown')[1]]} - updated_cells.insert(i_updated_cell, header_cell) - i_updated_cell += 1 - - i_updated_cell += 1 - - content['cells'] = updated_cells - return content - -def change_video_widths(content): - - for i, cell in enumerate(content['cells']): - if 'YouTubeVideo' in ''.join(cell['source']): - - for ind in range(len(cell['source'])): - # Change sizes - cell['source'][ind] = cell['source'][ind].replace('854', '730') - cell['source'][ind] = cell['source'][ind].replace('480', '410') - - # Put slides in ipywidget so they don't overlap margin - if '# @title Tutorial slides\n' in cell['source'] or '# @title Slides\n' in cell['source'] or '## Slides' in content['cells'][i-1]['source']: - for line in cell['source']: - if line.startswith('link_id'): - slide_link = line.split('"')[1] - break - # Catch the case with missing link_id - else: - slide_link = "" - download_link = f"https://osf.io/download/{slide_link}/" - render_link = f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{slide_link}/?direct%26mode=render%26action=download%26mode=render" - cell['source'] = ['# @markdown\n', - 'from IPython.display import IFrame\n', - 'from ipywidgets import widgets\n', - 'out = widgets.Output()\n', - 'with out:\n', - f' 
print(f"If you want to download the slides: {download_link}")\n', - f' display(IFrame(src=f"{render_link}", width=730, height=410))\n', - 'display(out)'] - return content - -if __name__ == '__main__': - main() +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/scripts/generate_book_dl.py b/scripts/generate_book_dl.py index 82f7589..4e8ff18 100644 --- a/scripts/generate_book_dl.py +++ b/scripts/generate_book_dl.py @@ -1,259 +1,4 @@ -import os +from nmaci.generate_book_dl import main import sys -import yaml -from jinja2 import Template -import traceback -import json -from bs4 import BeautifulSoup - -REPO = os.environ.get("NMA_REPO", "course-content-dl") -ARG = sys.argv[1] - -def main(): - with open('tutorials/materials.yml') as fh: - materials = yaml.load(fh, Loader=yaml.FullLoader) - - # Make the dictionary that contains the chapters - toc = {} - for m in materials: - if m['category'] not in toc.keys(): - toc[m['category']] = {'part': m['category'], 'chapters': []} - # Add the project booklet - toc['Project Booklet'] = {'part': 'Project Booklet', 'chapters': []} - - art_file_list = os.listdir('tutorials/Art/') - - for m in materials: - directory = f"{m['day']}_{''.join(m['name'].split())}" - - # Make temporary chapter title file - with open(f"tutorials/{directory}/chapter_title.md", - "w+") as title_file: - title_page = f"# {m['name']}" - art_file = [fname for fname in art_file_list if m['day'] in fname] - if len(art_file) == 1: - artist = art_file[0].split('-')[1].split('.')[0] - artist = artist.replace('_', ' ') - title_page += f"\n\n ````{{div}} full-width \n art relevant to chapter contents \n```` \n\n*Artwork by {artist}*" - title_file.write(title_page) - - chapter = {'file': f"tutorials/{directory}/chapter_title.md", - 'title': f"{m['name']} ({m['day']})", - 'sections': []} - print(m['day']) - part = m['category'] - directory = f"tutorials/{m['day']}_{''.join(m['name'].split())}" - - # Make list of notebook sections - notebook_list = [] - 
notebook_list += [f"{directory}/{ARG}/{m['day']}_Tutorial{i + 1}.ipynb" for i in range(m['tutorials'])] - notebook_list += [f"{directory}/{ARG}/{m['day']}_BonusLecture.ipynb"] if os.path.exists(f"{directory}/{m['day']}_BonusLecture.ipynb") else [] - - # Add and process all notebooks - for notebook_file_path in notebook_list: - chapter['sections'].append({'file': notebook_file_path}) - pre_process_notebook(notebook_file_path) - - # Add further reading page - # chapter['sections'].append({'file': f"{directory}/further_reading.md"}) - - # Add chapter - toc[part]['chapters'].append(chapter) - - # Project chapter -- under construction - part = 'Project Booklet' - toc[part]['chapters'].append({'file': 'projects/README.md', 'title': 'Introduction'}) - toc[part]['chapters'].append({'file': 'projects/docs/project_guidance.md'}) - - with open('projects/project_materials.yml') as fh: - project_materials = yaml.load(fh, Loader=yaml.FullLoader) - - # Add modelling steps - category = 'modelingsteps' - this_section = {'file': f'projects/{category}/intro.md', 'sections': []} - for m in project_materials: - if m['category'] == category: - this_section['sections'].append({'file': f"projects/{category}/{m['link']}"}) - pre_process_notebook(f"projects/{category}/{m['link']}") - toc[part]['chapters'].append(this_section) - print(category) - - # Add project templates - project_datasets = {'file': 'projects/docs/projects_overview.md', 'sections': []} - # Loop over project categories - for category in ['ComputerVision', 'ReinforcementLearning', 'NaturalLanguageProcessing', 'Neuroscience']: - print(category) - # Add each category section - this_section = {'file': f'projects/{category}/README.md', - 'sections': [{'file': f'projects/{category}/slides.md'}, - {'file': f'projects/{category}/ideas_and_datasets.md'}]} - for m in project_materials: - if m['category'] == category: - # Add and process all notebooks - try: - this_section['sections'].append({'file': 
f"projects/{category}/{m['link']}"}) - pre_process_notebook(f"projects/{category}/{m['link']}") - except: - pass - project_datasets['sections'].append(this_section) - toc[part]['chapters'].append(project_datasets) - - # Add models and datasets - toc[part]['chapters'].append({'file': 'projects/docs/datasets_and_models.md'}) - # Turn toc into list - toc_list = [{'file': 'tutorials/intro.ipynb'}] - if os.path.exists("tutorials/intro.ipynb"): - pre_process_notebook('tutorials/intro.ipynb') - - # TA training file - if ARG == "instructor": - chapter = {'chapters': [{'file': 'tatraining/TA_Training_DL.ipynb'}]} - pre_process_notebook('tatraining/TA_Training_DL.ipynb') - toc_list += [chapter] - - # Schedule chapter - chapter = {'chapters': [{'file': 'tutorials/Schedule/schedule_intro.md', - 'sections': [{'file': 'tutorials/Schedule/daily_schedules.md'}, - {'file': 'tutorials/Schedule/shared_calendars.md'}, - {'file': 'tutorials/Schedule/timezone_widget.md'} - ]}]} - toc_list += [chapter] - - # Technical help chapter - chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/tech_intro.md', - 'sections': [{'file': 'tutorials/TechnicalHelp/Jupyterbook.md', - 'sections': [{'file': 'tutorials/TechnicalHelp/Tutorial_colab.md'}, - {'file': 'tutorials/TechnicalHelp/Tutorial_kaggle.md'} - ] - }, - {'file': 'tutorials/TechnicalHelp/Discord.md'} - ]}]} - toc_list += [chapter] - - # Links and Policy file - chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/Links_Policy.md'}]} - toc_list += [chapter] - - # Pre-reqs file - chapter = {'chapters': [{'file': 'prereqs/DeepLearning.md'}]} - toc_list += [chapter] - - for key in toc.keys(): - # Add wrap-up if it exists - wrapup_name = f'tutorials/Module_WrapUps/{key.replace(" ", "")}.ipynb' - if os.path.exists(wrapup_name): - toc[key]['chapters'].append({'file': wrapup_name}) - - toc_list.append(toc[key]) - - with open('book/_toc.yml', 'w') as fh: - yaml.dump(toc_list, fh) - - -def pre_process_notebook(file_path): - - with 
open(file_path, encoding="utf-8") as read_notebook: - content = json.load(read_notebook) - pre_processed_content = open_in_colab_new_tab(content) - pre_processed_content = change_video_widths(pre_processed_content) - pre_processed_content = link_hidden_cells(pre_processed_content) - with open(file_path, "w", encoding="utf-8") as write_notebook: - json.dump(pre_processed_content, write_notebook, indent=1, ensure_ascii=False) - - -def open_in_colab_new_tab(content): - cells = content['cells'] - parsed_html = BeautifulSoup(cells[0]['source'][0], "html.parser") - for anchor in parsed_html.findAll('a'): - # Open in new tab - anchor['target'] = '_blank' - cells[0]['source'][0] = str(parsed_html) - return content - -def link_hidden_cells(content): - cells = content['cells'] - updated_cells = cells.copy() - - i_updated_cell = 0 - for i_cell, cell in enumerate(cells): - updated_cell = updated_cells[i_updated_cell] - if "source" not in cell: - continue - source = cell['source'][0] - - if source.startswith("#") and cell['cell_type'] == 'markdown': - header_level = source.count('#') - elif source.startswith("---") and cell['cell_type'] == 'markdown': - if len(cell['source']) > 1 and cell['source'][1].startswith("#") and cell['cell_type'] == 'markdown': - header_level = cell['source'][1].count('#') - - if '@title' in source or '@markdown' in source: - if 'metadata' not in cell: - updated_cell['metadata'] = {} - if 'tags' not in cell['metadata']: - updated_cell['metadata']['tags'] = [] - - # Check if cell is video one - if 'YouTubeVideo' in ''.join(cell['source']) or 'IFrame' in ''.join(cell['source']): - if "remove-input" not in cell['metadata']['tags']: - updated_cell['metadata']['tags'].append("remove-input") - else: - if "hide-input" not in cell['metadata']['tags']: - updated_cell['metadata']['tags'].append("hide-input") - - # If header is lost, create one in markdown - if '@title' in source: - - if source.split('@title')[1] != '': - header_cell = { - 'cell_type': 
'markdown', - 'metadata': {}, - 'source': ['#'*(header_level + 1) + ' ' + source.split('@title')[1]]} - updated_cells.insert(i_updated_cell, header_cell) - i_updated_cell += 1 - - strings_with_markdown = [(i, string) for i, string in enumerate(cell['source']) if '@markdown' in string] - if len(strings_with_markdown) == 1: - i = strings_with_markdown[0][0] - if cell['source'][i].split('@markdown')[1] != '': - header_cell = { - 'cell_type': 'markdown', - 'metadata': {}, - 'source': [cell['source'][i].split('@markdown')[1]]} - updated_cells.insert(i_updated_cell, header_cell) - i_updated_cell += 1 - - i_updated_cell += 1 - - content['cells'] = updated_cells - return content - -def change_video_widths(content): - - for i, cell in enumerate(content['cells']): - if 'YouTubeVideo' in ''.join(cell['source']): - - for ind in range(len(cell['source'])): - # Change sizes - cell['source'][ind] = cell['source'][ind].replace('854', '730') - cell['source'][ind] = cell['source'][ind].replace('480', '410') - - # Put slides in ipywidget so they don't overlap margin - if '# @title Tutorial slides\n' in cell['source'] or '# @title Slides\n' in cell['source'] or '## Slides' in content['cells'][i-1]['source']: - for line in cell['source']: - if line.startswith('link_id'): - slide_link = line.split('"')[1] - download_link = f"https://osf.io/download/{slide_link}/" - render_link = f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{slide_link}/?direct%26mode=render%26action=download%26mode=render" - cell['source'] = ['# @markdown\n', - 'from IPython.display import IFrame\n', - 'from ipywidgets import widgets\n', - 'out = widgets.Output()\n', - 'with out:\n', - f' print(f"If you want to download the slides: {download_link}")\n', - f' display(IFrame(src=f"{render_link}", width=730, height=410))\n', - 'display(out)'] - return content - -if __name__ == '__main__': - main() +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/scripts/generate_book_precourse.py 
b/scripts/generate_book_precourse.py index 5067fab..7001df2 100644 --- a/scripts/generate_book_precourse.py +++ b/scripts/generate_book_precourse.py @@ -1,197 +1,4 @@ -import os - -import yaml -from jinja2 import Template -import traceback -import json -from bs4 import BeautifulSoup - -REPO = os.environ.get("NMA_REPO", "precourse") - -def main(): - with open('tutorials/materials.yml') as fh: - materials = yaml.load(fh, Loader=yaml.FullLoader) - - # Make the dictionary that contains the chapters - toc = {} - for m in materials: - if m['category'] not in toc.keys(): - toc[m['category']] = {'part': m['category'], 'chapters': []} - - art_file_list = os.listdir('tutorials/Art/') - - art_file_list = os.listdir('tutorials/Art/') - - for m in materials: - directory = f"{m['day']}_{''.join(m['name'].split())}" - - # Make temporary chapter title file - with open(f"tutorials/{directory}/chapter_title.md", - "w+") as title_file: - title_page = f"# {m['name']}" - art_file = [fname for fname in art_file_list if m['day'] in fname] - if len(art_file) == 1: - artist = art_file[0].split('-')[1].split('.')[0] - artist = artist.replace('_', ' ') - title_page += f"\n\n ````{{div}} full-width \n art relevant to chapter contents \n```` \n\n*Artwork by {artist}*" - title_file.write(title_page) - - chapter = {'file': f"tutorials/{directory}/chapter_title.md", - 'title': f"{m['name']} ({m['day']})", - 'sections': []} - print(m['day']) - part = m['category'] - directory = f"tutorials/{m['day']}_{''.join(m['name'].split())}" - - # Make list of notebook sections - notebook_list = [] - notebook_list += [f"{directory}/{m['day']}_Intro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Intro.ipynb") else [] - notebook_list += [f"{directory}/student/{m['day']}_Tutorial{i + 1}.ipynb" for i in range(m['tutorials'])] - notebook_list += [f"{directory}/{m['day']}_Outro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Outro.ipynb") else [] - notebook_list += 
[f"{directory}/{m['day']}_DaySummary.ipynb"] if os.path.exists(f"{directory}/{m['day']}_DaySummary.ipynb") else [] - - # Add and process all notebooks - for notebook_file_path in notebook_list: - chapter['sections'].append({'file': notebook_file_path}) - pre_process_notebook(notebook_file_path) - - # Add chapter - toc[part]['chapters'].append(chapter) - - # Turn toc into list - toc_list = [{'file': 'tutorials/intro.ipynb'}] - if os.path.exists("tutorials/intro.ipynb"): - pre_process_notebook('tutorials/intro.ipynb') - - # Technical help chapter - chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/tech_intro.md', - 'sections': [{'file': 'tutorials/TechnicalHelp/Jupyterbook.md', - 'sections': [{'file': 'tutorials/TechnicalHelp/Tutorial_colab.md'}, - {'file': 'tutorials/TechnicalHelp/Tutorial_kaggle.md'} - ] - }, - {'file': 'tutorials/TechnicalHelp/Discord.md'} - ]}]} - toc_list += [chapter] - for key in toc.keys(): - - # Add wrap-up if it exists - wrapup_name = f'tutorials/Module_WrapUps/{key.replace(" ", "")}.ipynb' - if os.path.exists(wrapup_name): - toc[key]['chapters'].append({'file': wrapup_name}) - - toc_list.append(toc[key]) - - with open('book/_toc.yml', 'w') as fh: - yaml.dump(toc_list, fh) - - -def pre_process_notebook(file_path): - - with open(file_path, encoding="utf-8") as read_notebook: - content = json.load(read_notebook) - pre_processed_content = open_in_colab_new_tab(content) - pre_processed_content = change_video_widths(pre_processed_content) - pre_processed_content = link_hidden_cells(pre_processed_content) - with open(file_path, "w", encoding="utf-8") as write_notebook: - json.dump(pre_processed_content, write_notebook, indent=1, ensure_ascii=False) - - -def open_in_colab_new_tab(content): - cells = content['cells'] - parsed_html = BeautifulSoup(cells[0]['source'][0], "html.parser") - for anchor in parsed_html.findAll('a'): - # Open in new tab - anchor['target'] = '_blank' - cells[0]['source'][0] = str(parsed_html) - return content - - -def 
link_hidden_cells(content): - cells = content['cells'] - updated_cells = cells.copy() - - i_updated_cell = 0 - for i_cell, cell in enumerate(cells): - updated_cell = updated_cells[i_updated_cell] - if "source" not in cell: - continue - source = cell['source'][0] - - if source.startswith("#") and cell['cell_type'] == 'markdown': - header_level = source.count('#') - elif source.startswith("---") and cell['cell_type'] == 'markdown': - if len(cell['source']) > 1 and cell['source'][1].startswith("#") and cell['cell_type'] == 'markdown': - header_level = cell['source'][1].count('#') - - if '@title' in source or '@markdown' in source: - if 'metadata' not in cell: - updated_cell['metadata'] = {} - if 'tags' not in cell['metadata']: - updated_cell['metadata']['tags'] = [] - - # Check if cell is video one - if 'YouTubeVideo' in ''.join(cell['source']) or 'IFrame' in ''.join(cell['source']): - if "remove-input" not in cell['metadata']['tags']: - updated_cell['metadata']['tags'].append("remove-input") - else: - if "hide-input" not in cell['metadata']['tags']: - updated_cell['metadata']['tags'].append("hide-input") - - # If header is lost, create one in markdown - if '@title' in source: - - if source.split('@title')[1] != '': - header_cell = { - 'cell_type': 'markdown', - 'metadata': {}, - 'source': ['#'*(header_level + 1) + ' ' + source.split('@title')[1]]} - updated_cells.insert(i_updated_cell, header_cell) - i_updated_cell += 1 - - strings_with_markdown = [(i, string) for i, string in enumerate(cell['source']) if '@markdown' in string] - if len(strings_with_markdown) == 1: - i = strings_with_markdown[0][0] - if cell['source'][i].split('@markdown')[1] != '': - header_cell = { - 'cell_type': 'markdown', - 'metadata': {}, - 'source': [cell['source'][i].split('@markdown')[1]]} - updated_cells.insert(i_updated_cell, header_cell) - i_updated_cell += 1 - - i_updated_cell += 1 - - content['cells'] = updated_cells - return content - - -def change_video_widths(content): - - for i, 
cell in enumerate(content['cells']): - if 'YouTubeVideo' in ''.join(cell['source']): - - for ind in range(len(cell['source'])): - # Change sizes - cell['source'][ind] = cell['source'][ind].replace('854', '730') - cell['source'][ind] = cell['source'][ind].replace('480', '410') - - # Put slides in ipywidget so they don't overlap margin - if '# @title Tutorial slides\n' in cell['source'] or '# @title Slides\n' in cell['source'] or '## Slides' in content['cells'][i-1]['source']: - for line in cell['source']: - if line.startswith('link_id'): - slide_link = line.split('"')[1] - download_link = f"https://osf.io/download/{slide_link}/" - render_link = f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{slide_link}/?direct%26mode=render%26action=download%26mode=render" - cell['source'] = ['# @markdown\n', - 'from IPython.display import IFrame\n', - 'from ipywidgets import widgets\n', - 'out = widgets.Output()\n', - 'with out:\n', - f' print(f"If you want to download the slides: {download_link}")\n', - f' display(IFrame(src=f"{render_link}", width=730, height=410))\n', - 'display(out)'] - return content - -if __name__ == '__main__': - main() +from nmaci.generate_book_precourse import main +import sys +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/scripts/generate_tutorial_readmes.py b/scripts/generate_tutorial_readmes.py index 3eecc46..e7c863f 100644 --- a/scripts/generate_tutorial_readmes.py +++ b/scripts/generate_tutorial_readmes.py @@ -1,261 +1,4 @@ -"""Write a directory of tutorial notebooks to the README file. - -Run this script from the root of the github repository. 
- -""" -import os -from glob import glob -import yaml - -ORG = os.environ.get("ORG", "neuromatch") -REPO = os.environ.get("NMA_REPO", "course-content") -MAIN_BRANCH = os.environ.get("NMA_MAIN_BRANCH", "main") - - -def main(): - - # Initialize the lines in tutorials/README.md - course_readme_text = [ - ] - - try: - playlist_urls = load_youtube_playlist_urls() - except Exception as err: - print("Encountered error while loading youtube playlist links") - print(err) - playlist_urls = {} - - try: - slide_urls = load_slide_urls() - except Exception as err: - print("Encountered error while loading slide links") - print(err) - slide_urls = {} - - day_anchors = {} - - day_paths = sorted(glob("tutorials/W?D?_*")) - for day_path in day_paths: - - day_name = os.path.split(day_path)[-1] - day_code, topic_code = day_name.split("_") - - # Split the UpperCamelCase topic name into separate words - topic_words = [] - for letter in topic_code: - if letter.isupper(): - topic_words.append(letter) - else: - topic_words[-1] += letter - topic = " ".join(topic_words) - - # Note: this will fail if we have 10+ notebooks - notebooks = sorted(glob(f"{day_path}/*.ipynb")) - - if not notebooks: - continue - - # Track the anchor to this section for embed in the header - anchor = "-".join([ - day_code.lower(), - "-", - ("-".join(topic_words)).lower(), - ]) - - day_anchors[day_code] = "#" + anchor - - instructor_notebooks = get_instructor_links(notebooks) - student_notebooks = get_student_links(notebooks) - - # Write the day information into the course README - course_readme_text.extend([ - f"## {day_code} - {topic}", - "", - ]) - - # Add a link to the YouTube lecture playlist, if we have one - youtube_url = playlist_urls.get(day_code, None) - if youtube_url is not None: - course_readme_text.extend([ - f"[YouTube Playlist]({youtube_url})" - "", - ]) - - slide_links_by_topic = slide_urls.get(day_code, None) - if slide_links_by_topic is not None: - slide_links = [ - f"[{topic}]({url})" for topic, url 
in slide_links_by_topic - ] - course_readme_text.extend([ - "", - "Slides: " + " | ".join(slide_links), - "", - ]) - - course_readme_text.extend(write_badge_table(student_notebooks)) - course_readme_text.append("\n") - - # Add further reading - further_reading_file = f"{day_path}/further_reading.md" - if os.path.exists(further_reading_file): - reading_url = f"https://github.com/{ORG}/{REPO}/blob/{MAIN_BRANCH}/{further_reading_file}" - course_readme_text.extend([f"[Further Reading]({reading_url})"]) - course_readme_text.append("\n") - - # Now make the day-specific README - # with links to both instructor and student versions - day_readme_text = [ - f"# {day_code} - {topic}", - "", - "## Instructor notebooks", - "", - ] - day_readme_text.extend(write_badge_table(instructor_notebooks)) - - day_readme_text.extend([ - "## Student notebooks", - "", - ]) - day_readme_text.extend(write_badge_table(student_notebooks)) - - # Write the day README file - with open(f"{day_path}/README.md", "w") as f: - f.write("\n".join(day_readme_text)) - - # Create relative anchor links to each day - nav_line = " | ".join([ - f"[{day_code}]({anchor})" for day_code, anchor in day_anchors.items() - ]) - - # Add an introductory header to the main README - course_readme_header = [ - "# Neuromatch Academy Tutorial Materials", - "", - "", - "", - nav_line, - "", - "*Warning:* The 'render with NBViewer' buttons may show outdated content.", - "", - ] - course_readme_text = course_readme_header + course_readme_text - - # Write the course README file - with open("tutorials/README.md", "w") as f: - f.write("\n".join(course_readme_text)) - - -def load_youtube_playlist_urls(): - """Create a mapping from day code to youtube link based on text file.""" - with open('tutorials/materials.yml') as fh: - materials = yaml.load(fh, Loader=yaml.FullLoader) - days = [m['day'] for m in materials] - playlists = [m['playlist'] for m in materials] - return dict(zip(days, playlists)) - - -def load_slide_urls(): - 
"""Create a hierarchical mapping to slide PDF urls based on text file.""" - with open('tutorials/materials.yml') as fh: - materials = yaml.load(fh, Loader=yaml.FullLoader) - slide_links = {} - for ind, day_dict in enumerate(materials): - if 'slides' in day_dict: - slide_links[day_dict['day']] = [] - for slide_info in day_dict['slides']: - slide_links[day_dict['day']].append((slide_info['title'], slide_info['link'])) - return slide_links - - -def write_badge_table(notebooks): - """Make a markdown table with colab/nbviewer badge links.""" - - # Add the table header - table_text = [ - "| | Run | Run | View |", - "| - | --- | --- | ---- |", - ] - - # Get ordered list of file names - notebook_list = [name for name in notebooks if 'Intro' in name] - notebook_list += [name for name in notebooks if 'Tutorial' in name] - notebook_list += [name for name in notebooks if 'Outro' in name] - - # Add badges - for local_path in notebook_list: - # Extract type of file (intro vs outro vs tutorial) - notebook_name = local_path.split('_')[-1].split('.ipynb')[0] - - # Add space between Tutorial and number - if 'Tutorial' in notebook_name: - notebook_name = f"Tutorial {notebook_name.split('Tutorial')[1]}" - colab_badge = make_colab_badge(local_path) - kaggle_badge = make_kaggle_badge(local_path) - nbviewer_badge = make_nbviewer_badge(local_path) - table_text.append( - f"| {notebook_name} | {colab_badge} | {kaggle_badge} | {nbviewer_badge} |" - ) - table_text.append("\n") - - return table_text - - -def get_instructor_links(base_notebooks): - """Convert a list of base notebook paths to instructor versions.""" - instructor_notebooks = [] - for base_nb in base_notebooks: - if 'Tutorial' in base_nb: - day_path, nb_fname = os.path.split(base_nb) - instructor_notebooks.append(f"{day_path}/instructor/{nb_fname}") - else: - instructor_notebooks.append(base_nb) - return instructor_notebooks - - -def get_student_links(base_notebooks): - """Convert a list of base notebook paths to student 
versions.""" - student_notebooks = [] - for base_nb in base_notebooks: - if 'Tutorial' in base_nb: - day_path, nb_fname = os.path.split(base_nb) - student_notebooks.append(f"{day_path}/student/{nb_fname}") - else: - student_notebooks.append(base_nb) - return student_notebooks - - -def make_colab_badge(local_path): - """Generate a Google Colaboratory badge for a notebook on github.""" - alt_text = "Open In Colab" - badge_svg = "https://colab.research.google.com/assets/colab-badge.svg" - service = "https://colab.research.google.com" - url_base = f"{service}/github/{ORG}/{REPO}/blob/{MAIN_BRANCH}" - return make_badge(alt_text, badge_svg, service, local_path, url_base) - - -def make_kaggle_badge(local_path): - """Generate a kaggle badge for a notebook on github.""" - alt_text = "Open In kaggle" - badge_svg = "https://kaggle.com/static/images/open-in-kaggle.svg" - service = "https://kaggle.com/kernels/welcome?src=" - url_base = f"{service}https://raw.githubusercontent.com/{ORG}/{REPO}/{MAIN_BRANCH}" - return make_badge(alt_text, badge_svg, service, local_path, url_base) - - -def make_nbviewer_badge(local_path): - """Generate an NBViewer badge for a notebook on github.""" - alt_text = "View the notebook" - badge_svg = "https://img.shields.io/badge/render-nbviewer-orange.svg" - service = "https://nbviewer.jupyter.org" - url_base = f"{service}/github/{ORG}/{REPO}/blob/{MAIN_BRANCH}" - return make_badge(alt_text, badge_svg, service, f"{local_path}?flush_cache=true", url_base) - - -def make_badge(alt_text, badge_svg, service, local_path, url_base): - """Generate a markdown element for a badge image that links to a file.""" - return f"[![{alt_text}]({badge_svg})]({url_base}/{local_path})" - - +from nmaci.generate_tutorial_readmes import main +import sys if __name__ == "__main__": - - main() + main(sys.argv[1:]) diff --git a/scripts/lint_tutorial.py b/scripts/lint_tutorial.py index 570cbc8..026cf89 100644 --- a/scripts/lint_tutorial.py +++ b/scripts/lint_tutorial.py @@ -1,192 
+1,4 @@ -"""Lint tutorial notebooks with pyflakes and pycodestyle (aka flake8). - -Running this script on a notebook will print a report of issues flagged by -pyflakes (which checks some aspects of code quality) and pycodestyle (which -checks adherence to the PEP8 stylistic standards). - -Note that these checks do not capture all potential issues with a codebase, -and some checks will false-alarm because of deliberate choices we have made -about how to write tutorials. Nevertheless, this can be an easy way to flag -potential issues. - -Requires nbformat (part of Jupyter) and flake8. - -""" -import os -import io -import re +from nmaci.lint_tutorial import main import sys -import argparse -import tempfile -import subprocess -import collections -import nbformat -from pyflakes.api import check -from pyflakes.reporter import Reporter - - -def main(arglist): - - args = parse_args(arglist) - - _, fname = os.path.split(args.path) - - script, cell_lines = extract_code(args.path) - warnings, errors = check_code(script) - violations = check_style(script) - - if args.brief: - report_brief(fname, warnings, errors, violations) - else: - line_map = remap_line_numbers(cell_lines) - report_verbose(fname, warnings, errors, violations, line_map) - - -def parse_args(arglist): - - parser = argparse.ArgumentParser(__doc__) - parser.add_argument("path", help="Path to notebook file") - parser.add_argument("--brief", action="store_true", - help="Print brief report (useful for aggregating)") - - return parser.parse_args(arglist) - - -def extract_code(nb_fname): - """Turn code cells from notebook into a script, track cell sizes.""" - with open(nb_fname) as f: - nb = nbformat.read(f, nbformat.NO_CONVERT) - - script_lines = [] - cell_lengths = [] - for cell in nb.get("cells", []): - if cell["cell_type"] == "code": - cell_lines = cell.get("source", "").split("\n") - cell_lengths.append(len(cell_lines)) - for line in cell_lines: - if line and line[0] in ["!", "%"]: # IPython syntax - line = "# " 
+ line - script_lines.append(line) - - script = "\n".join(script_lines) - - return script, cell_lengths - - -def check_code(script): - """Run pyflakes checks over the script and capture warnings/errors.""" - errors = io.StringIO() - warnings = io.StringIO() - reporter = Reporter(warnings, errors) - check(script, "notebook", reporter) - - warnings.seek(0) - errors.seek(0) - - return warnings, errors - - -def check_style(script): - """Write a temporary script and run pycodestyle (PEP8) on it.""" - - with tempfile.NamedTemporaryFile("w", suffix=".py") as f: - - f.write(script) - - cmdline = [ - "pycodestyle", - "--ignore=E111,E114", - "--max-line-length=88", - f.name, - ] - res = subprocess.run(cmdline, capture_output=True) - - output = res.stdout.decode().replace(f.name, "f").split("\n") - - if not output: - return collections.Counter() - - error_classes = [] - pat = re.compile(r"^f:\d+:\d+: (\w\d{3}) (.*)$") - for line in output: - m = pat.match(line) - if m is not None: - error_classes.append(f"{m.group(1)} ({m.group(2)})") - - return collections.Counter(error_classes) - - -def remap_line_numbers(cell_lines): - """Create a mapping from script line number to notebook cell/line.""" - line_map = {} - cell_start = 0 - for cell, cell_length in enumerate(cell_lines, 1): - for line in range(1, cell_length + 1): - line_map[cell_start + line] = cell, line - cell_start += cell_length - return line_map - - -def report_brief(fname, warnings, errors, violations): - """Print a single-line report, suibtable for aggregation.""" - n_warnings = len(warnings.read().splitlines()) - n_errors = len(errors.read().splitlines()) - n_violations = len(list(violations.elements())) - print(f"{fname} {n_warnings + n_errors} {n_violations}") - - -def report_verbose(fname, warnings, errors, violations, line_map): - """Report every pyflakes problem and more codestyle information.""" - s = f"Code report for {fname}" - print("", s, "=" * len(s), sep="\n") - - s = "Quality (pyflakes)" - print("", s, 
"-" * len(s), "", sep="\n") - - warning_lines = reformat_line_problems(warnings, line_map) - error_lines = reformat_line_problems(errors, line_map, "ERROR in ") - - issues = warning_lines + error_lines - print(f"Total code issues: {len(issues)}") - if issues: - print() - print("\n".join(warning_lines + error_lines)) - - s = "Style (pycodestyle)" - print("", s, "-" * len(s), "", sep="\n") - - n = len(list(violations.elements())) - print(f"Total PEP8 violations: {n}") - - # TODO parametrize n_most_common - if violations: - print() - print("Common problems:") - for code, count in violations.most_common(10): - plural = "" if count == 1 else "s" - print(f"- {count} instance{plural} of {code}") - - print("") - - -def reformat_line_problems(stream, line_map, prefix=""): - """Reformat a pyflakes output stream for notebook cells.""" - pat = re.compile(r"^\w*:(\d+):\d+ (.+)$") - - new_lines = [] - orig_lines = stream.read().splitlines() - - for line in orig_lines: - m = pat.match(line) - if m: - cell, line = line_map[int(m.group(1))] - new_lines.append( - f"{prefix}Cell {cell}, Line {line}: {m.group(2)}" - ) - - return new_lines - - if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/scripts/make_pr_comment.py b/scripts/make_pr_comment.py index 49cbf67..9bbf293 100644 --- a/scripts/make_pr_comment.py +++ b/scripts/make_pr_comment.py @@ -1,102 +1,4 @@ -"""Write a comment to be added to a pull request on github: - -- Add Colab badges for the branch version of the notebooks -- Run the code linter over the notebooks and include the report - -""" -import os +from nmaci.make_pr_comment import main import sys -import argparse -import subprocess - -ORG = os.environ.get("ORG", "neuromatch") -REPO = os.environ.get("NMA_REPO", "course-content") - -def main(arglist): - - args = parse_args(arglist) - - # Start with a table of badges for the branch versions of the notebooks - comment_lines = [ - make_colab_badge_table(args.branch, args.notebooks), - ] - - # Add a code report 
(under a details tag) for each notebook - for nb_fpath in args.notebooks: - _, nb_fname = os.path.split(nb_fpath) - nb_name, _ = os.path.splitext(nb_fname) - comment_lines.extend([ - "\n" - "
", - f"Code report for {nb_name}", - make_lint_report(nb_fpath), - "---", - "", - "
", - ]) - - # Dump to stdout or a file - comment = "\n".join(comment_lines) - if args.output is None: - print(comment, flush=True) - else: - with open(args.output, "w") as fid: - fid.write(comment) - - -def make_lint_report(nb_fpath): - """Run the tutorial linter on a notebook and capture the output.""" - cmdline = ["python", "ci/lint_tutorial.py", nb_fpath] - res = subprocess.run(cmdline, capture_output=True) - return res.stdout.decode() - - -def make_colab_badge_table(branch, notebooks): - """Add Colab badges for the branch version of each notebook.""" - header = [""] - divider = ["-"] - instructor = ["Instructor"] - student = ["Student"] - - for nb_fpath in notebooks: - nb_dir, nb_fname = os.path.split(nb_fpath) - nb_name, _ = os.path.splitext(nb_fname) - header.append(nb_name) - instructor.append(make_colab_badge(branch, nb_dir, nb_fname)) - if "tutorials" in nb_dir: - student.append(make_colab_badge(branch, nb_dir, nb_fname, student=True)) - divider.append("-") - - rows = header, divider, instructor, student - table = "\n".join( - ["|" + "|".join(row) + "|" for row in rows] - ) - return table - - -def make_colab_badge(branch, nb_dir, nb_fname, student=False): - """Generate a Google Colaboratory badge for a notebook on github.""" - alt_text = "Open In Colab" - badge_svg = "https://colab.research.google.com/assets/colab-badge.svg" - if student: - nb_dir = os.path.join(nb_dir, "student") - url = ( - "https://colab.research.google.com/" - f"github/{ORG}/{REPO}/blob/" - f"{branch}/{nb_dir}/{nb_fname}" - ) - return f"[![{alt_text}]({badge_svg})]({url})" - - -def parse_args(arglist): - - parser = argparse.ArgumentParser() - parser.add_argument("--branch", default=os.environ.get("NMA_MAIN_BRANCH", "main")) - parser.add_argument("--output") - parser.add_argument("notebooks", nargs="*") - return parser.parse_args(arglist) - - if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/scripts/parse_html_for_errors.py b/scripts/parse_html_for_errors.py index 
df4453a..9197458 100644 --- a/scripts/parse_html_for_errors.py +++ b/scripts/parse_html_for_errors.py @@ -1,46 +1,4 @@ -import yaml +from nmaci.parse_html_for_errors import main import sys -from bs4 import BeautifulSoup - -ARG = sys.argv[1] - -def main(): - with open('tutorials/materials.yml') as fh: - materials = yaml.load(fh, Loader=yaml.FullLoader) - - html_directory = 'book/_build/html/' - - # Loop over days - for m in materials: - name = f"{m['day']}_{''.join(m['name'].split())}" - - # Loop over tutorials - for i in range(m['tutorials']): - - # Load html file - notebook_file_path = f"{html_directory}/tutorials/{name}/{ARG}/{m['day']}_Tutorial{i + 1}.html" - with open(notebook_file_path, 'r') as f: - contents = f.read() - parsed_html = BeautifulSoup(contents, features="html.parser") - - # Find code output divs - mydivs = parsed_html.find_all("div", {"class": "cell_output docutils container"}) - - # Remove div if it has an error - for div in mydivs: - if 'NotImplementedError' in str(div) or 'NameError' in str(div): - div.decompose() - - # Put solution figures in center (to fix layout issues) - for img in parsed_html.find_all('img', alt= True): - if img['alt'] == 'Solution hint': - img['align'] = 'center' - img['class'] = 'align-center' - - # save out html - with open(notebook_file_path, 'w') as f: - f.write(str(parsed_html)) - - -if __name__ == '__main__': - main() +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/scripts/process_notebooks.py b/scripts/process_notebooks.py index 36f7ec4..c437b89 100644 --- a/scripts/process_notebooks.py +++ b/scripts/process_notebooks.py @@ -1,627 +1,4 @@ -"""Process tutorials for Neuromatch Academy - -- Filter input file list for .ipynb files -- Check that the cells have been executed sequentially on a fresh kernel -- Strip trailing whitespace from all code lines -- Either: - - Execute the notebook and fail if errors are encountered (apart from the `NotImplementedError`) - - Check that all code cells have been 
executed without error -- Extract solution code and write a .py file with the solution -- Create the student version by replacing solution cells with a "hint" image and a link to the solution code -- Create the instructor version by replacing cells with code exercises with text cells with code in markdown form. -- Redirect Colab-inserted badges to the main branch -- Set the Colab notebook name field based on file path -- Standardize some Colab settings (always have ToC, always hide form cells) -- Clean the notebooks (remove outputs and noisy metadata) -- Write the executed version of the input notebook to its original path -- Write the post-processed notebook to a student/ subdirectory -- Write solution images to a static/ subdirectory -- Write solution code to a solutions/ subdirectory - -""" - -import os -import re +from nmaci.process_notebooks import main import sys - -import time -import argparse -import hashlib -from io import BytesIO -from binascii import a2b_base64 -from copy import deepcopy -from pathlib import Path - -from PIL import Image -import nbformat -from nbconvert.preprocessors import ExecutePreprocessor - - -ORG = os.environ.get("ORG", "neuromatch") -REPO = os.environ.get("NMA_REPO", "course-content-template") -MAIN_BRANCH = os.environ.get("NMA_MAIN_BRANCH", "main") - -GITHUB_RAW_URL = f"https://raw.githubusercontent.com/{ORG}/{REPO}/{MAIN_BRANCH}" -GITHUB_TREE_URL = f"https://github.com/{ORG}/{REPO}/tree/{MAIN_BRANCH}" - - -class LoggingExecutePreprocessor(ExecutePreprocessor): - """ExecutePreprocessor that logs cell-by-cell progress.""" - - def preprocess_cell(self, cell, resources, index): - if cell.cell_type == "code" and cell.source.strip(): - code_preview = cell.source.strip().split("\n")[0][:80] - print(f"[Cell {index + 1}] {code_preview}") - sys.stdout.flush() - start = time.time() - result = super().preprocess_cell(cell, resources, index) - elapsed = time.time() - start - if elapsed > 5: - print(f" completed in {elapsed:.1f}s") - 
sys.stdout.flush() - return result - return super().preprocess_cell(cell, resources, index) - - -def main(arglist): - """Process IPython notebooks from a list of files.""" - args = parse_args(arglist) - - # Filter paths from the git manifest - # - Only process .ipynb - # - Don't process student notebooks - # - Don't process deleted notebooks (which are paths in the git manifest) - def should_process(path): - return all( - [ - path.endswith(".ipynb"), - "student/" not in path, - "instructor/" not in path, - os.path.isfile(path), - ] - ) - - nb_paths = [arg for arg in args.files if should_process(arg)] - if not nb_paths: - print("No notebook files found") - sys.exit(0) - - # Set execution parameters. We allow NotImplementedError as that is raised - # by incomplete exercises and is unlikely to be otherwise encountered. - exec_kws = {"timeout": 14400, "allow_error_names": ["NotImplementedError"]} - - # Allow environment to override stored kernel name - if "NB_KERNEL" in os.environ: - exec_kws["kernel_name"] = os.environ["NB_KERNEL"] - - # Defer failures until after processing all notebooks - notebooks = {} - errors = {} - - for nb_path in nb_paths: - - # Load the notebook structure - with open(nb_path) as f: - nb = nbformat.read(f, nbformat.NO_CONVERT) - - if not sequentially_executed(nb): - if args.require_sequential: - err = ( - "Notebook is not sequentially executed on a fresh kernel." - "\n" - "Please do 'Restart and run all' before pushing to Github." 
- ) - errors[nb_path] = err - continue - - # Clean whitespace from all code cells - clean_whitespace(nb) - - # Ensure that we have an executed notebook, in one of two ways - executor = LoggingExecutePreprocessor(**exec_kws) - if args.execute: - # Check dynamically by executing and reporting errors - print(f"Executing {nb_path}") - error = execute_notebook(executor, nb, args.raise_fast) - elif args.check_execution: - # Check statically by examining the cell outputs - print(f"Checking {nb_path} execution") - error = check_execution(executor, nb, args.raise_fast) - else: - error = None - - if error is None: - notebooks[nb_path] = nb - else: - errors[nb_path] = error - - if errors or args.check_only: - exit(errors) - - # Post-process notebooks - for nb_path, nb in notebooks.items(): - - # Extract components of the notebook path - nb_dir, nb_fname = os.path.split(nb_path) - nb_name, _ = os.path.splitext(nb_fname) - - # Add badges to the main notebook (pointing at itself) - add_badge_cell(nb, nb_path) - - # Ensure that Colab metadata dict exists and enforce some settings - add_colab_metadata(nb, nb_name) - - # Write the original notebook back to disk, clearing outputs only for tutorials - print(f"Writing complete notebook to {nb_path}") - with open(nb_path, "w") as f: - nb_clean = clean_notebook(nb, clear_outputs=nb_path.startswith("tutorials")) - nbformat.write(nb_clean, f) - - # if the notebook is not in tutorials, skip the creation/update of the student, static, solutions directories - if not nb_path.startswith("tutorials"): - continue - - # Create subdirectories, if they don't exist - student_dir = make_sub_dir(nb_dir, "student") - static_dir = make_sub_dir(nb_dir, "static") - solutions_dir = make_sub_dir(nb_dir, "solutions") - instructor_dir = make_sub_dir(nb_dir, "instructor") - - # Generate the student version and save it to a subdirectory - print(f"Extracting solutions from {nb_path}") - processed = extract_solutions(nb, nb_dir, nb_name) - student_nb, 
static_images, solution_snippets = processed - - # Generate the instructor version and save it to a subdirectory - print(f"Create instructor notebook from {nb_path}") - instructor_nb = instructor_version(nb, nb_dir, nb_name) - - # Build paths for student and instructor versions - student_nb_path = os.path.join(student_dir, nb_fname) - instructor_nb_path = os.path.join(instructor_dir, nb_fname) - - # Add badges pointing to the student version - add_badge_cell(student_nb, student_nb_path) - - # Add badges pointing to the instructor version - add_badge_cell(instructor_nb, instructor_nb_path) - - # Write the student version of the notebook - print(f"Writing student notebook to {student_nb_path}") - with open(student_nb_path, "w") as f: - clean_student_nb = clean_notebook(student_nb) - nbformat.write(clean_student_nb, f) - - # Write the images extracted from the solution cells - print(f"Writing solution images to {static_dir}") - for fname, image in static_images.items(): - fname = fname.replace("static", static_dir) - image.save(fname) - - # Write the solution snippets - print(f"Writing solution snippets to {solutions_dir}") - for fname, snippet in solution_snippets.items(): - fname = fname.replace("solutions", solutions_dir) - with open(fname, "w") as f: - f.write(snippet) - - # Write the instructor version of the notebook - print(f"Writing instructor notebook to {instructor_nb_path}") - with open(instructor_nb_path, "w") as f: - clean_instructor_nb = clean_notebook(instructor_nb) - nbformat.write(clean_instructor_nb, f) - - exit(errors) - - -# ------------------------------------------------------------------------------------ # - - -def execute_notebook(executor, nb, raise_fast): - """Execute the notebook, returning errors to be handled.""" - try: - executor.preprocess(nb) - except Exception as error: - if raise_fast: - # Exit here (useful for debugging) - raise error - else: - # Raise the error to be handled by the caller - return error - - -def 
check_execution(executor, nb, raise_fast): - """Check that all code cells with source have been executed without error.""" - error = None - for cell in nb.get("cells", []): - - # Only check code cells - if cell["cell_type"] != "code": - continue - - if cell["source"] and cell["execution_count"] is None: - error = "Notebook has unexecuted code cell(s)." - if raise_fast: - raise RuntimeError(error) - break - else: - for output in cell["outputs"]: - if output["output_type"] == "error": - if output["ename"] in executor.allow_error_names: - continue - error = "\n".join(output["traceback"]) - if raise_fast: - raise RuntimeError("\n" + error) - break - - return error - - -def extract_solutions(nb, nb_dir, nb_name): - """Convert solution cells to markdown; embed images from Python output.""" - nb = deepcopy(nb) - _, tutorial_dir = os.path.split(nb_dir) - - static_images = {} - solution_snippets = {} - - nb_cells = nb.get("cells", []) - for i, cell in enumerate(nb_cells): - - if has_solution(cell): - - # Get the cell source - cell_source = cell["source"] - - # Hash the source to get a unique identifier - cell_id = hashlib.sha1(cell_source.encode("utf-8")).hexdigest()[:8] - - # Extract image data from the cell outputs - cell_images = {} - for j, output in enumerate(cell.get("outputs", [])): - - fname = f"static/{nb_name}_Solution_{cell_id}_{j}.png" - try: - image_data = a2b_base64(output["data"]["image/png"]) - except KeyError: - continue - cell_images[fname] = Image.open(BytesIO(image_data)) - static_images.update(cell_images) - - # Clean up the cell source and assign a filename - snippet = "\n".join(cell_source.split("\n")[1:]) - py_fname = f"solutions/{nb_name}_Solution_{cell_id}.py" - solution_snippets[py_fname] = snippet - - # Convert the solution cell to markdown, - # Insert a link to the solution snippet script on github, - # and embed the image as a link to static file (also on github) - py_url = f"{GITHUB_TREE_URL}/tutorials/{tutorial_dir}/{py_fname}" - new_source = 
f"[*Click for solution*]({py_url})\n\n" - - if cell_images: - new_source += "*Example output:*\n\n" - for f, img in cell_images.items(): - - url = f"{GITHUB_RAW_URL}/tutorials/{tutorial_dir}/{f}" - - # Handle matplotlib retina mode - dpi_w, dpi_h = img.info["dpi"] - w = img.width // (dpi_w // 72) - h = img.height // (dpi_h // 72) - - tag_args = " ".join( - [ - "alt='Solution hint'", - "align='left'", - f"width={w}", - f"height={h}", - f"src={url}", - ] - ) - new_source += f"\n\n" - - cell["source"] = new_source - cell["cell_type"] = "markdown" - cell["metadata"]["colab_type"] = "text" - if "outputID" in cell["metadata"]: - del cell["metadata"]["outputId"] - if "outputs" in cell: - del cell["outputs"] - if "execution_count" in cell: - del cell["execution_count"] - - return nb, static_images, solution_snippets - - -def instructor_version(nb, nb_dir, nb_name): - """Convert notebook to instructor notebook.""" - nb = deepcopy(nb) - _, tutorial_dir = os.path.split(nb_dir) - - nb_cells = nb.get("cells", []) - for i, cell in enumerate(nb_cells): - - if has_code_exercise(cell): - if nb_cells[i - 1]["cell_type"] == "markdown": - cell_id = i - 2 - else: - cell_id = i - 1 - nb_cells[cell_id]["cell_type"] = "markdown" - nb_cells[cell_id]["metadata"]["colab_type"] = "text" - if "outputID" in nb_cells[cell_id]["metadata"]: - del nb_cells[cell_id]["metadata"]["outputId"] - if "outputs" in nb_cells[cell_id]: - del nb_cells[cell_id]["outputs"] - if "execution_count" in nb_cells[cell_id]: - del nb_cells[cell_id]["execution_count"] - - nb_cells[cell_id]["source"] = ( - "```python\n" + nb_cells[cell_id]["source"] + "\n\n```" - ) - - return nb - - -def clean_notebook(nb, clear_outputs=True): - """Remove cell outputs and most unimportant metadata.""" - # Always operate on a copy of the input notebook - nb = deepcopy(nb) - - # Remove some noisy metadata - nb.metadata.pop("widgets", None) - - # Set kernel to default Python3 - nb.metadata["kernel"] = { - "display_name": "Python 3", - 
"language": "python", - "name": "python3", - } - - # Iterate through the cells and clean up each one - for cell in nb.get("cells", []): - - # Remove blank cells - if not cell["source"]: - nb.cells.remove(cell) - continue - - # Reset cell-level Jupyter metadata - for key in ["prompt_number", "execution_count"]: - if key in cell: - cell[key] = None - - if "metadata" in cell: - cell.metadata["execution"] = {} - for field in ["colab", "collapsed", "scrolled", "ExecuteTime", "outputId"]: - cell.metadata.pop(field, None) - - # Reset cell-level Colab metadata - if "id" in cell["metadata"]: - if not cell["metadata"]["id"].startswith("view-in"): - cell["metadata"].pop("id") - - if cell["cell_type"] == "code": - # Remove code cell outputs if requested - if clear_outputs: - cell["outputs"] = [] - - # Ensure that form cells are hidden by default - first_line, *_ = cell["source"].splitlines() - if "@title" in first_line or "@markdown" in first_line: - cell["metadata"]["cellView"] = "form" - - return nb - - -def add_colab_metadata(nb, nb_name): - """Ensure that notebook has Colab metadata and enforce some settings.""" - if "colab" not in nb["metadata"]: - nb["metadata"]["colab"] = {} - - # Always overwrite the name and show the ToC/Colab button - nb["metadata"]["colab"].update( - { - "name": nb_name, - "toc_visible": True, - "include_colab_link": True, - } - ) - - # Allow collapsed sections, but default to not having any - nb["metadata"]["colab"].setdefault("collapsed_sections", []) - - -def clean_whitespace(nb): - """Remove trailing whitespace from all code cell lines.""" - for cell in nb.get("cells", []): - if cell.get("cell_type", "") == "code": - source_lines = cell["source"].splitlines() - clean_lines = [line.rstrip() for line in source_lines] - cell["source"] = "\n".join(clean_lines) - - -def has_solution(cell): - """Return True if cell is marked as containing an exercise solution.""" - cell_text = cell["source"].replace(" ", "").lower() - first_line = 
cell_text.split("\n")[0] - return cell_text.startswith("#@titlesolution") or "to_remove" in first_line - - -def has_code_exercise(cell): - """Return True if cell is marked as containing an exercise solution.""" - cell_text = cell["source"].replace(" ", "").lower() - first_line = cell_text.split("\n")[0] - return cell_text.startswith("#@titlesolution") or "to_removesolution" in first_line - - -def test_has_solution(): - - cell = {"source": "# solution"} - assert not has_solution(cell) - - cell = {"source": "def exercise():\n pass\n# to_remove"} - assert not has_solution(cell) - - cell = {"source": "# to_remove_solution\ndef exercise():\n pass"} - assert has_solution(cell) - - -def remove_existing_badges(nb: dict) -> None: - """Remove existing Colab and Kaggle badges from all cells in the notebook. - - - Removes Colab badge HTML () - - Removes Kaggle badge HTML () - - Removes   spacers between badges - - Deletes cells that become empty after badge removal - - Strips leading/trailing whitespace from cells - """ - colab_pattern = re.compile( - r']*href="[^"]*colab[^"]*"[^>]*>\s*]*colab-badge\.svg[^>]*/>\s*', - re.IGNORECASE, - ) - kaggle_pattern = re.compile( - r']*href="[^"]*kaggle[^"]*"[^>]*>\s*]*open-in-kaggle\.svg[^>]*/>\s*', - re.IGNORECASE, - ) - nbsp_pattern = re.compile(r"\s* \s*") - - cells_to_remove = [] - for i, cell in enumerate(nb.get("cells", [])): - - source = cell.get("source", "") - source = colab_pattern.sub("", source) - source = kaggle_pattern.sub("", source) - source = nbsp_pattern.sub("", source) - source = source.strip() - cell["source"] = source - - if not source: - cells_to_remove.append(i) - - # Remove empty cells in reverse order to maintain indices - for i in reversed(cells_to_remove): - del nb["cells"][i] - - -def generate_badge_cell(nb_path: Path | str) -> dict: - """Generate a markdown cell with Colab and Kaggle badges. 
- - Args: - nb_path: The destination path where the notebook will be written - (e.g., "tutorials/W1D1_Generalization/student/W1D1_Tutorial1.ipynb") - - Returns: - A notebook cell dict with both badges as markdown content. - """ - colab_url = f"https://colab.research.google.com/github/{ORG}/{REPO}/blob/{MAIN_BRANCH}/{nb_path}" - colab_badge = "https://colab.research.google.com/assets/colab-badge.svg" - kaggle_src = ( - f"https://raw.githubusercontent.com/{ORG}/{REPO}/{MAIN_BRANCH}/{nb_path}" - ) - kaggle_url = f"https://kaggle.com/kernels/welcome?src={kaggle_src}" - kaggle_badge = "https://kaggle.com/static/images/open-in-kaggle.svg" - - badge_html = ( - f'' - f'Open In Colab' - f"   " - f'' - f'Open in Kaggle' - ) - - cell = nbformat.v4.new_markdown_cell(source=badge_html) - cell.metadata["id"] = "view-in-github" - cell.metadata["colab_type"] = "text" - return cell - - -def add_badge_cell(nb: dict, nb_path: dict | str) -> None: - """Remove existing badges and add a new badge cell at the top of the notebook. - - Args: - nb: The notebook dict - nb_path: The destination path where the notebook will be written - """ - remove_existing_badges(nb) - badge_cell = generate_badge_cell(nb_path) - nb["cells"].insert(0, badge_cell) - - -def sequentially_executed(nb): - """Return True if notebook appears freshly executed from top-to-bottom.""" - exec_counts = [ - cell["execution_count"] - for cell in nb.get("cells", []) - if (cell["source"] and cell.get("execution_count", None) is not None) - ] - sequential_counts = list(range(1, 1 + len(exec_counts))) - # Returns True if there are no executed code cells, which is fine? 
- return exec_counts == sequential_counts - - -def make_sub_dir(nb_dir, name): - """Create nb_dir/name if it does not exist.""" - sub_dir = os.path.join(nb_dir, name) - if not os.path.exists(sub_dir): - os.mkdir(sub_dir) - return sub_dir - - -def exit(errors): - """Exit with message and status dependent on contents of errors dict.""" - for failed_file, error in errors.items(): - print(f"{failed_file} failed quality control.", file=sys.stderr) - print(error, file=sys.stderr) - - status = bool(errors) - report = "Failure" if status else "Success" - print("=" * 30, report, "=" * 30) - sys.exit(status) - - -def parse_args(arglist): - """Handle the command-line arguments.""" - parser = argparse.ArgumentParser( - description="Process neuromatch tutorial notebooks", - ) - parser.add_argument( - "files", - nargs="+", - help="File name(s) to process. Will filter for .ipynb extension.", - ) - parser.add_argument( - "--execute", - action="store_true", - help="Execute the notebook and fail if errors are encountered.", - ) - parser.add_argument( - "--check-execution", - action="store_true", - dest="check_execution", - help="Check that each code cell has been executed and did not error.", - ) - parser.add_argument( - "--allow-non-sequential", - action="store_false", - dest="require_sequential", - help="Don't fail if the notebook is not sequentially executed.", - ) - parser.add_argument( - "--check-only", - action="store_true", - dest="check_only", - help="Only run QC checks; don't do post-processing.", - ) - parser.add_argument( - "--raise-fast", - action="store_true", - dest="raise_fast", - help="Raise errors immediately rather than collecting and reporting.", - ) - return parser.parse_args(arglist) - - if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/scripts/select_notebooks.py b/scripts/select_notebooks.py index bf947e5..171e595 100644 --- a/scripts/select_notebooks.py +++ b/scripts/select_notebooks.py @@ -1,22 +1,4 @@ -"""From a list of files, select process-able 
notebooks and print.""" -import os +from nmaci.select_notebooks import main import sys - if __name__ == "__main__": - - _, *files = sys.argv - - # Filter paths from the git manifest - # - Only process .ipynb - # - Don't process student notebooks - # - Don't process deleted notebooks - def should_process(path): - return all([ - path.endswith(".ipynb"), - "student/" not in path, - "instructor/" not in path, - os.path.isfile(path), - ]) - - nb_paths = [f for f in files if should_process(f)] - print(" ".join(nb_paths)) + main(sys.argv[1:]) diff --git a/scripts/verify_exercises.py b/scripts/verify_exercises.py index 20471fa..ff02ea9 100644 --- a/scripts/verify_exercises.py +++ b/scripts/verify_exercises.py @@ -1,260 +1,4 @@ -#! /usr/bin/env python -"""Check that exercise code matches solution code. - -Exercises are allowed to deviate from solutions in several ways: - -- Exercise code may replace solution code with an ellipsis (...) -- Exercise code may have "commented-out" solution code - -Additionally: - -- Docstrings are currently ignored -- Blank lines are ignored - -This script will report whether exercises and solutions otherwise match. 
- -""" -import os -import re +from nmaci.verify_exercises import main import sys -import argparse -from textwrap import dedent -from fuzzywuzzy import fuzz -import nbformat - - -def main(arglist): - - args = parse_args(arglist) - - if "skip verification" in args.commit_message: - # Putting this logic here as I didn't have time to figure - # out how to do it in the github actions workflow - print("Skipping exercise verification") - sys.exit(0) - - # Track overall status - failure = False - unmatched = {} - - for nb_fpath in args.files: - - _, nb_name = os.path.split(nb_fpath) - unmatched[nb_name] = [] - - # Load the notebook file - with open(nb_fpath) as f: - nb = nbformat.read(f, nbformat.NO_CONVERT) - - for i, cell in enumerate(nb.get("cells", [])): - - # Detect solution cells based on removal tag - if has_solution(cell): - - # Find a corresponding exercise cell - # (Assume it is the previous *code* cell) - j, stub_cell = 1, None - while (i - j): - stub_cell = nb["cells"][i - j] - if stub_cell["cell_type"] == "code": - break - else: - stub_cell = None - j += 1 - if stub_cell is None: - continue - - # Extract the code and comments from both cells - stub_code, stub_comments = logical_lines(stub_cell["source"]) - solu_code, solu_comments = logical_lines(cell["source"]) - - # Identify violations in the exercise cell - unmatched_code = unmatched_lines(stub_code, solu_code) - unmatched_comments = unmatched_lines( - stub_comments, solu_code + solu_comments - ) - unmatched[nb_name].append((unmatched_code, unmatched_comments)) - if unmatched_code or unmatched_comments: - failure = True - - # Report the results for this noteobokk - for nb_name, nb_unmatched in unmatched.items(): - print() - print("---" + nb_name + "-" * (69 - 5 - len(nb_name))) - for exercise, (code, comments) in enumerate(nb_unmatched, 1): - report(exercise, code, comments) - - # Print overall summary and exit with return code - message = "Failure" if failure else "Success" - print("\n" + "=" * 30, 
message, "=" * 30) - sys.exit(failure) - - -def report(exercise, code, comment, thresh=50): - """Print information about unmatched code and comments in an exercise.""" - code_status = "FAIL" if code else "PASS" - comment_status = "FAIL" if comment else "PASS" - print( - f"Exercise {exercise} | Code {code_status} | Comments {comment_status}" - ) - - for kind, unmatched in zip(["Code", "Comment"], [code, comment]): - for (score, stub, solu) in unmatched: - if score < thresh: - print(f" {kind} without close match:") - print(f" * {stub}") - else: - print(f" {kind} with close mismatch ({score}%)") - print(f" + {stub}") - print(f" - {solu}") - - -def logical_lines(func_str): - """Extract code and block comments from cell string.""" - # Standardize docstring string format - func_str = func_str.replace("'''", '"""') - - # Define a regular expression to remove comments - pattern = re.compile(r"^([^#]*)\s*#* {0,1}(.*?)\s*$") - - code_lines = [] - comment_lines = [] - - making_xkcd_plot = False - reading_block_comment = False - - for line in func_str.split("\n"): - - # Detect and ignore lines within multi-line comments - # - triple quotes (docstrings) - # - comment hashmark fences - comment_block_fence = dedent(line).startswith('"""') or "###" in line - if reading_block_comment: - if comment_block_fence: - reading_block_comment = False - continue - else: - - # Detect and ignore single-line docstrings - text = line.strip() - single_line_docstring = ( - text.startswith('"""') - and text.endswith('"""') - and len(text) > 3 - ) - if single_line_docstring: - continue - - # Otherwise, assume we are starting a comment block - if comment_block_fence: - reading_block_comment = True - continue - - match = pattern.match(line) - if match: - - # Split the line on the first comment hash encountered - code_line = match.group(1) - comment_line = match.group(2) - - # If there is code before the comment, assume comment is inline - # use entire line (allows inline comments in commented-out 
code) - if dedent(code_line): - code_line = match.group(0) - - # Handle xkcd context, which is always last thing in solution cell - if "plt.xkcd()" in code_line: - making_xkcd_plot = True - continue - if making_xkcd_plot: - code_line = code_line[2:] - - # Check for reasons to ignore the line, otherwise keep it - - if not skip_code(code_line): - code_lines.append(code_line) - - if not dedent(code_line) and not skip_comment(comment_line): - comment_lines.append(comment_line) - - return code_lines, comment_lines - - -def unmatched_lines(stub_lines, solu_lines): - """Identify lines in the exercise stub without a match in the solution.""" - unmatched = [] - - for stub_line in stub_lines: - - # When we don't match, we want to track lines that are close - best_score = 0 - best_line = "" - - for line in solu_lines: - - # Match whole lines or parts of lines that need completion - if "..." in stub_line: - part_scores = [] - for part in stub_line.split("..."): - if not part: - continue - part_scores.append(fuzz.partial_ratio(part, line)) - score = min(part_scores) - else: - score = fuzz.ratio(stub_line, line) - - if score > best_score: - best_score = score - best_line = line - - # Track all lines that are not perfect matches - if best_score < 100: - unmatched.append((best_score, stub_line, best_line)) - - return unmatched - - -def skip_code(line): - """Return True if a code line should be skipped based on contents.""" - line = dedent(line) - return not line or "NotImplementedError" in line - - -def skip_comment(line): - """Return True if a comment line should be skipped based on contents.""" - line = dedent(line) - return not line or "to_remove" in line or "uncomment" in line.lower() - - -def has_solution(cell): - """Return True if cell is marked as containing an exercise solution.""" - cell_text = cell["source"].replace(" ", "").lower() - first_line = cell_text.split("\n")[0] - return ( - cell_text.startswith("#@titlesolution") - or "to_remove" in first_line - and 
"explanation" not in first_line - ) - - -def parse_args(arglist): - """Handle the command-line arguments.""" - parser = argparse.ArgumentParser( - description="Process neuromatch tutorial notebooks", - ) - parser.add_argument( - "files", - nargs="+", - help="File name(s) to process. Will filter for .ipynb extension." - ) - parser.add_argument( - "--commit-message", - default="", - help="Will exit cleanly if message contains 'skip verify'", - ) - return parser.parse_args(arglist) - - if __name__ == "__main__": - main(sys.argv[1:]) diff --git a/src/nmaci/__init__.py b/src/nmaci/__init__.py new file mode 100644 index 0000000..3dc1f76 --- /dev/null +++ b/src/nmaci/__init__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/src/nmaci/__main__.py b/src/nmaci/__main__.py new file mode 100644 index 0000000..3b1fa24 --- /dev/null +++ b/src/nmaci/__main__.py @@ -0,0 +1,2 @@ +from nmaci.cli import main +main() diff --git a/src/nmaci/chatify/__init__.py b/src/nmaci/chatify/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/nmaci/chatify/install_and_load_chatify.py b/src/nmaci/chatify/install_and_load_chatify.py new file mode 100644 index 0000000..12312d4 --- /dev/null +++ b/src/nmaci/chatify/install_and_load_chatify.py @@ -0,0 +1,2 @@ +smuggle chatify # pip: git+https://github.com/ContextLab/chatify.git +%load_ext chatify \ No newline at end of file diff --git a/src/nmaci/chatify/install_davos.py b/src/nmaci/chatify/install_davos.py new file mode 100644 index 0000000..c3a6488 --- /dev/null +++ b/src/nmaci/chatify/install_davos.py @@ -0,0 +1,3 @@ +%pip install -q davos +import davos +davos.config.suppress_stdout = True \ No newline at end of file diff --git a/src/nmaci/chatify/process_notebooks.py b/src/nmaci/chatify/process_notebooks.py new file mode 100644 index 0000000..cf70168 --- /dev/null +++ b/src/nmaci/chatify/process_notebooks.py @@ -0,0 +1,190 @@ +import os +import yaml +from glob import glob as lsdir + +import nbformat as nbf +from chatify 
import Chatify +from tqdm import tqdm + +import numpy as np +import pickle + +from langchain.prompts import PromptTemplate +from gptcache import Cache +from gptcache.processor.pre import get_prompt +from gptcache.manager import get_data_manager +from gptcache.similarity_evaluation.exact_match import ExactMatchEvaluation + +source_repo = os.environ.get("SOURCE_REPO", "NeuroAI_Course") +mod_repo = os.environ.get("MOD_REPO", "chatify_NeuroAI_Course") +CACHE = False + + +def get_tutorial_notebooks(basedir): + return lsdir(os.path.join(basedir, 'tutorials', '*', 'student', '*Tutorial*.ipynb')) + + +def chatified(fname): + notebook = nbf.read(fname, nbf.NO_CONVERT) + header_cell = notebook['cells'][0] + return mod_repo in header_cell['source'] + + +def get_text(fname): + with open(os.path.join(os.getcwd(), 'ci', 'chatify', fname), 'r') as f: + return ''.join(f.readlines()) + + +def inject_chatify(fname): + notebook = nbf.read(fname, nbf.NO_CONVERT) + new_notebook = notebook.copy() + + # update header cell + header_cell = new_notebook['cells'][0] + header_cell['source'] = header_cell['source'].replace(source_repo, mod_repo) + + # insert background cell + background_cell = nbf.v4.new_markdown_cell(source=get_text('background.md'), metadata={'execution': {}}) + del background_cell['id'] + + # create davos cell + davos_cell = nbf.v4.new_code_cell(source=get_text('install_davos.py'), metadata={'cellView': 'form', 'execution': {}}) + del davos_cell['id'] + + # create chatify cell + chatify_cell = nbf.v4.new_code_cell(source=get_text('install_and_load_chatify.py'), metadata={'cellView': 'form', 'execution': {}}) + del chatify_cell['id'] + + idx = 0 + for cell in new_notebook['cells']: + idx += 1 + if cell['cell_type'] == 'markdown': + if '# Setup' in cell['source']: + break + + if idx == len(new_notebook['cells']) - 1: + return + + try: + if chatified(fname): + new_notebook.cells[0] = header_cell + new_notebook.cells[idx] = background_cell + new_notebook.cells[idx + 1] = 
davos_cell + new_notebook.cells[idx + 2] = chatify_cell + else: + new_notebook.cells.insert(idx, background_cell) + new_notebook.cells.insert(idx + 1, davos_cell) + new_notebook.cells.insert(idx + 2, chatify_cell) + except IndexError: + raise ValueError(f"Notebook Missing Setup Header: {fname}, index: {idx}") + + # Write the file + nbf.write( + new_notebook, + fname, + version=nbf.NO_CONVERT, + ) + + +def compress_code(text): + return '\n'.join([line.strip() for line in text.split('\n') if len(line.strip()) > 0]) + + +def get_code_cells(fname): + notebook = nbf.read(fname, nbf.NO_CONVERT) + return [compress_code(cell['source']) for cell in notebook['cells'] if cell['cell_type'] == 'code'] + + +def convert_pickle_file_to_cache(pickle_file, config): + cache_db_version = config['cache_config']['cache_db_version'] + file_name = f'NMA_2023_v{cache_db_version}.cache' + + # Remove file before creating a new one + if os.path.exists(file_name): + os.remove(file_name) + + llm_cache = Cache() + llm_cache.set_openai_key() + data_manager = get_data_manager(data_path=file_name) + + llm_cache.init( + pre_embedding_func=get_prompt, + data_manager=data_manager, + similarity_evaluation=ExactMatchEvaluation(), + ) + + chatify = Chatify() + prompts = chatify._read_prompt_dir()['tutor'] + + with open(pickle_file, 'rb') as f: + cache = pickle.load(f) + + for key, value in cache.items(): + for prompt_name, prompt in prompts.items(): + prompt = PromptTemplate( + template=prompt['content'], + input_variables=prompt['input_variables'], + ) + question = prompt.format(text=compress_code(key)) + try: + answer = value[prompt_name] + data_manager.save(question, answer, embedding_data=question) + except KeyError: + pass + + +tutorials = get_tutorial_notebooks(os.getcwd()) +tutor = Chatify() +prompts = tutor._read_prompt_dir()['tutor'] +code_cells = [] +failed_queries = [] + +for notebook in tqdm(tutorials): + inject_chatify(notebook) + code_cells.extend(get_code_cells(notebook)) + + +if CACHE: + 
savefile = os.path.join(os.getcwd(), 'chatify', 'cache.pkl') + failed_queries_file = os.path.join(os.getcwd(), 'chatify', 'failed_queries.pkl') + + if os.path.exists(savefile): + with open(savefile, 'rb') as f: + cache = pickle.load(f) + else: + cache = {} + + failed_queries = [] + + tmpfile = os.path.join(os.getcwd(), 'chatify', 'tmp.pkl') + for cell in tqdm(np.unique(code_cells)): + if cell not in cache: + cache[cell] = {} + + for name, content in prompts.items(): + if name not in cache[cell] or len(cache[cell][name]) == 0: + try: + cache[cell][name] = tutor._cache(cell, content) + + with open(tmpfile, 'wb') as f: + pickle.dump(cache, f) + + if cache[cell][name] is None or len(cache[cell][name]) == 0: + failed_queries.append((cell, name, 'null response')) + print('Response failed for cell (null response):\n', cell) + except: + failed_queries.append((cell, name, 'exception raised')) + print('Response failed for cell (exception raised):\n', cell) + + with open(savefile, 'wb') as f: + pickle.dump(cache, f) + + with open(failed_queries_file, 'wb') as f: + pickle.dump(failed_queries, f) + + if os.path.exists(tmpfile): + os.remove(tmpfile) + + # build cache + config = yaml.load(open('config.yaml', 'r'), Loader=yaml.SafeLoader) + convert_pickle_file_to_cache(savefile, config) diff --git a/src/nmaci/cli.py b/src/nmaci/cli.py new file mode 100644 index 0000000..c72f7e1 --- /dev/null +++ b/src/nmaci/cli.py @@ -0,0 +1,28 @@ +"""Single entry point for the nmaci CLI.""" +import sys + + +COMMANDS = { + "process-notebooks": "nmaci.process_notebooks", + "verify-exercises": "nmaci.verify_exercises", + "lint-tutorial": "nmaci.lint_tutorial", + "make-pr-comment": "nmaci.make_pr_comment", + "extract-links": "nmaci.extract_links", + "generate-readmes": "nmaci.generate_tutorial_readmes", + "generate-book": "nmaci.generate_book", + "generate-book-dl": "nmaci.generate_book_dl", + "generate-book-precourse": "nmaci.generate_book_precourse", + "select-notebooks": "nmaci.select_notebooks", 
+ "find-unreferenced": "nmaci.find_unreferenced_content", + "parse-html": "nmaci.parse_html_for_errors", +} + + +def main(): + if len(sys.argv) < 2 or sys.argv[1] not in COMMANDS: + print("Usage: nmaci [args]") + print("Commands:", ", ".join(COMMANDS)) + sys.exit(1) + import importlib + module = importlib.import_module(COMMANDS[sys.argv[1]]) + module.main(sys.argv[2:]) diff --git a/src/nmaci/extract_links.py b/src/nmaci/extract_links.py new file mode 100644 index 0000000..e56a40e --- /dev/null +++ b/src/nmaci/extract_links.py @@ -0,0 +1,138 @@ +""" +Neuromatch Academy + +Extract slide and video links from notebooks +""" +import argparse +import ast +import collections +import json +import os +from urllib.request import urlopen, Request +from urllib.error import HTTPError +import sys + +import nbformat + + +def bilibili_url(video_id): + return f"https://www.bilibili.com/video/{video_id}" + + +def youtube_url(video_id): + return f"https://youtube.com/watch?v={video_id}" + + +def osf_url(link_id): + return f"https://osf.io/download/{link_id}" + +def tutorial_order(fname): + fname = os.path.basename(fname) + try: + first, last = fname.split("_") + except ValueError: + return (99, 99, fname) + if first.startswith("Bonus"): + week, day = 9, 9 + else: + try: + week, day = int(first[1]), int(first[3]) + except ValueError: + week, day = 9, 9 + if last.startswith("Intro"): + order = 0 + elif last.startswith("Tutorial"): + order = int(last[8]) + elif last.startswith("Outro"): + order = 10 + elif last.startswith("DaySummary"): + order = 20 + else: + order = 30 + return (week, day, order) + +def main(arglist): + """Process IPython notebooks from a list of files.""" + args = parse_args(arglist) + + nb_paths = [arg for arg in args.files + if arg.endswith(".ipynb") and + 'student' not in arg and + 'instructor' not in arg] + if not nb_paths: + print("No notebook files found") + sys.exit(0) + + videos = collections.defaultdict(list) + slides = collections.defaultdict(list) + + for 
nb_path in sorted(nb_paths, key=tutorial_order): + # Load the notebook structure + with open(nb_path) as f: + nb = nbformat.read(f, nbformat.NO_CONVERT) + + # Extract components of the notebook path + nb_dir, nb_fname = os.path.split(nb_path) + nb_name, _ = os.path.splitext(nb_fname) + + # Loop through the cells and find video and slide ids + for cell in nb.get("cells", []): + for line in cell.get("source", "").split("\n"): + l = line.strip() + if l.startswith("video_ids = "): + rhs = l.split("=")[1].strip() + video_dict = dict(ast.literal_eval(rhs)) + try: + if args.noyoutube: + url = bilibili_url(video_dict["Bilibili"]) + else: + url = youtube_url(video_dict["Youtube"]) + except KeyError: + print(f"Malformed video id in {nb_name}? '{rhs}'") + continue + if url not in videos[nb_name]: + videos[nb_name].append(url) + elif l.startswith("link_id = "): + rhs = l.split("=")[1].strip() + url = osf_url(ast.literal_eval(rhs)) + # Slides are sometimes used in multiple notebooks, so we + # just store the filename and the link + if url not in slides: + api_request = f"https://api.osf.io/v2/files/{ast.literal_eval(rhs)}/" + httprequest = Request(api_request, + headers={"Accept": "application/json"}) + try: + with urlopen(httprequest) as response: + data = json.load(response) + filename = data["data"]["attributes"]["name"] + except HTTPError as e: + sys.stderr.write(str(e) + "\n") + sys.stderr.write(f"Skipping slide {url}\n") + continue + if 'DaySummary' in nb_name: + filename = os.path.splitext(filename.replace("_", ""))[0] + '_DaySummary.pdf' + slides[url] = filename + + print(json.dumps({"videos": videos, "slides": slides}, indent=4)) + + +def parse_args(arglist): + """Handle the command-line arguments.""" + parser = argparse.ArgumentParser( + description="Process neuromatch tutorial notebooks" + ) + parser.add_argument( + "--noyoutube", + action="store_true", + help="Extract Bilibili links instead of youtube", + ) + parser.add_argument( + "files", + nargs="+", + help="File 
name(s) to process. Will filter for .ipynb extension.", + ) + return parser.parse_args(arglist) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/src/nmaci/find_unreferenced_content.py b/src/nmaci/find_unreferenced_content.py new file mode 100644 index 0000000..c6b0293 --- /dev/null +++ b/src/nmaci/find_unreferenced_content.py @@ -0,0 +1,28 @@ +"""Print names of derivative files that are no longer used in the notebooks.""" +from glob import glob + +def main(arglist=None): + # arglist is ignored — operates on cwd + day_paths = glob("tutorials/W?D?_*") + for day_path in sorted(day_paths): + + # Read all of the text for this day's student notebooks into one string + student_notebooks = glob(f"{day_path}/student/*.ipynb") + notebook_text = "" + for nb_path in student_notebooks: + with open(nb_path) as f: + notebook_text += f.read() + + # Find solution images and scripts + solution_pattern = "W?D?_*_Solution*" + static_paths = glob(f"{day_path}/static/{solution_pattern}") + script_paths = glob(f"{day_path}/solutions/{solution_pattern}") + + # Print paths that are not referenced in the notebooks + for path in sorted(static_paths + script_paths): + if path not in notebook_text: + print(path) + + +if __name__ == "__main__": + main() diff --git a/src/nmaci/generate_book.py b/src/nmaci/generate_book.py new file mode 100644 index 0000000..586513a --- /dev/null +++ b/src/nmaci/generate_book.py @@ -0,0 +1,343 @@ +import argparse +import os +import sys +import yaml +from jinja2 import Template +import traceback +import json +from bs4 import BeautifulSoup + +ORG = os.environ.get("ORG", "neuromatch") +REPO = os.environ.get("NMA_REPO", "course-content-template") +PREREQ_REPOR = os.environ.get("PREREQ_REPO", "precourse") +PREREQ_INTRO = os.environ.get("PREREQ_INTRO", "ComputationalNeuroscience") + + +def main(arglist=None): + parser = argparse.ArgumentParser() + parser.add_argument("book_type", choices=["student", "instructor"]) + args = parser.parse_args(arglist) + 
ARG = args.book_type + with open('tutorials/materials.yml') as fh: + materials = yaml.load(fh, Loader=yaml.FullLoader) + + # Make the dictionary that contains the chapters + toc = {} + for m in materials: + if m['category'] not in toc.keys(): + toc[m['category']] = {'part': m['category'], 'chapters': []} + # Add the project booklet + toc['Project Booklet'] = {'part': 'Project Booklet', 'chapters': []} + + art_file_list = os.listdir('tutorials/Art/') + + for m in materials: + directory = f"{m['day']}_{''.join(m['name'].split())}" + + # Make temporary chapter title file + with open(f"tutorials/{directory}/chapter_title.md", + "w+") as title_file: + title_page = f"# {m['name']}" + art_file = [fname for fname in art_file_list if m['day'] in fname] + if len(art_file) == 1: + artist = art_file[0].split('-')[1].split('.')[0] + artist = artist.replace('_', ' ') + title_page += f"\n\n ````{{div}} full-width \n art relevant to chapter contents \n```` \n\n*Artwork by {artist}*" + title_file.write(title_page) + + chapter = {'file': f"tutorials/{directory}/chapter_title.md", + 'title': f"{m['name']} ({m['day']})", + 'sections': []} + print(m['day']) + part = m['category'] + directory = f"tutorials/{m['day']}_{''.join(m['name'].split())}" + + # Make list of notebook sections + notebook_list = [] + notebook_list += [f"{directory}/{ARG}/{m['day']}_Intro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Intro.ipynb") else [] + notebook_list += [f"{directory}/{ARG}/{m['day']}_Tutorial{i + 1}.ipynb" for i in range(m['tutorials'])] + notebook_list += [f"{directory}/{ARG}/{m['day']}_Outro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Outro.ipynb") else [] + + # Add and process all notebooks + for notebook_file_path in notebook_list: + chapter['sections'].append({'file': notebook_file_path}) + pre_process_notebook(notebook_file_path) + + # Add further reading page + chapter['sections'].append({'file': f"{directory}/further_reading.md"}) + + # Add day summary page + 
notebook_file_path = f"{directory}/{ARG}/{m['day']}_DaySummary.ipynb" + if os.path.exists(notebook_file_path): + chapter['sections'].append({'file': notebook_file_path}) + pre_process_notebook(notebook_file_path) + + # Add chapter + toc[part]['chapters'].append(chapter) + + # Project chapter -- based on the repo + # TODO: get this from the project_materials.yml + # with open('projects/project_materials.yml') as fh: + # project_materials = yaml.load(fh, Loader=yaml.FullLoader) + # print(project_materials) +# +# part = 'Project Booklet' +# toc[part]['chapters'].append({'file': 'projects/README.md', 'title': 'Introduction'}) +# toc[part]['chapters'].append({'file': 'projects/docs/project_guidance.md'}) + + +# # Add the project booklet + toc["Project Booklet"] = {"part": "Project Booklet", "chapters": []} + toc["Professional Development"] = { + "part": "Professional Development", "chapters": []} + + with open("projects/professional_development/prof_dev_materials.yml") as fh: + prof_dev_materials = yaml.load(fh, Loader=yaml.FullLoader) + + part = "Professional Development" + toc[part]['chapters'] = prof_dev_materials + + # Project chapter -- based on the repo + with open("projects/project_materials.yml") as fh: + project_materials = yaml.load(fh, Loader=yaml.FullLoader) + + part = "Project Booklet" + toc[part]['chapters'] = project_materials + + # Process Project Notebooks + for m in project_materials: + if m["title"] == "Project materials": + for project in m["sections"]: + pre_process_notebook(project["file"]) + + # Loop over dataset types + # project_datasets = {"file": "projects/docs/datasets_overview.md", "sections": []} + + + + # toc[part]['chapters'].append({'file': 'projects/docs/past_projects_overview.md'}) + # toc[part]['chapters'].append({'file': 'projects/docs/datasets_overview.md'}) + # toc[part]['chapters'].append({'file': 'projects/docs/continuing_your_project_after_the_course.md'}) +# +# # Add Modeling Steps +# toc[part]['chapters'].append({'file': 
'projects/modelingsteps/intro.md', +# 'sections': [{'file': 'projects/modelingsteps/ModelingSteps_1through4.ipynb'}, +# {'file': 'projects/modelingsteps/ModelingSteps_5through10.ipynb'}, +# {'file': 'projects/modelingsteps/TrainIllusionModel.ipynb'}, +# {'file': 'projects/modelingsteps/TrainIllusionDataProject.ipynb'} +# ]}) +# pre_process_notebook('projects/modelingsteps/ModelingSteps_1through4.ipynb') +# pre_process_notebook('projects/modelingsteps/ModelingSteps_5through10.ipynb') +# pre_process_notebook('projects/modelingsteps/TrainIllusionModel.ipynb') +# pre_process_notebook('projects/modelingsteps/TrainIllusionDataProject.ipynb') +# + # Loop over dataset types + # project_datasets = {'file': 'projects/docs/datasets_overview.md', 'sections': []} + +# for category in project_materials[0]['categories']: +# # this_section = {'file': f'projects/docs/{category}.md', 'sections': []} + +# # Add README guide +# this_section['sections'].append({'file': f"projects/{category}/README.md", 'title': 'Guide'}) + +# # Add and process all notebooks +# try: +# this_section['sections'].append({'file': f"projects/{category}/{category}_videos.ipynb"}) +# pre_process_notebook(f"projects/{category}/{category}_videos.ipynb") +# except: +# pass +# ## dataset_loaders = [entry for entry in project_materials if entry['category'] == category] +# ## for notebook in dataset_loaders: +# ## this_section['sections'].append({'file': notebook['link'], 'title': notebook['title']}) +# ## pre_process_notebook(notebook['link']) +# project_datasets['sections'].append(this_section) +# toc[part]['chapters'].append(project_datasets) +# toc[part]['chapters'].append({'file': 'projects/docs/project_templates.md'}) + +# # Past Projects +# p_sections = [] +# year = project_materials[1]['year'] +# for past_categories in project_materials[1]['past_categories']: +# p_sections.append( +# {'file': f'projects/docs/projects_{year}/{past_categories}.md'} +# ) +# toc[part]['chapters'].append( +# { +# 'file': 
f'projects/docs/project_{year}_highlights.md', +# 'sections': p_sections +# } +# ) + # toc[part]['chapters'].append({'file': 'projects/docs/project_2020_highlights.md', + # 'sections': [{'file': 'projects/docs/projects_2020/neurons.md'}, + # {'file': 'projects/docs/projects_2020/theory.md'}, + # {'file': 'projects/docs/projects_2020/behavior.md'}, + # {'file': 'projects/docs/projects_2020/fMRI.md'}, + # {'file': 'projects/docs/projects_2020/eeg.md'} + # ]}) + + # TODO: Fix TOC for new format + # Turn toc into list + toc_list = [{'file': f"tutorials/intro.ipynb"}] + if os.path.exists(f"tutorials/intro.ipynb"): + pre_process_notebook(f"tutorials/intro.ipynb") + + # TODO: fix this for the generic case + # TA training file + if ARG == "instructor" and ("climate" not in REPO and "neuroai" not in REPO): + chapter = {'chapters': [{'file': 'tatraining/TA_Training_CN.ipynb'}]} + pre_process_notebook('tatraining/TA_Training_CN.ipynb') + toc_list += [chapter] + # Schedule chapter + chapter = {'chapters': [{'file': 'tutorials/Schedule/schedule_intro.md', + 'sections': [{'file': 'tutorials/Schedule/daily_schedules.md'}, + {'file': 'tutorials/Schedule/shared_calendars.md'}, + {'file': 'tutorials/Schedule/timezone_widget.md'} + ]}]} + toc_list += [chapter] + + # Technical help chapter + chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/tech_intro.md', + 'sections': [{'file': 'tutorials/TechnicalHelp/Jupyterbook.md', + 'sections': [{'file': 'tutorials/TechnicalHelp/Tutorial_colab.md'}, + {'file': 'tutorials/TechnicalHelp/Tutorial_kaggle.md'} + ] + }, + {'file': 'tutorials/TechnicalHelp/Discord.md'} + ]}]} + toc_list += [chapter] + + # Links and Policy file + chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/Links_Policy.md'}]} + toc_list += [chapter] + + # Pre-reqs file + if "climate" in REPO: + chapter = {'chapters': [{'file': f'tutorials/prereqs/{PREREQ_INTRO}.md'}]} + else: + chapter = {'chapters': [{'file': f'prereqs/{PREREQ_INTRO}.md'}]} + toc_list += 
[chapter] + + for key in toc.keys(): + + # Add wrap-up if it exists + wrapup_name = f'tutorials/Module_WrapUps/{key.replace(" ", "")}.ipynb' + if os.path.exists(wrapup_name): + toc[key]['chapters'].append({'file': wrapup_name}) + + toc_list.append(toc[key]) + + with open('book/_toc.yml', 'w') as fh: + yaml.dump(toc_list, fh) + + +def pre_process_notebook(file_path): + + with open(file_path, encoding="utf-8") as read_notebook: + content = json.load(read_notebook) + pre_processed_content = open_in_colab_new_tab(content) + pre_processed_content = change_video_widths(pre_processed_content) + pre_processed_content = link_hidden_cells(pre_processed_content) + with open(file_path, "w", encoding="utf-8") as write_notebook: + json.dump(pre_processed_content, write_notebook, indent=1, ensure_ascii=False) + + +def open_in_colab_new_tab(content): + cells = content['cells'] + parsed_html = BeautifulSoup(cells[0]['source'][0], "html.parser") + for anchor in parsed_html.findAll('a'): + # Open in new tab + anchor['target'] = '_blank' + cells[0]['source'][0] = str(parsed_html) + return content + +def link_hidden_cells(content): + cells = content['cells'] + updated_cells = cells.copy() + + i_updated_cell = 0 + for i_cell, cell in enumerate(cells): + updated_cell = updated_cells[i_updated_cell] + if "source" not in cell: + continue + source = cell['source'][0] + + if source.startswith("#") and cell['cell_type'] == 'markdown': + header_level = source.count('#') + elif source.startswith("---") and cell['cell_type'] == 'markdown': + if len(cell['source']) > 1 and cell['source'][1].startswith("#") and cell['cell_type'] == 'markdown': + header_level = cell['source'][1].count('#') + + if '@title' in source or '@markdown' in source: + if 'metadata' not in cell: + updated_cell['metadata'] = {} + if 'tags' not in cell['metadata']: + updated_cell['metadata']['tags'] = [] + + # Check if cell is video one + if 'YouTubeVideo' in ''.join(cell['source']) or 'IFrame' in ''.join(cell['source']): + if 
"remove-input" not in cell['metadata']['tags']: + updated_cell['metadata']['tags'].append("remove-input") + else: + if "hide-input" not in cell['metadata']['tags']: + updated_cell['metadata']['tags'].append("hide-input") + + # If header is lost, create one in markdown + if '@title' in source: + + if source.split('@title')[1] != '': + header_cell = { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': ['#'*(header_level + 1) + ' ' + source.split('@title')[1]]} + updated_cells.insert(i_updated_cell, header_cell) + i_updated_cell += 1 + + strings_with_markdown = [(i, string) for i, string in enumerate(cell['source']) if '@markdown' in string] + if len(strings_with_markdown) == 1: + i = strings_with_markdown[0][0] + if cell['source'][i].split('@markdown')[1] != '': + header_cell = { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': [cell['source'][i].split('@markdown')[1]]} + updated_cells.insert(i_updated_cell, header_cell) + i_updated_cell += 1 + + i_updated_cell += 1 + + content['cells'] = updated_cells + return content + +def change_video_widths(content): + + for i, cell in enumerate(content['cells']): + if 'YouTubeVideo' in ''.join(cell['source']): + + for ind in range(len(cell['source'])): + # Change sizes + cell['source'][ind] = cell['source'][ind].replace('854', '730') + cell['source'][ind] = cell['source'][ind].replace('480', '410') + + # Put slides in ipywidget so they don't overlap margin + if '# @title Tutorial slides\n' in cell['source'] or '# @title Slides\n' in cell['source'] or '## Slides' in content['cells'][i-1]['source']: + for line in cell['source']: + if line.startswith('link_id'): + slide_link = line.split('"')[1] + break + # Catch the case with missing link_id + else: + slide_link = "" + download_link = f"https://osf.io/download/{slide_link}/" + render_link = f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{slide_link}/?direct%26mode=render%26action=download%26mode=render" + cell['source'] = ['# @markdown\n', + 'from 
IPython.display import IFrame\n', + 'from ipywidgets import widgets\n', + 'out = widgets.Output()\n', + 'with out:\n', + f' print(f"If you want to download the slides: {download_link}")\n', + f' display(IFrame(src=f"{render_link}", width=730, height=410))\n', + 'display(out)'] + return content + +if __name__ == '__main__': + main() diff --git a/src/nmaci/generate_book_dl.py b/src/nmaci/generate_book_dl.py new file mode 100644 index 0000000..60d9995 --- /dev/null +++ b/src/nmaci/generate_book_dl.py @@ -0,0 +1,264 @@ +import argparse +import os +import sys +import yaml +from jinja2 import Template +import traceback +import json +from bs4 import BeautifulSoup + +REPO = os.environ.get("NMA_REPO", "course-content-dl") + + +def main(arglist=None): + parser = argparse.ArgumentParser() + parser.add_argument("book_type", choices=["student", "instructor"]) + args = parser.parse_args(arglist) + ARG = args.book_type + with open('tutorials/materials.yml') as fh: + materials = yaml.load(fh, Loader=yaml.FullLoader) + + # Make the dictionary that contains the chapters + toc = {} + for m in materials: + if m['category'] not in toc.keys(): + toc[m['category']] = {'part': m['category'], 'chapters': []} + # Add the project booklet + toc['Project Booklet'] = {'part': 'Project Booklet', 'chapters': []} + + art_file_list = os.listdir('tutorials/Art/') + + for m in materials: + directory = f"{m['day']}_{''.join(m['name'].split())}" + + # Make temporary chapter title file + with open(f"tutorials/{directory}/chapter_title.md", + "w+") as title_file: + title_page = f"# {m['name']}" + art_file = [fname for fname in art_file_list if m['day'] in fname] + if len(art_file) == 1: + artist = art_file[0].split('-')[1].split('.')[0] + artist = artist.replace('_', ' ') + title_page += f"\n\n ````{{div}} full-width \n art relevant to chapter contents \n```` \n\n*Artwork by {artist}*" + title_file.write(title_page) + + chapter = {'file': f"tutorials/{directory}/chapter_title.md", + 'title': 
f"{m['name']} ({m['day']})", + 'sections': []} + print(m['day']) + part = m['category'] + directory = f"tutorials/{m['day']}_{''.join(m['name'].split())}" + + # Make list of notebook sections + notebook_list = [] + notebook_list += [f"{directory}/{ARG}/{m['day']}_Tutorial{i + 1}.ipynb" for i in range(m['tutorials'])] + notebook_list += [f"{directory}/{ARG}/{m['day']}_BonusLecture.ipynb"] if os.path.exists(f"{directory}/{m['day']}_BonusLecture.ipynb") else [] + + # Add and process all notebooks + for notebook_file_path in notebook_list: + chapter['sections'].append({'file': notebook_file_path}) + pre_process_notebook(notebook_file_path) + + # Add further reading page + # chapter['sections'].append({'file': f"{directory}/further_reading.md"}) + + # Add chapter + toc[part]['chapters'].append(chapter) + + # Project chapter -- under construction + part = 'Project Booklet' + toc[part]['chapters'].append({'file': 'projects/README.md', 'title': 'Introduction'}) + toc[part]['chapters'].append({'file': 'projects/docs/project_guidance.md'}) + + with open('projects/project_materials.yml') as fh: + project_materials = yaml.load(fh, Loader=yaml.FullLoader) + + # Add modelling steps + category = 'modelingsteps' + this_section = {'file': f'projects/{category}/intro.md', 'sections': []} + for m in project_materials: + if m['category'] == category: + this_section['sections'].append({'file': f"projects/{category}/{m['link']}"}) + pre_process_notebook(f"projects/{category}/{m['link']}") + toc[part]['chapters'].append(this_section) + print(category) + + # Add project templates + project_datasets = {'file': 'projects/docs/projects_overview.md', 'sections': []} + # Loop over project categories + for category in ['ComputerVision', 'ReinforcementLearning', 'NaturalLanguageProcessing', 'Neuroscience']: + print(category) + # Add each category section + this_section = {'file': f'projects/{category}/README.md', + 'sections': [{'file': f'projects/{category}/slides.md'}, + {'file': 
f'projects/{category}/ideas_and_datasets.md'}]}
+        for m in project_materials:
+            if m['category'] == category:
+                # Add and process all notebooks
+                try:
+                    this_section['sections'].append({'file': f"projects/{category}/{m['link']}"})
+                    pre_process_notebook(f"projects/{category}/{m['link']}")
+                except Exception:
+                    pass
+        project_datasets['sections'].append(this_section)
+    toc[part]['chapters'].append(project_datasets)
+
+    # Add models and datasets
+    toc[part]['chapters'].append({'file': 'projects/docs/datasets_and_models.md'})
+    # Turn toc into list
+    toc_list = [{'file': 'tutorials/intro.ipynb'}]
+    if os.path.exists("tutorials/intro.ipynb"):
+        pre_process_notebook('tutorials/intro.ipynb')
+
+    # TA training file
+    if ARG == "instructor":
+        chapter = {'chapters': [{'file': 'tatraining/TA_Training_DL.ipynb'}]}
+        pre_process_notebook('tatraining/TA_Training_DL.ipynb')
+        toc_list += [chapter]
+
+    # Schedule chapter
+    chapter = {'chapters': [{'file': 'tutorials/Schedule/schedule_intro.md',
+                             'sections': [{'file': 'tutorials/Schedule/daily_schedules.md'},
+                                          {'file': 'tutorials/Schedule/shared_calendars.md'},
+                                          {'file': 'tutorials/Schedule/timezone_widget.md'}
+                                          ]}]}
+    toc_list += [chapter]
+
+    # Technical help chapter
+    chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/tech_intro.md',
+                             'sections': [{'file': 'tutorials/TechnicalHelp/Jupyterbook.md',
+                                           'sections': [{'file': 'tutorials/TechnicalHelp/Tutorial_colab.md'},
+                                                        {'file': 'tutorials/TechnicalHelp/Tutorial_kaggle.md'}
+                                                        ]
+                                           },
+                                          {'file': 'tutorials/TechnicalHelp/Discord.md'}
+                                          ]}]}
+    toc_list += [chapter]
+
+    # Links and Policy file
+    chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/Links_Policy.md'}]}
+    toc_list += [chapter]
+
+    # Pre-reqs file
+    chapter = {'chapters': [{'file': 'prereqs/DeepLearning.md'}]}
+    toc_list += [chapter]
+
+    for key in toc.keys():
+        # Add wrap-up if it exists
+        wrapup_name = f'tutorials/Module_WrapUps/{key.replace(" ", "")}.ipynb'
+        if os.path.exists(wrapup_name):
+            
toc[key]['chapters'].append({'file': wrapup_name}) + + toc_list.append(toc[key]) + + with open('book/_toc.yml', 'w') as fh: + yaml.dump(toc_list, fh) + + +def pre_process_notebook(file_path): + + with open(file_path, encoding="utf-8") as read_notebook: + content = json.load(read_notebook) + pre_processed_content = open_in_colab_new_tab(content) + pre_processed_content = change_video_widths(pre_processed_content) + pre_processed_content = link_hidden_cells(pre_processed_content) + with open(file_path, "w", encoding="utf-8") as write_notebook: + json.dump(pre_processed_content, write_notebook, indent=1, ensure_ascii=False) + + +def open_in_colab_new_tab(content): + cells = content['cells'] + parsed_html = BeautifulSoup(cells[0]['source'][0], "html.parser") + for anchor in parsed_html.findAll('a'): + # Open in new tab + anchor['target'] = '_blank' + cells[0]['source'][0] = str(parsed_html) + return content + +def link_hidden_cells(content): + cells = content['cells'] + updated_cells = cells.copy() + + i_updated_cell = 0 + for i_cell, cell in enumerate(cells): + updated_cell = updated_cells[i_updated_cell] + if "source" not in cell: + continue + source = cell['source'][0] + + if source.startswith("#") and cell['cell_type'] == 'markdown': + header_level = source.count('#') + elif source.startswith("---") and cell['cell_type'] == 'markdown': + if len(cell['source']) > 1 and cell['source'][1].startswith("#") and cell['cell_type'] == 'markdown': + header_level = cell['source'][1].count('#') + + if '@title' in source or '@markdown' in source: + if 'metadata' not in cell: + updated_cell['metadata'] = {} + if 'tags' not in cell['metadata']: + updated_cell['metadata']['tags'] = [] + + # Check if cell is video one + if 'YouTubeVideo' in ''.join(cell['source']) or 'IFrame' in ''.join(cell['source']): + if "remove-input" not in cell['metadata']['tags']: + updated_cell['metadata']['tags'].append("remove-input") + else: + if "hide-input" not in cell['metadata']['tags']: + 
updated_cell['metadata']['tags'].append("hide-input") + + # If header is lost, create one in markdown + if '@title' in source: + + if source.split('@title')[1] != '': + header_cell = { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': ['#'*(header_level + 1) + ' ' + source.split('@title')[1]]} + updated_cells.insert(i_updated_cell, header_cell) + i_updated_cell += 1 + + strings_with_markdown = [(i, string) for i, string in enumerate(cell['source']) if '@markdown' in string] + if len(strings_with_markdown) == 1: + i = strings_with_markdown[0][0] + if cell['source'][i].split('@markdown')[1] != '': + header_cell = { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': [cell['source'][i].split('@markdown')[1]]} + updated_cells.insert(i_updated_cell, header_cell) + i_updated_cell += 1 + + i_updated_cell += 1 + + content['cells'] = updated_cells + return content + +def change_video_widths(content): + + for i, cell in enumerate(content['cells']): + if 'YouTubeVideo' in ''.join(cell['source']): + + for ind in range(len(cell['source'])): + # Change sizes + cell['source'][ind] = cell['source'][ind].replace('854', '730') + cell['source'][ind] = cell['source'][ind].replace('480', '410') + + # Put slides in ipywidget so they don't overlap margin + if '# @title Tutorial slides\n' in cell['source'] or '# @title Slides\n' in cell['source'] or '## Slides' in content['cells'][i-1]['source']: + for line in cell['source']: + if line.startswith('link_id'): + slide_link = line.split('"')[1] + download_link = f"https://osf.io/download/{slide_link}/" + render_link = f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{slide_link}/?direct%26mode=render%26action=download%26mode=render" + cell['source'] = ['# @markdown\n', + 'from IPython.display import IFrame\n', + 'from ipywidgets import widgets\n', + 'out = widgets.Output()\n', + 'with out:\n', + f' print(f"If you want to download the slides: {download_link}")\n', + f' display(IFrame(src=f"{render_link}", width=730, 
height=410))\n', + 'display(out)'] + return content + +if __name__ == '__main__': + main() diff --git a/src/nmaci/generate_book_precourse.py b/src/nmaci/generate_book_precourse.py new file mode 100644 index 0000000..3bde81a --- /dev/null +++ b/src/nmaci/generate_book_precourse.py @@ -0,0 +1,197 @@ +import os + +import yaml +from jinja2 import Template +import traceback +import json +from bs4 import BeautifulSoup + +REPO = os.environ.get("NMA_REPO", "precourse") + +def main(arglist=None): + with open('tutorials/materials.yml') as fh: + materials = yaml.load(fh, Loader=yaml.FullLoader) + + # Make the dictionary that contains the chapters + toc = {} + for m in materials: + if m['category'] not in toc.keys(): + toc[m['category']] = {'part': m['category'], 'chapters': []} + + art_file_list = os.listdir('tutorials/Art/') + + art_file_list = os.listdir('tutorials/Art/') + + for m in materials: + directory = f"{m['day']}_{''.join(m['name'].split())}" + + # Make temporary chapter title file + with open(f"tutorials/{directory}/chapter_title.md", + "w+") as title_file: + title_page = f"# {m['name']}" + art_file = [fname for fname in art_file_list if m['day'] in fname] + if len(art_file) == 1: + artist = art_file[0].split('-')[1].split('.')[0] + artist = artist.replace('_', ' ') + title_page += f"\n\n ````{{div}} full-width \n art relevant to chapter contents \n```` \n\n*Artwork by {artist}*" + title_file.write(title_page) + + chapter = {'file': f"tutorials/{directory}/chapter_title.md", + 'title': f"{m['name']} ({m['day']})", + 'sections': []} + print(m['day']) + part = m['category'] + directory = f"tutorials/{m['day']}_{''.join(m['name'].split())}" + + # Make list of notebook sections + notebook_list = [] + notebook_list += [f"{directory}/{m['day']}_Intro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Intro.ipynb") else [] + notebook_list += [f"{directory}/student/{m['day']}_Tutorial{i + 1}.ipynb" for i in range(m['tutorials'])] + notebook_list += 
[f"{directory}/{m['day']}_Outro.ipynb"] if os.path.exists(f"{directory}/{m['day']}_Outro.ipynb") else [] + notebook_list += [f"{directory}/{m['day']}_DaySummary.ipynb"] if os.path.exists(f"{directory}/{m['day']}_DaySummary.ipynb") else [] + + # Add and process all notebooks + for notebook_file_path in notebook_list: + chapter['sections'].append({'file': notebook_file_path}) + pre_process_notebook(notebook_file_path) + + # Add chapter + toc[part]['chapters'].append(chapter) + + # Turn toc into list + toc_list = [{'file': 'tutorials/intro.ipynb'}] + if os.path.exists("tutorials/intro.ipynb"): + pre_process_notebook('tutorials/intro.ipynb') + + # Technical help chapter + chapter = {'chapters': [{'file': 'tutorials/TechnicalHelp/tech_intro.md', + 'sections': [{'file': 'tutorials/TechnicalHelp/Jupyterbook.md', + 'sections': [{'file': 'tutorials/TechnicalHelp/Tutorial_colab.md'}, + {'file': 'tutorials/TechnicalHelp/Tutorial_kaggle.md'} + ] + }, + {'file': 'tutorials/TechnicalHelp/Discord.md'} + ]}]} + toc_list += [chapter] + for key in toc.keys(): + + # Add wrap-up if it exists + wrapup_name = f'tutorials/Module_WrapUps/{key.replace(" ", "")}.ipynb' + if os.path.exists(wrapup_name): + toc[key]['chapters'].append({'file': wrapup_name}) + + toc_list.append(toc[key]) + + with open('book/_toc.yml', 'w') as fh: + yaml.dump(toc_list, fh) + + +def pre_process_notebook(file_path): + + with open(file_path, encoding="utf-8") as read_notebook: + content = json.load(read_notebook) + pre_processed_content = open_in_colab_new_tab(content) + pre_processed_content = change_video_widths(pre_processed_content) + pre_processed_content = link_hidden_cells(pre_processed_content) + with open(file_path, "w", encoding="utf-8") as write_notebook: + json.dump(pre_processed_content, write_notebook, indent=1, ensure_ascii=False) + + +def open_in_colab_new_tab(content): + cells = content['cells'] + parsed_html = BeautifulSoup(cells[0]['source'][0], "html.parser") + for anchor in 
parsed_html.findAll('a'): + # Open in new tab + anchor['target'] = '_blank' + cells[0]['source'][0] = str(parsed_html) + return content + + +def link_hidden_cells(content): + cells = content['cells'] + updated_cells = cells.copy() + + i_updated_cell = 0 + for i_cell, cell in enumerate(cells): + updated_cell = updated_cells[i_updated_cell] + if "source" not in cell: + continue + source = cell['source'][0] + + if source.startswith("#") and cell['cell_type'] == 'markdown': + header_level = source.count('#') + elif source.startswith("---") and cell['cell_type'] == 'markdown': + if len(cell['source']) > 1 and cell['source'][1].startswith("#") and cell['cell_type'] == 'markdown': + header_level = cell['source'][1].count('#') + + if '@title' in source or '@markdown' in source: + if 'metadata' not in cell: + updated_cell['metadata'] = {} + if 'tags' not in cell['metadata']: + updated_cell['metadata']['tags'] = [] + + # Check if cell is video one + if 'YouTubeVideo' in ''.join(cell['source']) or 'IFrame' in ''.join(cell['source']): + if "remove-input" not in cell['metadata']['tags']: + updated_cell['metadata']['tags'].append("remove-input") + else: + if "hide-input" not in cell['metadata']['tags']: + updated_cell['metadata']['tags'].append("hide-input") + + # If header is lost, create one in markdown + if '@title' in source: + + if source.split('@title')[1] != '': + header_cell = { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': ['#'*(header_level + 1) + ' ' + source.split('@title')[1]]} + updated_cells.insert(i_updated_cell, header_cell) + i_updated_cell += 1 + + strings_with_markdown = [(i, string) for i, string in enumerate(cell['source']) if '@markdown' in string] + if len(strings_with_markdown) == 1: + i = strings_with_markdown[0][0] + if cell['source'][i].split('@markdown')[1] != '': + header_cell = { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': [cell['source'][i].split('@markdown')[1]]} + updated_cells.insert(i_updated_cell, header_cell) + 
i_updated_cell += 1 + + i_updated_cell += 1 + + content['cells'] = updated_cells + return content + + +def change_video_widths(content): + + for i, cell in enumerate(content['cells']): + if 'YouTubeVideo' in ''.join(cell['source']): + + for ind in range(len(cell['source'])): + # Change sizes + cell['source'][ind] = cell['source'][ind].replace('854', '730') + cell['source'][ind] = cell['source'][ind].replace('480', '410') + + # Put slides in ipywidget so they don't overlap margin + if '# @title Tutorial slides\n' in cell['source'] or '# @title Slides\n' in cell['source'] or '## Slides' in content['cells'][i-1]['source']: + for line in cell['source']: + if line.startswith('link_id'): + slide_link = line.split('"')[1] + download_link = f"https://osf.io/download/{slide_link}/" + render_link = f"https://mfr.ca-1.osf.io/render?url=https://osf.io/{slide_link}/?direct%26mode=render%26action=download%26mode=render" + cell['source'] = ['# @markdown\n', + 'from IPython.display import IFrame\n', + 'from ipywidgets import widgets\n', + 'out = widgets.Output()\n', + 'with out:\n', + f' print(f"If you want to download the slides: {download_link}")\n', + f' display(IFrame(src=f"{render_link}", width=730, height=410))\n', + 'display(out)'] + return content + +if __name__ == '__main__': + main() diff --git a/src/nmaci/generate_tutorial_readmes.py b/src/nmaci/generate_tutorial_readmes.py new file mode 100644 index 0000000..4a1affc --- /dev/null +++ b/src/nmaci/generate_tutorial_readmes.py @@ -0,0 +1,261 @@ +"""Write a directory of tutorial notebooks to the README file. + +Run this script from the root of the github repository. 
+
+"""
+import os
+from glob import glob
+import yaml
+
+ORG = os.environ.get("ORG", "neuromatch")
+REPO = os.environ.get("NMA_REPO", "course-content")
+MAIN_BRANCH = os.environ.get("NMA_MAIN_BRANCH", "main")
+
+
+def main(arglist=None):
+
+    # Initialize the lines in tutorials/README.md
+    course_readme_text = [
+    ]
+
+    try:
+        playlist_urls = load_youtube_playlist_urls()
+    except Exception as err:
+        print("Encountered error while loading youtube playlist links")
+        print(err)
+        playlist_urls = {}
+
+    try:
+        slide_urls = load_slide_urls()
+    except Exception as err:
+        print("Encountered error while loading slide links")
+        print(err)
+        slide_urls = {}
+
+    day_anchors = {}
+
+    day_paths = sorted(glob("tutorials/W?D?_*"))
+    for day_path in day_paths:
+
+        day_name = os.path.split(day_path)[-1]
+        day_code, topic_code = day_name.split("_")
+
+        # Split the UpperCamelCase topic name into separate words
+        topic_words = []
+        for letter in topic_code:
+            if letter.isupper():
+                topic_words.append(letter)
+            else:
+                topic_words[-1] += letter
+        topic = " ".join(topic_words)
+
+        # Note: this will fail if we have 10+ notebooks
+        notebooks = sorted(glob(f"{day_path}/*.ipynb"))
+
+        if not notebooks:
+            continue
+
+        # Track the anchor to this section for embed in the header
+        anchor = "-".join([
+            day_code.lower(),
+            "-",
+            ("-".join(topic_words)).lower(),
+        ])
+
+        day_anchors[day_code] = "#" + anchor
+
+        instructor_notebooks = get_instructor_links(notebooks)
+        student_notebooks = get_student_links(notebooks)
+
+        # Write the day information into the course README
+        course_readme_text.extend([
+            f"## {day_code} - {topic}",
+            "",
+        ])
+
+        # Add a link to the YouTube lecture playlist, if we have one
+        youtube_url = playlist_urls.get(day_code, None)
+        if youtube_url is not None:
+            course_readme_text.extend([
+                f"[YouTube Playlist]({youtube_url})",
+                "",
+            ])
+
+        slide_links_by_topic = slide_urls.get(day_code, None)
+        if slide_links_by_topic is not None:
+            slide_links = [
+                f"[{topic}]({url})" 
for topic, url in slide_links_by_topic + ] + course_readme_text.extend([ + "", + "Slides: " + " | ".join(slide_links), + "", + ]) + + course_readme_text.extend(write_badge_table(student_notebooks)) + course_readme_text.append("\n") + + # Add further reading + further_reading_file = f"{day_path}/further_reading.md" + if os.path.exists(further_reading_file): + reading_url = f"https://github.com/{ORG}/{REPO}/blob/{MAIN_BRANCH}/{further_reading_file}" + course_readme_text.extend([f"[Further Reading]({reading_url})"]) + course_readme_text.append("\n") + + # Now make the day-specific README + # with links to both instructor and student versions + day_readme_text = [ + f"# {day_code} - {topic}", + "", + "## Instructor notebooks", + "", + ] + day_readme_text.extend(write_badge_table(instructor_notebooks)) + + day_readme_text.extend([ + "## Student notebooks", + "", + ]) + day_readme_text.extend(write_badge_table(student_notebooks)) + + # Write the day README file + with open(f"{day_path}/README.md", "w") as f: + f.write("\n".join(day_readme_text)) + + # Create relative anchor links to each day + nav_line = " | ".join([ + f"[{day_code}]({anchor})" for day_code, anchor in day_anchors.items() + ]) + + # Add an introductory header to the main README + course_readme_header = [ + "# Neuromatch Academy Tutorial Materials", + "", + "", + "", + nav_line, + "", + "*Warning:* The 'render with NBViewer' buttons may show outdated content.", + "", + ] + course_readme_text = course_readme_header + course_readme_text + + # Write the course README file + with open("tutorials/README.md", "w") as f: + f.write("\n".join(course_readme_text)) + + +def load_youtube_playlist_urls(): + """Create a mapping from day code to youtube link based on text file.""" + with open('tutorials/materials.yml') as fh: + materials = yaml.load(fh, Loader=yaml.FullLoader) + days = [m['day'] for m in materials] + playlists = [m['playlist'] for m in materials] + return dict(zip(days, playlists)) + + +def 
load_slide_urls(): + """Create a hierarchical mapping to slide PDF urls based on text file.""" + with open('tutorials/materials.yml') as fh: + materials = yaml.load(fh, Loader=yaml.FullLoader) + slide_links = {} + for ind, day_dict in enumerate(materials): + if 'slides' in day_dict: + slide_links[day_dict['day']] = [] + for slide_info in day_dict['slides']: + slide_links[day_dict['day']].append((slide_info['title'], slide_info['link'])) + return slide_links + + +def write_badge_table(notebooks): + """Make a markdown table with colab/nbviewer badge links.""" + + # Add the table header + table_text = [ + "| | Run | Run | View |", + "| - | --- | --- | ---- |", + ] + + # Get ordered list of file names + notebook_list = [name for name in notebooks if 'Intro' in name] + notebook_list += [name for name in notebooks if 'Tutorial' in name] + notebook_list += [name for name in notebooks if 'Outro' in name] + + # Add badges + for local_path in notebook_list: + # Extract type of file (intro vs outro vs tutorial) + notebook_name = local_path.split('_')[-1].split('.ipynb')[0] + + # Add space between Tutorial and number + if 'Tutorial' in notebook_name: + notebook_name = f"Tutorial {notebook_name.split('Tutorial')[1]}" + colab_badge = make_colab_badge(local_path) + kaggle_badge = make_kaggle_badge(local_path) + nbviewer_badge = make_nbviewer_badge(local_path) + table_text.append( + f"| {notebook_name} | {colab_badge} | {kaggle_badge} | {nbviewer_badge} |" + ) + table_text.append("\n") + + return table_text + + +def get_instructor_links(base_notebooks): + """Convert a list of base notebook paths to instructor versions.""" + instructor_notebooks = [] + for base_nb in base_notebooks: + if 'Tutorial' in base_nb: + day_path, nb_fname = os.path.split(base_nb) + instructor_notebooks.append(f"{day_path}/instructor/{nb_fname}") + else: + instructor_notebooks.append(base_nb) + return instructor_notebooks + + +def get_student_links(base_notebooks): + """Convert a list of base notebook paths 
to student versions.""" + student_notebooks = [] + for base_nb in base_notebooks: + if 'Tutorial' in base_nb: + day_path, nb_fname = os.path.split(base_nb) + student_notebooks.append(f"{day_path}/student/{nb_fname}") + else: + student_notebooks.append(base_nb) + return student_notebooks + + +def make_colab_badge(local_path): + """Generate a Google Colaboratory badge for a notebook on github.""" + alt_text = "Open In Colab" + badge_svg = "https://colab.research.google.com/assets/colab-badge.svg" + service = "https://colab.research.google.com" + url_base = f"{service}/github/{ORG}/{REPO}/blob/{MAIN_BRANCH}" + return make_badge(alt_text, badge_svg, service, local_path, url_base) + + +def make_kaggle_badge(local_path): + """Generate a kaggle badge for a notebook on github.""" + alt_text = "Open In kaggle" + badge_svg = "https://kaggle.com/static/images/open-in-kaggle.svg" + service = "https://kaggle.com/kernels/welcome?src=" + url_base = f"{service}https://raw.githubusercontent.com/{ORG}/{REPO}/{MAIN_BRANCH}" + return make_badge(alt_text, badge_svg, service, local_path, url_base) + + +def make_nbviewer_badge(local_path): + """Generate an NBViewer badge for a notebook on github.""" + alt_text = "View the notebook" + badge_svg = "https://img.shields.io/badge/render-nbviewer-orange.svg" + service = "https://nbviewer.jupyter.org" + url_base = f"{service}/github/{ORG}/{REPO}/blob/{MAIN_BRANCH}" + return make_badge(alt_text, badge_svg, service, f"{local_path}?flush_cache=true", url_base) + + +def make_badge(alt_text, badge_svg, service, local_path, url_base): + """Generate a markdown element for a badge image that links to a file.""" + return f"[![{alt_text}]({badge_svg})]({url_base}/{local_path})" + + +if __name__ == "__main__": + + main() diff --git a/src/nmaci/lint_tutorial.py b/src/nmaci/lint_tutorial.py new file mode 100644 index 0000000..570cbc8 --- /dev/null +++ b/src/nmaci/lint_tutorial.py @@ -0,0 +1,192 @@ +"""Lint tutorial notebooks with pyflakes and pycodestyle 
(aka flake8). + +Running this script on a notebook will print a report of issues flagged by +pyflakes (which checks some aspects of code quality) and pycodestyle (which +checks adherence to the PEP8 stylistic standards). + +Note that these checks do not capture all potential issues with a codebase, +and some checks will false-alarm because of deliberate choices we have made +about how to write tutorials. Nevertheless, this can be an easy way to flag +potential issues. + +Requires nbformat (part of Jupyter) and flake8. + +""" +import os +import io +import re +import sys +import argparse +import tempfile +import subprocess +import collections +import nbformat +from pyflakes.api import check +from pyflakes.reporter import Reporter + + +def main(arglist): + + args = parse_args(arglist) + + _, fname = os.path.split(args.path) + + script, cell_lines = extract_code(args.path) + warnings, errors = check_code(script) + violations = check_style(script) + + if args.brief: + report_brief(fname, warnings, errors, violations) + else: + line_map = remap_line_numbers(cell_lines) + report_verbose(fname, warnings, errors, violations, line_map) + + +def parse_args(arglist): + + parser = argparse.ArgumentParser(__doc__) + parser.add_argument("path", help="Path to notebook file") + parser.add_argument("--brief", action="store_true", + help="Print brief report (useful for aggregating)") + + return parser.parse_args(arglist) + + +def extract_code(nb_fname): + """Turn code cells from notebook into a script, track cell sizes.""" + with open(nb_fname) as f: + nb = nbformat.read(f, nbformat.NO_CONVERT) + + script_lines = [] + cell_lengths = [] + for cell in nb.get("cells", []): + if cell["cell_type"] == "code": + cell_lines = cell.get("source", "").split("\n") + cell_lengths.append(len(cell_lines)) + for line in cell_lines: + if line and line[0] in ["!", "%"]: # IPython syntax + line = "# " + line + script_lines.append(line) + + script = "\n".join(script_lines) + + return script, 
cell_lengths + + +def check_code(script): + """Run pyflakes checks over the script and capture warnings/errors.""" + errors = io.StringIO() + warnings = io.StringIO() + reporter = Reporter(warnings, errors) + check(script, "notebook", reporter) + + warnings.seek(0) + errors.seek(0) + + return warnings, errors + + +def check_style(script): + """Write a temporary script and run pycodestyle (PEP8) on it.""" + + with tempfile.NamedTemporaryFile("w", suffix=".py") as f: + + f.write(script) + + cmdline = [ + "pycodestyle", + "--ignore=E111,E114", + "--max-line-length=88", + f.name, + ] + res = subprocess.run(cmdline, capture_output=True) + + output = res.stdout.decode().replace(f.name, "f").split("\n") + + if not output: + return collections.Counter() + + error_classes = [] + pat = re.compile(r"^f:\d+:\d+: (\w\d{3}) (.*)$") + for line in output: + m = pat.match(line) + if m is not None: + error_classes.append(f"{m.group(1)} ({m.group(2)})") + + return collections.Counter(error_classes) + + +def remap_line_numbers(cell_lines): + """Create a mapping from script line number to notebook cell/line.""" + line_map = {} + cell_start = 0 + for cell, cell_length in enumerate(cell_lines, 1): + for line in range(1, cell_length + 1): + line_map[cell_start + line] = cell, line + cell_start += cell_length + return line_map + + +def report_brief(fname, warnings, errors, violations): + """Print a single-line report, suibtable for aggregation.""" + n_warnings = len(warnings.read().splitlines()) + n_errors = len(errors.read().splitlines()) + n_violations = len(list(violations.elements())) + print(f"{fname} {n_warnings + n_errors} {n_violations}") + + +def report_verbose(fname, warnings, errors, violations, line_map): + """Report every pyflakes problem and more codestyle information.""" + s = f"Code report for {fname}" + print("", s, "=" * len(s), sep="\n") + + s = "Quality (pyflakes)" + print("", s, "-" * len(s), "", sep="\n") + + warning_lines = reformat_line_problems(warnings, line_map) 
+ error_lines = reformat_line_problems(errors, line_map, "ERROR in ") + + issues = warning_lines + error_lines + print(f"Total code issues: {len(issues)}") + if issues: + print() + print("\n".join(warning_lines + error_lines)) + + s = "Style (pycodestyle)" + print("", s, "-" * len(s), "", sep="\n") + + n = len(list(violations.elements())) + print(f"Total PEP8 violations: {n}") + + # TODO parametrize n_most_common + if violations: + print() + print("Common problems:") + for code, count in violations.most_common(10): + plural = "" if count == 1 else "s" + print(f"- {count} instance{plural} of {code}") + + print("") + + +def reformat_line_problems(stream, line_map, prefix=""): + """Reformat a pyflakes output stream for notebook cells.""" + pat = re.compile(r"^\w*:(\d+):\d+ (.+)$") + + new_lines = [] + orig_lines = stream.read().splitlines() + + for line in orig_lines: + m = pat.match(line) + if m: + cell, line = line_map[int(m.group(1))] + new_lines.append( + f"{prefix}Cell {cell}, Line {line}: {m.group(2)}" + ) + + return new_lines + + +if __name__ == "__main__": + + main(sys.argv[1:]) diff --git a/src/nmaci/make_pr_comment.py b/src/nmaci/make_pr_comment.py new file mode 100644 index 0000000..49cbf67 --- /dev/null +++ b/src/nmaci/make_pr_comment.py @@ -0,0 +1,102 @@ +"""Write a comment to be added to a pull request on github: + +- Add Colab badges for the branch version of the notebooks +- Run the code linter over the notebooks and include the report + +""" +import os +import sys +import argparse +import subprocess + +ORG = os.environ.get("ORG", "neuromatch") +REPO = os.environ.get("NMA_REPO", "course-content") + +def main(arglist): + + args = parse_args(arglist) + + # Start with a table of badges for the branch versions of the notebooks + comment_lines = [ + make_colab_badge_table(args.branch, args.notebooks), + ] + + # Add a code report (under a details tag) for each notebook + for nb_fpath in args.notebooks: + _, nb_fname = os.path.split(nb_fpath) + nb_name, _ = 
os.path.splitext(nb_fname) + comment_lines.extend([ + "\n" + "
", + f"Code report for {nb_name}", + make_lint_report(nb_fpath), + "---", + "", + "
", + ]) + + # Dump to stdout or a file + comment = "\n".join(comment_lines) + if args.output is None: + print(comment, flush=True) + else: + with open(args.output, "w") as fid: + fid.write(comment) + + +def make_lint_report(nb_fpath): + """Run the tutorial linter on a notebook and capture the output.""" + cmdline = ["python", "ci/lint_tutorial.py", nb_fpath] + res = subprocess.run(cmdline, capture_output=True) + return res.stdout.decode() + + +def make_colab_badge_table(branch, notebooks): + """Add Colab badges for the branch version of each notebook.""" + header = [""] + divider = ["-"] + instructor = ["Instructor"] + student = ["Student"] + + for nb_fpath in notebooks: + nb_dir, nb_fname = os.path.split(nb_fpath) + nb_name, _ = os.path.splitext(nb_fname) + header.append(nb_name) + instructor.append(make_colab_badge(branch, nb_dir, nb_fname)) + if "tutorials" in nb_dir: + student.append(make_colab_badge(branch, nb_dir, nb_fname, student=True)) + divider.append("-") + + rows = header, divider, instructor, student + table = "\n".join( + ["|" + "|".join(row) + "|" for row in rows] + ) + return table + + +def make_colab_badge(branch, nb_dir, nb_fname, student=False): + """Generate a Google Colaboratory badge for a notebook on github.""" + alt_text = "Open In Colab" + badge_svg = "https://colab.research.google.com/assets/colab-badge.svg" + if student: + nb_dir = os.path.join(nb_dir, "student") + url = ( + "https://colab.research.google.com/" + f"github/{ORG}/{REPO}/blob/" + f"{branch}/{nb_dir}/{nb_fname}" + ) + return f"[![{alt_text}]({badge_svg})]({url})" + + +def parse_args(arglist): + + parser = argparse.ArgumentParser() + parser.add_argument("--branch", default=os.environ.get("NMA_MAIN_BRANCH", "main")) + parser.add_argument("--output") + parser.add_argument("notebooks", nargs="*") + return parser.parse_args(arglist) + + +if __name__ == "__main__": + + main(sys.argv[1:]) diff --git a/src/nmaci/parse_html_for_errors.py b/src/nmaci/parse_html_for_errors.py new file 
mode 100644 index 0000000..b66bcd1 --- /dev/null +++ b/src/nmaci/parse_html_for_errors.py @@ -0,0 +1,50 @@ +import argparse +import yaml +import sys +from bs4 import BeautifulSoup + + +def main(arglist=None): + parser = argparse.ArgumentParser() + parser.add_argument("book_type", choices=["student", "instructor"]) + args = parser.parse_args(arglist) + ARG = args.book_type + with open('tutorials/materials.yml') as fh: + materials = yaml.load(fh, Loader=yaml.FullLoader) + + html_directory = 'book/_build/html/' + + # Loop over days + for m in materials: + name = f"{m['day']}_{''.join(m['name'].split())}" + + # Loop over tutorials + for i in range(m['tutorials']): + + # Load html file + notebook_file_path = f"{html_directory}/tutorials/{name}/{ARG}/{m['day']}_Tutorial{i + 1}.html" + with open(notebook_file_path, 'r') as f: + contents = f.read() + parsed_html = BeautifulSoup(contents, features="html.parser") + + # Find code output divs + mydivs = parsed_html.find_all("div", {"class": "cell_output docutils container"}) + + # Remove div if it has an error + for div in mydivs: + if 'NotImplementedError' in str(div) or 'NameError' in str(div): + div.decompose() + + # Put solution figures in center (to fix layout issues) + for img in parsed_html.find_all('img', alt= True): + if img['alt'] == 'Solution hint': + img['align'] = 'center' + img['class'] = 'align-center' + + # save out html + with open(notebook_file_path, 'w') as f: + f.write(str(parsed_html)) + + +if __name__ == '__main__': + main() diff --git a/src/nmaci/process_notebooks.py b/src/nmaci/process_notebooks.py new file mode 100644 index 0000000..dee3fda --- /dev/null +++ b/src/nmaci/process_notebooks.py @@ -0,0 +1,620 @@ +"""Process tutorials for Neuromatch Academy + +- Filter input file list for .ipynb files +- Check that the cells have been executed sequentially on a fresh kernel +- Strip trailing whitespace from all code lines +- Either: + - Execute the notebook and fail if errors are encountered (apart from the 
`NotImplementedError`) + - Check that all code cells have been executed without error +- Extract solution code and write a .py file with the solution +- Create the student version by replacing solution cells with a "hint" image and a link to the solution code +- Create the instructor version by replacing cells with code exercises with text cells with code in markdown form. +- Redirect Colab-inserted badges to the main branch +- Set the Colab notebook name field based on file path +- Standardize some Colab settings (always have ToC, always hide form cells) +- Clean the notebooks (remove outputs and noisy metadata) +- Write the executed version of the input notebook to its original path +- Write the post-processed notebook to a student/ subdirectory +- Write solution images to a static/ subdirectory +- Write solution code to a solutions/ subdirectory + +""" + +from __future__ import annotations +import os +import re +import sys +import argparse +import hashlib +from io import BytesIO +from binascii import a2b_base64 +from copy import deepcopy +from pathlib import Path + +from PIL import Image +import nbformat +from nbconvert.preprocessors import ExecutePreprocessor + +ORG = os.environ.get("ORG", "neuromatch") +REPO = os.environ.get("NMA_REPO", "course-content-template") +MAIN_BRANCH = os.environ.get("NMA_MAIN_BRANCH", "main") + +GITHUB_RAW_URL = f"https://raw.githubusercontent.com/{ORG}/{REPO}/{MAIN_BRANCH}" +GITHUB_TREE_URL = f"https://github.com/{ORG}/{REPO}/tree/{MAIN_BRANCH}" + + +def main(arglist): + """Process IPython notebooks from a list of files.""" + args = parse_args(arglist) + + # Filter paths from the git manifest + # - Only process .ipynb + # - Don't process student notebooks + # - Don't process deleted notebooks (which are paths in the git manifest) + def should_process(path): + return all( + [ + path.endswith(".ipynb"), + "student/" not in path, + "instructor/" not in path, + os.path.isfile(path), + ] + ) + + nb_paths = [arg for arg in args.files if 
should_process(arg)] + if not nb_paths: + print("No notebook files found") + sys.exit(0) + + # Set execution parameters. We allow NotImplementedError as that is raised + # by incomplete exercises and is unlikely to be otherwise encountered. + exec_kws = {"timeout": 14400, "allow_error_names": ["NotImplementedError"]} + + # Allow environment to override stored kernel name + if "NB_KERNEL" in os.environ: + exec_kws["kernel_name"] = os.environ["NB_KERNEL"] + + # Defer failures until after processing all notebooks + notebooks = {} + errors = {} + + for nb_path in nb_paths: + + # Load the notebook structure + with open(nb_path) as f: + nb = nbformat.read(f, nbformat.NO_CONVERT) + + if not sequentially_executed(nb): + if args.require_sequential: + err = ( + "Notebook is not sequentially executed on a fresh kernel." + "\n" + "Please do 'Restart and run all' before pushing to Github." + ) + errors[nb_path] = err + continue + + # Clean whitespace from all code cells + clean_whitespace(nb) + + # Ensure that we have an executed notebook, in one of two ways + executor = ExecutePreprocessor(**exec_kws) + if args.execute: + # Check dynamically by executing and reporting errors + print(f"Executing {nb_path}") + error = execute_notebook(executor, nb, args.raise_fast) + elif args.check_execution: + # Check statically by examining the cell outputs + print(f"Checking {nb_path} execution") + error = check_execution(executor, nb, args.raise_fast) + else: + error = None + + if error is None: + notebooks[nb_path] = nb + else: + errors[nb_path] = error + + if errors or args.check_only: + exit(errors) + + # Post-process notebooks + for nb_path, nb in notebooks.items(): + + # Extract components of the notebook path + nb_dir, nb_fname = os.path.split(nb_path) + nb_name, _ = os.path.splitext(nb_fname) + + # Add badges to the main notebook (pointing at itself) + add_badge_cell(nb, nb_path) + + # Ensure that Colab metadata dict exists and enforce some settings + add_colab_metadata(nb, nb_name) 
+ + # Write the original notebook back to disk, clearing outputs only for tutorials + print(f"Writing complete notebook to {nb_path}") + with open(nb_path, "w") as f: + nb_clean = clean_notebook(nb, clear_outputs=nb_path.startswith("tutorials")) + nbformat.write(nb_clean, f) + + # if the notebook is not in tutorials, skip the creation/update of the student, static, solutions directories + if not nb_path.startswith("tutorials"): + continue + + # Create subdirectories, if they don't exist + student_dir = make_sub_dir(nb_dir, "student") + static_dir = make_sub_dir(nb_dir, "static") + solutions_dir = make_sub_dir(nb_dir, "solutions") + instructor_dir = make_sub_dir(nb_dir, "instructor") + + # Generate the student version and save it to a subdirectory + print(f"Extracting solutions from {nb_path}") + processed = extract_solutions(nb, nb_dir, nb_name) + student_nb, static_images, solution_snippets = processed + + # Generate the instructor version and save it to a subdirectory + print(f"Create instructor notebook from {nb_path}") + instructor_nb = instructor_version(nb, nb_dir, nb_name) + + # Build paths for student and instructor versions + student_nb_path = os.path.join(student_dir, nb_fname) + instructor_nb_path = os.path.join(instructor_dir, nb_fname) + + # Add badges pointing to the student version + add_badge_cell(student_nb, student_nb_path) + + # Add badges pointing to the instructor version + add_badge_cell(instructor_nb, instructor_nb_path) + + # Write the student version of the notebook + print(f"Writing student notebook to {student_nb_path}") + with open(student_nb_path, "w") as f: + clean_student_nb = clean_notebook(student_nb) + nbformat.write(clean_student_nb, f) + + # Write the images extracted from the solution cells + print(f"Writing solution images to {static_dir}") + for fname, image in static_images.items(): + fname = fname.replace("static", static_dir) + image.save(fname) + + # Write the solution snippets + print(f"Writing solution snippets to 
{solutions_dir}") + for fname, snippet in solution_snippets.items(): + fname = fname.replace("solutions", solutions_dir) + with open(fname, "w") as f: + f.write(snippet) + + # Write the instructor version of the notebook + print(f"Writing instructor notebook to {instructor_nb_path}") + with open(instructor_nb_path, "w") as f: + clean_instructor_nb = clean_notebook(instructor_nb) + nbformat.write(clean_instructor_nb, f) + + exit(errors) + + +# ------------------------------------------------------------------------------------ # + + +def execute_notebook(executor, nb, raise_fast): + """Execute the notebook, returning errors to be handled.""" + try: + executor.preprocess(nb) + except Exception as error: + if raise_fast: + # Exit here (useful for debugging) + raise error + else: + # Raise the error to be handled by the caller + return error + + +def check_execution(executor, nb, raise_fast): + """Check that all code cells with source have been executed without error.""" + error = None + for cell in nb.get("cells", []): + + # Only check code cells + if cell["cell_type"] != "code": + continue + + if cell["source"] and cell["execution_count"] is None: + error = "Notebook has unexecuted code cell(s)." 
+ if raise_fast: + raise RuntimeError(error) + break + else: + for output in cell["outputs"]: + if output["output_type"] == "error": + if output["ename"] in executor.allow_error_names: + continue + error = "\n".join(output["traceback"]) + if raise_fast: + raise RuntimeError("\n" + error) + break + + return error + + +def extract_solutions(nb, nb_dir, nb_name): + """Convert solution cells to markdown; embed images from Python output.""" + nb = deepcopy(nb) + _, tutorial_dir = os.path.split(nb_dir) + + static_images = {} + solution_snippets = {} + + nb_cells = nb.get("cells", []) + for i, cell in enumerate(nb_cells): + + if has_solution(cell): + + # Get the cell source + cell_source = cell["source"] + + # Hash the source to get a unique identifier + cell_id = hashlib.sha1(cell_source.encode("utf-8")).hexdigest()[:8] + + # Extract image data from the cell outputs + cell_images = {} + for j, output in enumerate(cell.get("outputs", [])): + + fname = f"static/{nb_name}_Solution_{cell_id}_{j}.png" + try: + image_data = a2b_base64(output["data"]["image/png"]) + except KeyError: + continue + cell_images[fname] = Image.open(BytesIO(image_data)) + static_images.update(cell_images) + + # Clean up the cell source and assign a filename + snippet = "\n".join(cell_source.split("\n")[1:]) + py_fname = f"solutions/{nb_name}_Solution_{cell_id}.py" + solution_snippets[py_fname] = snippet + + # Convert the solution cell to markdown, + # Insert a link to the solution snippet script on github, + # and embed the image as a link to static file (also on github) + py_url = f"{GITHUB_TREE_URL}/tutorials/{tutorial_dir}/{py_fname}" + new_source = f"[*Click for solution*]({py_url})\n\n" + + if cell_images: + new_source += "*Example output:*\n\n" + for f, img in cell_images.items(): + + url = f"{GITHUB_RAW_URL}/tutorials/{tutorial_dir}/{f}" + + # Handle matplotlib retina mode + dpi_w, dpi_h = img.info["dpi"] + w = img.width // (dpi_w // 72) + h = img.height // (dpi_h // 72) + + tag_args = " 
".join( + [ + "alt='Solution hint'", + "align='left'", + f"width={w}", + f"height={h}", + f"src={url}", + ] + ) + new_source += f"\n\n" + + cell["source"] = new_source + cell["cell_type"] = "markdown" + cell["metadata"]["colab_type"] = "text" + if "outputID" in cell["metadata"]: + del cell["metadata"]["outputId"] + if "outputs" in cell: + del cell["outputs"] + if "execution_count" in cell: + del cell["execution_count"] + + return nb, static_images, solution_snippets + + +def instructor_version(nb, nb_dir, nb_name): + """Convert notebook to instructor notebook.""" + nb = deepcopy(nb) + _, tutorial_dir = os.path.split(nb_dir) + + nb_cells = nb.get("cells", []) + for i, cell in enumerate(nb_cells): + + if has_code_exercise(cell): + if nb_cells[i - 1]["cell_type"] == "markdown": + cell_id = i - 2 + else: + cell_id = i - 1 + nb_cells[cell_id]["cell_type"] = "markdown" + nb_cells[cell_id]["metadata"]["colab_type"] = "text" + if "outputID" in nb_cells[cell_id]["metadata"]: + del nb_cells[cell_id]["metadata"]["outputId"] + if "outputs" in nb_cells[cell_id]: + del nb_cells[cell_id]["outputs"] + if "execution_count" in nb_cells[cell_id]: + del nb_cells[cell_id]["execution_count"] + + nb_cells[cell_id]["source"] = ( + "```python\n" + nb_cells[cell_id]["source"] + "\n\n```" + ) + + return nb + + +def clean_notebook(nb, clear_outputs=True): + """Remove cell outputs and most unimportant metadata.""" + # Always operate on a copy of the input notebook + nb = deepcopy(nb) + + # Remove some noisy metadata + nb.metadata.pop("widgets", None) + + # Set kernel to default Python3 + nb.metadata["kernel"] = { + "display_name": "Python 3", + "language": "python", + "name": "python3", + } + + # Iterate through the cells and clean up each one + for cell in nb.get("cells", []): + + # Remove blank cells + if not cell["source"]: + nb.cells.remove(cell) + continue + + # Reset cell-level Jupyter metadata + for key in ["prompt_number", "execution_count"]: + if key in cell: + cell[key] = None + + if 
"metadata" in cell: + cell.metadata["execution"] = {} + for field in ["colab", "collapsed", "scrolled", "ExecuteTime", "outputId"]: + cell.metadata.pop(field, None) + + # Reset cell-level Colab metadata + if "id" in cell["metadata"]: + if not cell["metadata"]["id"].startswith("view-in"): + cell["metadata"].pop("id") + + if cell["cell_type"] == "code": + # Remove code cell outputs if requested + if clear_outputs: + cell["outputs"] = [] + + # Ensure that form cells are hidden by default + first_line, *_ = cell["source"].splitlines() + if "@title" in first_line or "@markdown" in first_line: + cell["metadata"]["cellView"] = "form" + + return nb + + +def add_colab_metadata(nb, nb_name): + """Ensure that notebook has Colab metadata and enforce some settings.""" + if "colab" not in nb["metadata"]: + nb["metadata"]["colab"] = {} + + # Always overwrite the name and show the ToC/Colab button + nb["metadata"]["colab"].update( + { + "name": nb_name, + "toc_visible": True, + "include_colab_link": True, + } + ) + + # Allow collapsed sections, but default to not having any + nb["metadata"]["colab"].setdefault("collapsed_sections", []) + + +def clean_whitespace(nb): + """Remove trailing whitespace from all code cell lines.""" + for cell in nb.get("cells", []): + if cell.get("cell_type", "") == "code": + source_lines = cell["source"].splitlines() + clean_lines = [line.rstrip() for line in source_lines] + cell["source"] = "\n".join(clean_lines) + + +def test_clean_whitespace(): + + nb = { + "cells": [ + {"cell_type": "code", "source": "import numpy \nimport matplotlib "}, + {"cell_type": "markdown", "source": "# Test notebook "}, + ] + } + clean_whitespace(nb) + assert nb["cells"][0]["source"] == "import numpy\nimport matplotlib" + assert nb["cells"][1]["source"] == "# Test notebook " + + +def has_solution(cell): + """Return True if cell is marked as containing an exercise solution.""" + cell_text = cell["source"].replace(" ", "").lower() + first_line = cell_text.split("\n")[0] + 
return cell_text.startswith("#@titlesolution") or "to_remove" in first_line + + +def has_code_exercise(cell): + """Return True if cell is marked as containing an exercise solution.""" + cell_text = cell["source"].replace(" ", "").lower() + first_line = cell_text.split("\n")[0] + return cell_text.startswith("#@titlesolution") or "to_removesolution" in first_line + + +def test_has_solution(): + + cell = {"source": "# solution"} + assert not has_solution(cell) + + cell = {"source": "def exercise():\n pass\n# to_remove"} + assert not has_solution(cell) + + cell = {"source": "# to_remove_solution\ndef exercise():\n pass"} + assert has_solution(cell) + + +def remove_existing_badges(nb: dict) -> None: + """Remove existing Colab and Kaggle badges from all cells in the notebook. + + - Removes Colab badge HTML () + - Removes Kaggle badge HTML () + - Removes   spacers between badges + - Deletes cells that become empty after badge removal + - Strips leading/trailing whitespace from cells + """ + colab_pattern = re.compile( + r']*href="[^"]*colab[^"]*"[^>]*>\s*]*colab-badge\.svg[^>]*/>\s*', + re.IGNORECASE, + ) + kaggle_pattern = re.compile( + r']*href="[^"]*kaggle[^"]*"[^>]*>\s*]*open-in-kaggle\.svg[^>]*/>\s*', + re.IGNORECASE, + ) + nbsp_pattern = re.compile(r"\s* \s*") + + cells_to_remove = [] + for i, cell in enumerate(nb.get("cells", [])): + + source = cell.get("source", "") + source = colab_pattern.sub("", source) + source = kaggle_pattern.sub("", source) + source = nbsp_pattern.sub("", source) + source = source.strip() + cell["source"] = source + + if not source: + cells_to_remove.append(i) + + # Remove empty cells in reverse order to maintain indices + for i in reversed(cells_to_remove): + del nb["cells"][i] + + +def generate_badge_cell(nb_path: Path | str) -> dict: + """Generate a markdown cell with Colab and Kaggle badges. 
+ + Args: + nb_path: The destination path where the notebook will be written + (e.g., "tutorials/W1D1_Generalization/student/W1D1_Tutorial1.ipynb") + + Returns: + A notebook cell dict with both badges as markdown content. + """ + colab_url = f"https://colab.research.google.com/github/{ORG}/{REPO}/blob/{MAIN_BRANCH}/{nb_path}" + colab_badge = "https://colab.research.google.com/assets/colab-badge.svg" + kaggle_src = ( + f"https://raw.githubusercontent.com/{ORG}/{REPO}/{MAIN_BRANCH}/{nb_path}" + ) + kaggle_url = f"https://kaggle.com/kernels/welcome?src={kaggle_src}" + kaggle_badge = "https://kaggle.com/static/images/open-in-kaggle.svg" + + badge_html = ( + f'' + f'Open In Colab' + f"   " + f'' + f'Open in Kaggle' + ) + + cell = nbformat.v4.new_markdown_cell(source=badge_html) + cell.metadata["id"] = "view-in-github" + cell.metadata["colab_type"] = "text" + return cell + + +def add_badge_cell(nb: dict, nb_path: dict | str) -> None: + """Remove existing badges and add a new badge cell at the top of the notebook. + + Args: + nb: The notebook dict + nb_path: The destination path where the notebook will be written + """ + remove_existing_badges(nb) + badge_cell = generate_badge_cell(nb_path) + nb["cells"].insert(0, badge_cell) + + +def sequentially_executed(nb): + """Return True if notebook appears freshly executed from top-to-bottom.""" + exec_counts = [ + cell["execution_count"] + for cell in nb.get("cells", []) + if (cell["source"] and cell.get("execution_count", None) is not None) + ] + sequential_counts = list(range(1, 1 + len(exec_counts))) + # Returns True if there are no executed code cells, which is fine? 
+ return exec_counts == sequential_counts + + +def make_sub_dir(nb_dir, name): + """Create nb_dir/name if it does not exist.""" + sub_dir = os.path.join(nb_dir, name) + if not os.path.exists(sub_dir): + os.mkdir(sub_dir) + return sub_dir + + +def exit(errors): + """Exit with message and status dependent on contents of errors dict.""" + for failed_file, error in errors.items(): + print(f"{failed_file} failed quality control.", file=sys.stderr) + print(error, file=sys.stderr) + + status = bool(errors) + report = "Failure" if status else "Success" + print("=" * 30, report, "=" * 30) + sys.exit(status) + + +def parse_args(arglist): + """Handle the command-line arguments.""" + parser = argparse.ArgumentParser( + description="Process neuromatch tutorial notebooks", + ) + parser.add_argument( + "files", + nargs="+", + help="File name(s) to process. Will filter for .ipynb extension.", + ) + parser.add_argument( + "--execute", + action="store_true", + help="Execute the notebook and fail if errors are encountered.", + ) + parser.add_argument( + "--check-execution", + action="store_true", + dest="check_execution", + help="Check that each code cell has been executed and did not error.", + ) + parser.add_argument( + "--allow-non-sequential", + action="store_false", + dest="require_sequential", + help="Don't fail if the notebook is not sequentially executed.", + ) + parser.add_argument( + "--check-only", + action="store_true", + dest="check_only", + help="Only run QC checks; don't do post-processing.", + ) + parser.add_argument( + "--raise-fast", + action="store_true", + dest="raise_fast", + help="Raise errors immediately rather than collecting and reporting.", + ) + return parser.parse_args(arglist) + + +if __name__ == "__main__": + + main(sys.argv[1:]) diff --git a/src/nmaci/select_notebooks.py b/src/nmaci/select_notebooks.py new file mode 100644 index 0000000..4ca0526 --- /dev/null +++ b/src/nmaci/select_notebooks.py @@ -0,0 +1,26 @@ +"""From a list of files, select 
process-able notebooks and print.""" +import os +import sys + +def main(arglist=None): + if arglist is None: + arglist = sys.argv[1:] + + # Filter paths from the git manifest + # - Only process .ipynb + # - Don't process student notebooks + # - Don't process deleted notebooks + def should_process(path): + return all([ + path.endswith(".ipynb"), + "student/" not in path, + "instructor/" not in path, + os.path.isfile(path), + ]) + + nb_paths = [f for f in arglist if should_process(f)] + print(" ".join(nb_paths)) + + +if __name__ == "__main__": + main() diff --git a/src/nmaci/verify_exercises.py b/src/nmaci/verify_exercises.py new file mode 100644 index 0000000..20471fa --- /dev/null +++ b/src/nmaci/verify_exercises.py @@ -0,0 +1,260 @@ +#! /usr/bin/env python +"""Check that exercise code matches solution code. + +Exercises are allowed to deviate from solutions in several ways: + +- Exercise code may replace solution code with an ellipsis (...) +- Exercise code may have "commented-out" solution code + +Additionally: + +- Docstrings are currently ignored +- Blank lines are ignored + +This script will report whether exercises and solutions otherwise match. 
+ +""" +import os +import re +import sys +import argparse +from textwrap import dedent +from fuzzywuzzy import fuzz +import nbformat + + +def main(arglist): + + args = parse_args(arglist) + + if "skip verification" in args.commit_message: + # Putting this logic here as I didn't have time to figure + # out how to do it in the github actions workflow + print("Skipping exercise verification") + sys.exit(0) + + # Track overall status + failure = False + unmatched = {} + + for nb_fpath in args.files: + + _, nb_name = os.path.split(nb_fpath) + unmatched[nb_name] = [] + + # Load the notebook file + with open(nb_fpath) as f: + nb = nbformat.read(f, nbformat.NO_CONVERT) + + for i, cell in enumerate(nb.get("cells", [])): + + # Detect solution cells based on removal tag + if has_solution(cell): + + # Find a corresponding exercise cell + # (Assume it is the previous *code* cell) + j, stub_cell = 1, None + while (i - j): + stub_cell = nb["cells"][i - j] + if stub_cell["cell_type"] == "code": + break + else: + stub_cell = None + j += 1 + if stub_cell is None: + continue + + # Extract the code and comments from both cells + stub_code, stub_comments = logical_lines(stub_cell["source"]) + solu_code, solu_comments = logical_lines(cell["source"]) + + # Identify violations in the exercise cell + unmatched_code = unmatched_lines(stub_code, solu_code) + unmatched_comments = unmatched_lines( + stub_comments, solu_code + solu_comments + ) + unmatched[nb_name].append((unmatched_code, unmatched_comments)) + if unmatched_code or unmatched_comments: + failure = True + + # Report the results for this noteobokk + for nb_name, nb_unmatched in unmatched.items(): + print() + print("---" + nb_name + "-" * (69 - 5 - len(nb_name))) + for exercise, (code, comments) in enumerate(nb_unmatched, 1): + report(exercise, code, comments) + + # Print overall summary and exit with return code + message = "Failure" if failure else "Success" + print("\n" + "=" * 30, message, "=" * 30) + sys.exit(failure) + + +def 
def report(exercise, code, comment, thresh=50):
    """Print information about unmatched code and comments in an exercise.

    Parameters
    ----------
    exercise : int
        One-based exercise number (display only).
    code, comment : list of (score, stub_line, solution_line) tuples
        Output of ``unmatched_lines`` for code lines and comment lines.
    thresh : int
        Fuzzy-match score (0-100) below which a stub line is reported as
        having no close match rather than as a near miss.
    """
    code_status = "FAIL" if code else "PASS"
    comment_status = "FAIL" if comment else "PASS"
    print(
        f"Exercise {exercise} | Code {code_status} | Comments {comment_status}"
    )

    for kind, unmatched in zip(["Code", "Comment"], [code, comment]):
        for (score, stub, solu) in unmatched:
            if score < thresh:
                # Nothing in the solution resembled this stub line at all
                print(f"  {kind} without close match:")
                print(f"  * {stub}")
            else:
                # Close but inexact match: show both sides for comparison
                print(f"  {kind} with close mismatch ({score}%)")
                print(f"  + {stub}")
                print(f"  - {solu}")


def logical_lines(func_str):
    """Extract code and block comments from cell string.

    Returns a ``(code_lines, comment_lines)`` tuple. Docstrings, ``###``
    comment fences, and lines rejected by ``skip_code`` / ``skip_comment``
    are filtered out.
    """
    # Standardize docstring string format
    func_str = func_str.replace("'''", '"""')

    # Define a regular expression to remove comments:
    # group(1) = code before the first '#', group(2) = the comment text
    pattern = re.compile(r"^([^#]*)\s*#* {0,1}(.*?)\s*$")

    code_lines = []
    comment_lines = []

    making_xkcd_plot = False
    reading_block_comment = False

    for line in func_str.split("\n"):

        # Detect and ignore lines within multi-line comments
        # - triple quotes (docstrings)
        # - comment hashmark fences
        comment_block_fence = dedent(line).startswith('"""') or "###" in line
        if reading_block_comment:
            # A second fence closes the block; either way, skip the line
            if comment_block_fence:
                reading_block_comment = False
            continue
        else:

            # Detect and ignore single-line docstrings
            text = line.strip()
            single_line_docstring = (
                text.startswith('"""')
                and text.endswith('"""')
                and len(text) > 3
            )
            if single_line_docstring:
                continue

            # Otherwise, assume we are starting a comment block
            if comment_block_fence:
                reading_block_comment = True
                continue

        match = pattern.match(line)
        if match:

            # Split the line on the first comment hash encountered
            code_line = match.group(1)
            comment_line = match.group(2)

            # If there is code before the comment, assume comment is inline
            # use entire line (allows inline comments in commented-out code)
            if dedent(code_line):
                code_line = match.group(0)

            # Handle xkcd context, which is always last thing in solution cell
            if "plt.xkcd()" in code_line:
                making_xkcd_plot = True
                continue
            if making_xkcd_plot:
                # Drop two leading chars — presumably the indent introduced
                # by the `with plt.xkcd():` block (TODO confirm)
                code_line = code_line[2:]

            # Check for reasons to ignore the line, otherwise keep it

            if not skip_code(code_line):
                code_lines.append(code_line)

            if not dedent(code_line) and not skip_comment(comment_line):
                comment_lines.append(comment_line)

    return code_lines, comment_lines


def unmatched_lines(stub_lines, solu_lines):
    """Identify lines in the exercise stub without a match in the solution.

    Returns a list of ``(best_score, stub_line, best_solution_line)``
    tuples for every stub line that is not a perfect (100%) match.
    """
    unmatched = []

    for stub_line in stub_lines:

        # When we don't match, we want to track lines that are close
        best_score = 0
        best_line = ""

        for line in solu_lines:

            # Match whole lines or parts of lines that need completion
            if "..." in stub_line:
                # Score each fragment around the "..." placeholder and take
                # the worst one, so every fragment must appear in the line.
                # NOTE(review): a stub line consisting only of "..." would
                # leave part_scores empty and make min() raise ValueError —
                # confirm such lines are always filtered out upstream.
                part_scores = []
                for part in stub_line.split("..."):
                    if not part:
                        continue
                    part_scores.append(fuzz.partial_ratio(part, line))
                score = min(part_scores)
            else:
                score = fuzz.ratio(stub_line, line)

            if score > best_score:
                best_score = score
                best_line = line

        # Track all lines that are not perfect matches
        if best_score < 100:
            unmatched.append((best_score, stub_line, best_line))

    return unmatched


def skip_code(line):
    """Return True if a code line should be skipped based on contents."""
    # Blank lines and the NotImplementedError placeholder raise have no
    # counterpart in the solution, so they are never checked
    line = dedent(line)
    return not line or "NotImplementedError" in line


def skip_comment(line):
    """Return True if a comment line should be skipped based on contents."""
    # Blank comments, the removal tag itself, and "uncomment ..." prompts
    # are instructions to the student rather than content to match
    line = dedent(line)
    return not line or "to_remove" in line or "uncomment" in line.lower()


def has_solution(cell):
    """Return True if cell is marked as containing an exercise solution."""
    # Strip spaces and lowercase so "# @title Solution" variants all match
    cell_text = cell["source"].replace(" ", "").lower()
    first_line = cell_text.split("\n")[0]
    # NOTE: `and` binds tighter than `or`, so this reads as
    # startswith(...) or ("to_remove" in first_line and no "explanation")
    return (
        cell_text.startswith("#@titlesolution")
        or "to_remove" in first_line
        and "explanation" not in first_line
    )
parse_args(arglist): + """Handle the command-line arguments.""" + parser = argparse.ArgumentParser( + description="Process neuromatch tutorial notebooks", + ) + parser.add_argument( + "files", + nargs="+", + help="File name(s) to process. Will filter for .ipynb extension." + ) + parser.add_argument( + "--commit-message", + default="", + help="Will exit cleanly if message contains 'skip verify'", + ) + return parser.parse_args(arglist) + + +if __name__ == "__main__": + + main(sys.argv[1:]) diff --git a/tests/test_process_notebooks.py b/tests/test_process_notebooks.py index 460303f..ac4ac89 100644 --- a/tests/test_process_notebooks.py +++ b/tests/test_process_notebooks.py @@ -3,7 +3,7 @@ from subprocess import run from pytest import fixture -from scripts.process_notebooks import ( +from nmaci.process_notebooks import ( add_badge_cell, generate_badge_cell, remove_existing_badges, @@ -14,12 +14,12 @@ @fixture def cmd(): - return ["python", "scripts/process_notebooks.py"] + return ["nmaci", "process-notebooks"] def test_raises_not_implemented_error(cmd): - nb = "tutorials/raises_notimplemented_error.ipynb" + nb = "tests/tutorials/raises_notimplemented_error.ipynb" cmdline = cmd + ["--check-only", "--execute", nb] res = run(cmdline, capture_output=True) assert not res.returncode @@ -28,7 +28,7 @@ def test_raises_not_implemented_error(cmd): def test_raises_name_error(cmd): - nb = "tutorials/raises_name_error.ipynb" + nb = "tests/tutorials/raises_name_error.ipynb" cmdline = cmd + ["--check-only", "--execute", nb] res = run(cmdline, capture_output=True) assert res.returncode @@ -39,7 +39,7 @@ def test_raises_name_error(cmd): def test_executed_out_of_order(cmd): - nb = "tutorials/executed_out_of_order.ipynb" + nb = "tests/tutorials/executed_out_of_order.ipynb" cmdline = cmd + ["--check-only", nb] res = run(cmdline, capture_output=True) assert res.returncode @@ -49,7 +49,7 @@ def test_executed_out_of_order(cmd): def test_executed_partially(cmd): - nb = 
"tutorials/executed_partially.ipynb" + nb = "tests/tutorials/executed_partially.ipynb" cmdline = cmd + ["--check-only", "--check-execution", nb] res = run(cmdline, capture_output=True) assert res.returncode @@ -59,7 +59,7 @@ def test_executed_partially(cmd): def test_executed_with_error(cmd): - nb = "tutorials/executed_with_error.ipynb" + nb = "tests/tutorials/executed_with_error.ipynb" cmdline = cmd + ["--check-only", "--check-execution", nb] res = run(cmdline, capture_output=True) assert res.returncode @@ -69,7 +69,7 @@ def test_executed_with_error(cmd): def test_executed_successfully(cmd): - nb = "tutorials/executed_successfully.ipynb" + nb = "tests/tutorials/executed_successfully.ipynb" cmdline = cmd + ["--check-only", "--check-execution", nb] res = run(cmdline, capture_output=True) assert not res.returncode @@ -165,7 +165,7 @@ def test_remove_existing_badges_no_badges(): def test_generate_badge_cell_structure(): """Test that generate_badge_cell creates correct cell structure.""" - cell = generate_badge_cell("tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") + cell = generate_badge_cell("tests/tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") assert cell["cell_type"] == "markdown" assert cell["metadata"]["id"] == "view-in-github" @@ -174,27 +174,27 @@ def test_generate_badge_cell_structure(): def test_generate_badge_cell_colab_badge(): """Test that generate_badge_cell includes correct Colab badge.""" - cell = generate_badge_cell("tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") + cell = generate_badge_cell("tests/tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") assert "colab-badge.svg" in cell["source"] assert "colab.research.google.com/github" in cell["source"] - assert "tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb" in cell["source"] + assert "tests/tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb" in cell["source"] def test_generate_badge_cell_kaggle_badge(): """Test that generate_badge_cell includes correct Kaggle badge.""" - cell = 
generate_badge_cell("tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") + cell = generate_badge_cell("tests/tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") assert "open-in-kaggle.svg" in cell["source"] assert "kaggle.com/kernels/welcome" in cell["source"] - assert "tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb" in cell["source"] + assert "tests/tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb" in cell["source"] def test_generate_badge_cell_student_path(): """Test that generate_badge_cell works with student paths.""" - cell = generate_badge_cell("tutorials/W1D1_Intro/student/W1D1_Tutorial1.ipynb") + cell = generate_badge_cell("tests/tutorials/W1D1_Intro/student/W1D1_Tutorial1.ipynb") - assert "tutorials/W1D1_Intro/student/W1D1_Tutorial1.ipynb" in cell["source"] + assert "tests/tutorials/W1D1_Intro/student/W1D1_Tutorial1.ipynb" in cell["source"] def test_add_badge_cell_replaces_old_badges(): @@ -209,7 +209,7 @@ def test_add_badge_cell_replaces_old_badges(): ] } - add_badge_cell(nb, "tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") + add_badge_cell(nb, "tests/tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") # Should have 2 cells: new badge cell + code cell assert len(nb["cells"]) == 2 @@ -228,7 +228,7 @@ def test_add_badge_cell_to_empty_notebook(): ] } - add_badge_cell(nb, "tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") + add_badge_cell(nb, "tests/tutorials/W1D1_Intro/W1D1_Tutorial1.ipynb") assert len(nb["cells"]) == 2 assert nb["cells"][0]["metadata"]["id"] == "view-in-github" diff --git a/tutorials/executed_out_of_order.ipynb b/tests/tutorials/executed_out_of_order.ipynb similarity index 100% rename from tutorials/executed_out_of_order.ipynb rename to tests/tutorials/executed_out_of_order.ipynb diff --git a/tutorials/executed_partially.ipynb b/tests/tutorials/executed_partially.ipynb similarity index 100% rename from tutorials/executed_partially.ipynb rename to tests/tutorials/executed_partially.ipynb diff --git a/tutorials/executed_successfully.ipynb b/tests/tutorials/executed_successfully.ipynb 
similarity index 100% rename from tutorials/executed_successfully.ipynb rename to tests/tutorials/executed_successfully.ipynb diff --git a/tutorials/executed_with_error.ipynb b/tests/tutorials/executed_with_error.ipynb similarity index 100% rename from tutorials/executed_with_error.ipynb rename to tests/tutorials/executed_with_error.ipynb diff --git a/tutorials/raises_name_error.ipynb b/tests/tutorials/raises_name_error.ipynb similarity index 100% rename from tutorials/raises_name_error.ipynb rename to tests/tutorials/raises_name_error.ipynb diff --git a/tutorials/raises_notimplemented_error.ipynb b/tests/tutorials/raises_notimplemented_error.ipynb similarity index 100% rename from tutorials/raises_notimplemented_error.ipynb rename to tests/tutorials/raises_notimplemented_error.ipynb