Skip to content

Commit 34140b1

Browse files
lambertacopybara-github
authored andcommitted
High-level spelling check utility for notebooks.
By default, checks text cells only. Tested on Linux and OSX. Requires aspell and jupyter-nbconvert. Examples ... $ ./tools/spelltest/spelltest.sh -h # Misspelling counts: $ ./tools/spelltest/spelltest.sh ./site/en/guide/eager.ipynb $ ./tools/spelltest/spelltest.sh ./site/en/tutorials/* # Check <code> tags within text cells: $ ./tools/spelltest/spelltest.sh -c ./site/en/guide/eager.ipynb # Check input code cells, too: $ ./tools/spelltest/spelltest.sh -C ./site/en/guide/eager.ipynb # Print text as Markdown (useful for copy/paste into another form checker): $ ./tools/spelltest/spelltest.sh -p ./site/en/guide/eager.ipynb PiperOrigin-RevId: 286618985
1 parent b41f0bf commit 34140b1

File tree

5 files changed

+227
-0
lines changed

5 files changed

+227
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
**/*.pyc
2+
**/*.rws
23
**/.DS_Store
34
**/.idea
45
**/.ipynb_checkpoints

tools/spelltest/spelltest.sh

+173
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
#!/usr/bin/env bash
2+
## High-level spelling check for notebooks (and Markdown and txt).
3+
## Requires aspell and nbconvert.
4+
##
5+
## Display help and command-line options:
6+
## $ spelltest -h
7+
##
8+
## Count misspellings in text cells:
9+
## $ spelltest notebook.ipynb [...]
10+
##
11+
## Count misspellings in text cells including code tags:
12+
## $ spelltest -c notebook.ipynb [...]
13+
##
14+
## Count misspellings in text cells including code tags AND code cells:
15+
## $ spelltest -c -C notebook.ipynb [...]
16+
##
17+
## Dump notebook text (without code cells) and save to clipboard (OSX):
18+
## $ spelltest -p notebook.ipynb | pbcopy
19+
##
20+
# Exit immediately when a command fails (the usual `set -e` exceptions apply,
# e.g. commands tested in `if` conditions or run inside command substitutions).
set -e
21+
22+
#######################################
# Print the usage line and the option summary.
# Globals:   none
# Arguments: none
# Outputs:   help text on stdout
#######################################
usage() {
  # "$0" is quoted so a script path containing whitespace is handled.
  printf '%s\n' \
    "Usage: $(basename "$0") notebook.ipynb" \
    " High-level spelling check for notebooks (and Markdown and txt)." \
    "Options:" \
    " -c Check <code> and <pre> tags within text cell" \
    " -C Include code blocks" \
    " -p Print Markdown to stdout" \
    " -h Print this help and exit"
}
31+
32+
# Prefix for log messages: the script name without its '.sh' suffix.
LOG_NAME="[$(basename "$0" '.sh')]"
# Absolute directory holding this script; every expansion is quoted so an
# install path that contains whitespace still resolves correctly.
SRC_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Project word list that is compiled into an extra aspell dictionary.
WORDLIST="${SRC_ROOT}/wordlist.txt"
readonly LOG_NAME SRC_ROOT WORDLIST
35+
36+
## Parse options
37+
38+
if [[ "$#" -eq 0 ]]; then
  usage
  exit 1
fi

# Start from a known state so same-named variables in the caller's
# environment cannot switch options on by accident.
OPT_CHECK_CODE_TAGS=""
OPT_INCLUDE_CODE_CELLS=""
OPT_PRINT_STDOUT_ONLY=""

while getopts "cCph" opt; do
  case "$opt" in
    c) OPT_CHECK_CODE_TAGS=1 ;;
    C) OPT_INCLUDE_CODE_CELLS=1 ;;
    p) OPT_PRINT_STDOUT_ONLY=1 ;;
    h)
      usage
      exit 0
      ;;
    *)
      # getopts has already reported the invalid option on stderr; treat it
      # as an error instead of exiting successfully.
      usage >&2
      exit 1
      ;;
  esac
done

# Args after flags
shift $((OPTIND - 1))

# Flags without any file would otherwise do nothing and exit successfully.
if [[ "$#" -eq 0 ]]; then
  echo "${LOG_NAME} Error: No input files given" >&2
  usage >&2
  exit 1
fi
57+
58+
59+
## Check requirements: aspell and nbconvert
60+
61+
# `command -v` is the POSIX way to locate a command; `which` is an external
# tool whose output and exit status differ between systems.
if ! command -v aspell >/dev/null 2>&1; then
  echo "${LOG_NAME} Error: Requires the 'aspell' command" >&2
  exit 1
fi

# Prefer 'jupyter-nbconvert', fall back to a plain 'nbconvert'.
if NBCONVERT_BIN="$(command -v jupyter-nbconvert)"; then
  :
elif NBCONVERT_BIN="$(command -v nbconvert)"; then
  :
else
  echo "${LOG_NAME} Error: Requires the 'jupyter-nbconvert' command" >&2
  exit 1
fi
74+
75+
76+
# Read file contents to string.
77+
# Use Markdown if dumping to stdout. Use HTML for aspell check.
78+
#######################################
# Print the text of a file to stdout.
# .txt and .md files are printed as-is. .ipynb notebooks are converted with
# nbconvert: Markdown when dumping to stdout, HTML for the aspell check.
# Globals:
#   LOG_NAME, SRC_ROOT, NBCONVERT_BIN (read)
#   OPT_PRINT_STDOUT_ONLY, OPT_INCLUDE_CODE_CELLS (read)
# Arguments:
#   $1 - path to a .txt, .md or .ipynb file
# Outputs:
#   File text on stdout; diagnostics on stderr.
# Returns:
#   0 on success; 1 if the format is unsupported or the file cannot be
#   read/converted.
#######################################
read_file_contents() {
  local fp="$1"
  local -a opts=()
  local contents

  case "$fp" in
    *.txt)
      contents="$(cat -- "$fp")" || return 1
      ;;

    *.md)
      echo "${LOG_NAME} TODO: Test Markdown ${fp}" >&2
      contents="$(cat -- "$fp")" || return 1
      ;;

    *.ipynb)
      # Use Markdown for stdout dump, html for aspell
      if [[ -n "$OPT_PRINT_STDOUT_ONLY" ]]; then
        opts+=("--to=markdown")
      else
        opts+=("--to=html")
      fi

      if [[ -z "$OPT_INCLUDE_CODE_CELLS" ]]; then
        # template removes input code cells
        if [[ -n "$OPT_PRINT_STDOUT_ONLY" ]]; then
          opts+=("--template=${SRC_ROOT}/tmpl/md.tpl")
        else
          opts+=("--template=${SRC_ROOT}/tmpl/html.tpl")
        fi
      fi

      # stderr is discarded on purpose (it is not part of the text to check).
      # The exit status is checked explicitly: this function is normally run
      # inside $(...), where bash clears `set -e`, so a failed conversion
      # would otherwise look like an empty, correctly spelled notebook.
      contents="$("$NBCONVERT_BIN" "${opts[@]}" --stdout "$fp" 2>/dev/null)" || {
        echo "${LOG_NAME} Error: nbconvert failed for: ${fp}" >&2
        return 1
      }
      ;;

    *)
      echo "${LOG_NAME} Error: File format not supported: ${fp}" >&2
      return 1
      ;;
  esac

  printf '%s\n' "$contents"
}
116+
117+
# Aspell < 0.60.8 requires a compiled dictionary
118+
if [[ -z "$OPT_PRINT_STDOUT_ONLY" ]]; then
  # Only compile a new dictionary if the wordlist has changed: the checksum
  # of the wordlist is part of the dictionary file name. Dictionaries left
  # over from older wordlists are removed at the end of the script.
  if command -v crc32 >/dev/null 2>&1; then
    checksum="$(crc32 "$WORDLIST")"
  else
    # 'crc32' is not installed everywhere; 'cksum' is specified by POSIX.
    # Its output is "<crc> <size>", keep only the CRC field.
    checksum="$(cksum < "$WORDLIST")"
    checksum="${checksum%% *}"
  fi
  WORDDICT="/tmp/$(basename "$0" '.sh')-${checksum}.rws"

  if [[ ! -f "$WORDDICT" ]]; then
    echo "${LOG_NAME} Compiling dictionary: ${WORDDICT}" >&2
    aspell --lang=en --encoding=utf-8 create master "$WORDDICT" < "$WORDLIST"
  else
    echo "${LOG_NAME} Using pre-compiled dictionary: ${WORDDICT}" >&2
  fi
fi
131+
132+
133+
## Main
134+
135+
# For each file given on the command line: read its text, strip the notebook
# button table, then either print the text (-p) or list the misspelled words
# with their counts.
for fp in "$@"; do
  if [[ ! -f "$fp" ]]; then
    echo "${LOG_NAME} Error: File doesn't exist: ${fp}" >&2
    exit 1
  fi

  echo "File: $fp" >&2

  # "$fp" is quoted so paths containing whitespace reach the reader intact.
  contents="$(read_file_contents "$fp")"
  # Strip extras
  contents="$(printf '%s\n' "$contents" \
    | sed -e '/^<table class="tfo-notebook-buttons" align="left">/,/<\/table>/d')"

  if [[ -n "$OPT_PRINT_STDOUT_ONLY" ]]; then
    # No spell check, just print file contents and move on
    printf '%s\n' "$contents"
    continue
  fi

  # Options are kept in an array so each one is passed as exactly one word.
  aspell_opts=(--lang=en_US --encoding=utf-8)

  if [[ -z "$OPT_CHECK_CODE_TAGS" ]]; then
    aspell_opts+=(--add-html-skip=code --add-html-skip=pre)
  fi

  printf '%s\n' "$contents" \
    | aspell list "${aspell_opts[@]}" --mode=html --add-extra-dicts="$WORDDICT" \
    | sort \
    | uniq -c
done
166+
167+
168+
# Cleanup old aspell dicts
169+
if [[ -f "$WORDDICT" ]]; then
  # Only remove compiled dictionaries created by this script
  # ("<script name>-<checksum>.rws"), never other files in /tmp that merely
  # start with the script name. The dictionary in use is kept.
  find /tmp -maxdepth 1 -type f \
    -name "$(basename "$0" '.sh')-*.rws" ! -path "$WORDDICT" \
    -delete
fi

tools/spelltest/tmpl/html.tpl

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{%- extends 'full.tpl' -%}
2+
3+
{# Do not print input/code cells #}
4+
{% block input_group -%}
5+
{% endblock input_group %}

tools/spelltest/tmpl/md.tpl

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{%- extends 'markdown.tpl' -%}
2+
3+
{# Do not print input/code cells #}
4+
{% block input_group -%}
5+
{% endblock input_group %}

tools/spelltest/wordlist.txt

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
APIs
2+
autoencoder
3+
backend
4+
backpropagation
5+
bfloat
6+
bytecode
7+
CIFAR
8+
Colab
9+
Colaboratory
10+
CPUs
11+
CSV
12+
CUDA
13+
dataset
14+
datasets
15+
dtype
16+
GPU
17+
GPUs
18+
ImageNet
19+
initializer
20+
JSON
21+
Keras
22+
LSTM
23+
MNIST
24+
NaNs
25+
NumPy
26+
optimizer
27+
perceptron
28+
pseudocode
29+
PyPI
30+
quickstart
31+
ResNet
32+
RNN
33+
runtime
34+
SavedModel
35+
sigmoid
36+
softmax
37+
TensorBoard
38+
TensorFlow
39+
TPU
40+
TPUs
41+
VAE
42+
VGG
43+
XLA

0 commit comments

Comments
 (0)