
Commit cb47029

Sparrowhawk postprocess (#147)
* Add support for a postprocessor FAR in Sparrowhawk

  Signed-off-by: Anand Joseph <[email protected]>

* Cleanup

  Signed-off-by: Anand Joseph <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

* Choose between having a postprocessor or not

  Signed-off-by: anand-nv <[email protected]>

---------

Signed-off-by: Anand Joseph <[email protected]>
Signed-off-by: anand-nv <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent: bd10581 · commit: cb47029

4 files changed (+14, -4 lines)


tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh

Lines changed: 3 additions & 0 deletions
@@ -11,7 +11,10 @@ runtest () {
   while read testcase; do
     IFS='~' read written spoken <<< $testcase
     # replace non breaking space with breaking space
+    # Use below if postprocessor is not used. Comment if it is used
     denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+    # Use below if postprocessor is used. Comment if it is not used
+    #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')

     # trim white space
     spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"

tools/text_processing_deployment/Dockerfile

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,6 @@ RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/pr
 RUN tar xzvf protobuf-2.5.0.tar.gz
 RUN cd protobuf-2.5.0 && ./configure && make && make install && ldconfig
 RUN conda install -c conda-forge thrax=1.3.4 -y
-RUN git clone https://github.com/yzhang123/sparrowhawk.git
-RUN cd sparrowhawk && git checkout test && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig
+RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig
 RUN git clone https://github.com/kward/shunit2.git
 RUN echo "DONE"

tools/text_processing_deployment/docker/launch.sh

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ elif [[ $MODE == "test_itn_grammars" ]]; then
 fi

 echo $MOUNTS
-docker run -it --rm \
+docker run -it -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 --rm \
   --shm-size=4g \
   --ulimit memlock=-1 \
   --ulimit stack=67108864 \
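The launch.sh change exports LANG=C.UTF-8 and LC_ALL=C.UTF-8 into the container so the normalizer and the test script run under a UTF-8 locale; under the default POSIX locale the non-breaking spaces handled by the test risk being mangled. A minimal sanity-check sketch, not part of the commit, that could be run inside the container before the tests:

import locale
import os

# Report the locale-related environment and the preferred encoding; under a plain
# POSIX/C locale this can come back as ASCII rather than UTF-8.
locale.setlocale(locale.LC_ALL, "")
encoding = locale.getpreferredencoding()
print("LANG     =", os.environ.get("LANG"))
print("LC_ALL   =", os.environ.get("LC_ALL"))
print("encoding =", encoding)

if encoding.lower().replace("-", "") != "utf8":
    raise SystemExit("Set LANG=C.UTF-8 and LC_ALL=C.UTF-8 before running the Sparrowhawk tests")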

tools/text_processing_deployment/pynini_export.py

Lines changed: 9 additions & 1 deletion
@@ -52,6 +52,8 @@ def tn_grammars(**kwargs):
         ).fst
     }
     d['verbalize'] = {'ALL': TNVerbalizeFst(deterministic=True).fst, 'REDUP': pynini.accep("REDUP")}
+    if TNPostProcessingFst is not None:
+        d['post_process'] = {'POSTPROCESSOR': TNPostProcessingFst().fst}
     return d


@@ -66,6 +68,8 @@ def export_grammars(output_dir, grammars):

     for category, graphs in grammars.items():
         out_dir = os.path.join(output_dir, category)
+        if category == "post_process":
+            out_dir = os.path.join(output_dir, "verbalize")
         if not os.path.exists(out_dir):
             os.makedirs(out_dir)
             time.sleep(1)
@@ -113,7 +117,7 @@ def parse_args():

     if args.language in ['pt', 'ru', 'vi', 'es_en', 'mr'] and args.grammars == 'tn_grammars':
         raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.')
-
+    TNPostProcessingFst = None
     if args.language == 'en':
         from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import (
             ClassifyFst as ITNClassifyFst,
@@ -124,7 +128,11 @@ def parse_args():
         from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import (
             ClassifyFst as TNClassifyFst,
         )
+        from nemo_text_processing.text_normalization.en.verbalizers.post_processing import (
+            PostProcessingFst as TNPostProcessingFst,
+        )
         from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
+
     elif args.language == 'de':
         from nemo_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import (
             ClassifyFst as ITNClassifyFst,
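In pynini_export.py the English TN grammar dictionary now optionally carries a third category: alongside the verbalize FSTs it gains a 'post_process' entry holding the POSTPROCESSOR FST whenever PostProcessingFst could be imported, and export_grammars writes that category into the existing verbalize output directory instead of creating a new one, presumably so a postprocessor-enabled Sparrowhawk configuration can load the FAR from the verbalizer path. The sketch below, not part of the commit, only illustrates that routing: plan_export and the per-grammar .far file naming are hypothetical stand-ins for the real FAR export, and the 'classify'/'TOKENIZE_AND_CLASSIFY' keys are assumptions about the rest of the dictionary, which this diff does not show.

import os

def plan_export(output_dir: str, grammars: dict) -> dict:
    """Map each (category, grammar name) pair to the FAR path it would be written under."""
    plan = {}
    for category, graphs in grammars.items():
        out_dir = os.path.join(output_dir, category)
        # Mirror the commit: 'post_process' grammars land in the verbalize
        # directory rather than in a directory of their own.
        if category == "post_process":
            out_dir = os.path.join(output_dir, "verbalize")
        for name in graphs:
            plan[(category, name)] = os.path.join(out_dir, f"{name}.far")
    return plan

# Stand-ins for the real FSTs; 'post_process' is present only when
# TNPostProcessingFst could be imported for the selected language.
grammars = {
    "classify": {"TOKENIZE_AND_CLASSIFY": None},
    "verbalize": {"ALL": None, "REDUP": None},
    "post_process": {"POSTPROCESSOR": None},
}
for key, path in sorted(plan_export("exported_grammars", grammars).items()):
    print(key, "->", path)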
