
Commit cb47029

Sparrowhawk postprocess (#147)
* Add support for a postprocessor FAR in Sparrowhawk

  Signed-off-by: Anand Joseph <[email protected]>

* Cleanup

  Signed-off-by: Anand Joseph <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  For more information, see https://pre-commit.ci

* Choose between having a postprocessor or not

  Signed-off-by: anand-nv <[email protected]>

---------

Signed-off-by: Anand Joseph <[email protected]>
Signed-off-by: anand-nv <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent: bd10581 · commit: cb47029

4 files changed (+14, -4 lines)


tests/nemo_text_processing/en/test_sparrowhawk_normalization.sh

Lines changed: 3 additions & 0 deletions
@@ -11,7 +11,10 @@ runtest () {
   while read testcase; do
     IFS='~' read written spoken <<< $testcase
     # replace non breaking space with breaking space
+    # Use below if postprocessor is not used. Comment if it is used
     denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')
+    # Use below if postprocessor is used. Comment if it is not used
+    #denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration_pp.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g')

     # trim white space
     spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"

tools/text_processing_deployment/Dockerfile

Lines changed: 1 addition & 2 deletions
@@ -32,7 +32,6 @@ RUN wget https://github.com/protocolbuffers/protobuf/releases/download/v2.5.0/pr
 RUN tar xzvf protobuf-2.5.0.tar.gz
 RUN cd protobuf-2.5.0 && ./configure && make && make install && ldconfig
 RUN conda install -c conda-forge thrax=1.3.4 -y
-RUN git clone https://github.com/yzhang123/sparrowhawk.git
-RUN cd sparrowhawk && git checkout test && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig
+RUN git clone https://github.com/anand-nv/sparrowhawk.git && cd sparrowhawk && git checkout nemo_tests && apt-get install -y autoconf && bash autoreconf && ./configure && make && make install && ldconfig
 RUN git clone https://github.com/kward/shunit2.git
 RUN echo "DONE"

tools/text_processing_deployment/docker/launch.sh

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ elif [[ $MODE == "test_itn_grammars" ]]; then
 fi

 echo $MOUNTS
-docker run -it --rm \
+docker run -it -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 --rm \
   --shm-size=4g \
   --ulimit memlock=-1 \
   --ulimit stack=67108864 \
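The launch.sh change exports LANG=C.UTF-8 and LC_ALL=C.UTF-8 into the container so the normalizer and the test script run under a UTF-8 locale; under the default POSIX locale the non-breaking spaces handled by the test risk being mangled. A minimal sanity-check sketch, not part of the commit, that could be run inside the container before the tests:

import locale
import os

# Report the locale-related environment and the preferred encoding; under a plain
# POSIX/C locale this can come back as ASCII rather than UTF-8.
locale.setlocale(locale.LC_ALL, "")
encoding = locale.getpreferredencoding()
print("LANG     =", os.environ.get("LANG"))
print("LC_ALL   =", os.environ.get("LC_ALL"))
print("encoding =", encoding)

if encoding.lower().replace("-", "") != "utf8":
    raise SystemExit("Set LANG=C.UTF-8 and LC_ALL=C.UTF-8 before running the Sparrowhawk tests")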

tools/text_processing_deployment/pynini_export.py

Lines changed: 9 additions & 1 deletion
@@ -52,6 +52,8 @@ def tn_grammars(**kwargs):
         ).fst
     }
     d['verbalize'] = {'ALL': TNVerbalizeFst(deterministic=True).fst, 'REDUP': pynini.accep("REDUP")}
+    if TNPostProcessingFst is not None:
+        d['post_process'] = {'POSTPROCESSOR': TNPostProcessingFst().fst}
     return d


@@ -66,6 +68,8 @@ def export_grammars(output_dir, grammars):

     for category, graphs in grammars.items():
         out_dir = os.path.join(output_dir, category)
+        if category == "post_process":
+            out_dir = os.path.join(output_dir, "verbalize")
         if not os.path.exists(out_dir):
             os.makedirs(out_dir)
             time.sleep(1)
@@ -113,7 +117,7 @@ def parse_args():

     if args.language in ['pt', 'ru', 'vi', 'es_en', 'mr'] and args.grammars == 'tn_grammars':
         raise ValueError('Only ITN grammars could be deployed in Sparrowhawk for the selected languages.')
-
+    TNPostProcessingFst = None
     if args.language == 'en':
         from nemo_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import (
             ClassifyFst as ITNClassifyFst,
@@ -124,7 +128,11 @@ def parse_args():
         from nemo_text_processing.text_normalization.en.taggers.tokenize_and_classify import (
             ClassifyFst as TNClassifyFst,
         )
+        from nemo_text_processing.text_normalization.en.verbalizers.post_processing import (
+            PostProcessingFst as TNPostProcessingFst,
+        )
         from nemo_text_processing.text_normalization.en.verbalizers.verbalize import VerbalizeFst as TNVerbalizeFst
+
     elif args.language == 'de':
         from nemo_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import (
             ClassifyFst as ITNClassifyFst,
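In pynini_export.py the English TN grammar dictionary now optionally carries a third category: alongside the verbalize FSTs it gains a 'post_process' entry holding the POSTPROCESSOR FST whenever PostProcessingFst could be imported, and export_grammars writes that category into the existing verbalize output directory instead of creating a new one, presumably so a postprocessor-enabled Sparrowhawk configuration can load the FAR from the verbalizer path. The sketch below, not part of the commit, only illustrates that routing: plan_export and the per-grammar .far file naming are hypothetical stand-ins for the real FAR export, and the 'classify'/'TOKENIZE_AND_CLASSIFY' keys are assumptions about the rest of the dictionary, which this diff does not show.

import os

def plan_export(output_dir: str, grammars: dict) -> dict:
    """Map each (category, grammar name) pair to the FAR path it would be written under."""
    plan = {}
    for category, graphs in grammars.items():
        out_dir = os.path.join(output_dir, category)
        # Mirror the commit: 'post_process' grammars land in the verbalize
        # directory rather than in a directory of their own.
        if category == "post_process":
            out_dir = os.path.join(output_dir, "verbalize")
        for name in graphs:
            plan[(category, name)] = os.path.join(out_dir, f"{name}.far")
    return plan

# Stand-ins for the real FSTs; 'post_process' is present only when
# TNPostProcessingFst could be imported for the selected language.
grammars = {
    "classify": {"TOKENIZE_AND_CLASSIFY": None},
    "verbalize": {"ALL": None, "REDUP": None},
    "post_process": {"POSTPROCESSOR": None},
}
for key, path in sorted(plan_export("exported_grammars", grammars).items()):
    print(key, "->", path)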
