Skip to content

Commit

Permalink
improve the handling on misc language codes
Browse files Browse the repository at this point in the history
  • Loading branch information
baxtree committed May 17, 2021
1 parent 1703c15 commit 38a7e0d
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 39 deletions.
1 change: 1 addition & 0 deletions requirements-app.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ toolz==0.9.0
torch~=1.8.1
tornado==5.1.0
transformers~=4.5.1
typing-extensions~=3.7.0
urllib3==1.25.9
Werkzeug>=0.15.3
zict==0.1.3
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ toolz==0.9.0
torch~=1.8.1
tornado==5.1.0
transformers~=4.5.1
typing-extensions~=3.7.0
urllib3==1.25.9
Werkzeug>=0.15.3
zict==0.1.3
Expand Down
10 changes: 4 additions & 6 deletions subaligner/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,13 @@ def main():
action="store_true",
help="Switch off stretch on non-English speech and subtitles)",
)
from aeneas.language import Language
from subaligner.utils import Utils
parser.add_argument(
"-sil",
"--stretch_in_language",
type=str,
choices=Language.ALLOWED_VALUES,
default=Language.ENG,
choices=Utils.get_stretch_language_codes(),
default="eng",
help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will be ignored if either -so or --stretch_off is present",
)
parser.add_argument(
Expand Down Expand Up @@ -136,8 +136,7 @@ def main():
FLAGS, unparsed = parser.parse_known_args()

if FLAGS.languages:
for line in Language.CODE_TO_HUMAN_LIST:
print(line.replace("\t", " "))
print("\n".join(Utils.get_language_table()))
sys.exit(0)
if FLAGS.mode == "":
print("--mode was not passed in")
Expand Down Expand Up @@ -168,7 +167,6 @@ def main():
from subaligner.translator import Translator
from subaligner.exception import UnsupportedFormatException
from subaligner.exception import TerminalException
from subaligner.utils import Utils

try:
if FLAGS.video_path.lower().startswith("http"):
Expand Down
1 change: 0 additions & 1 deletion subaligner/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from .singleton import Singleton
from .subtitle import Subtitle
from .hyperparameters import Hyperparameters
from .translator import Translator
from .exception import TerminalException
from .exception import NoFrameRateException
from .logger import Logger
Expand Down
6 changes: 2 additions & 4 deletions subaligner/subaligner_1pass/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,9 @@ def main():
parser.add_argument("-ver", "--version", action="version", version=__version__)
FLAGS, unparsed = parser.parse_known_args()

from aeneas.language import Language
from subaligner.utils import Utils
if FLAGS.languages:
for line in Language.CODE_TO_HUMAN_LIST:
print(line.replace("\t", " "))
print("\n".join(Utils.get_language_table()))
sys.exit(0)
if FLAGS.video_path == "":
print("--video_path was not passed in")
Expand All @@ -125,7 +124,6 @@ def main():
from subaligner.translator import Translator
from subaligner.exception import UnsupportedFormatException
from subaligner.exception import TerminalException
from subaligner.utils import Utils

try:
if FLAGS.video_path.lower().startswith("http"):
Expand Down
10 changes: 4 additions & 6 deletions subaligner/subaligner_2pass/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ def main():
action="store_true",
help="Switch off stretch on subtitles for non-English speech",
)
from aeneas.language import Language
from subaligner.utils import Utils
parser.add_argument(
"-sil",
"--stretch_in_language",
type=str,
choices=Language.ALLOWED_VALUES,
default=Language.ENG,
choices=Utils.get_stretch_language_codes(),
default="eng",
help="Stretch the subtitle with the supported ISO 639-3 language code [https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes].\nNB: This will be ignored if either -so or --stretch_off is present",
)
parser.add_argument(
Expand Down Expand Up @@ -126,8 +126,7 @@ def main():
FLAGS, unparsed = parser.parse_known_args()

if FLAGS.languages:
for line in Language.CODE_TO_HUMAN_LIST:
print(line.replace("\t", " "))
print("\n".join(Utils.get_language_table()))
sys.exit(0)
if FLAGS.video_path == "":
print("--video_path was not passed in")
Expand Down Expand Up @@ -155,7 +154,6 @@ def main():
from subaligner.translator import Translator
from subaligner.exception import UnsupportedFormatException
from subaligner.exception import TerminalException
from subaligner.utils import Utils

try:
if FLAGS.video_path.lower().startswith("http"):
Expand Down
6 changes: 2 additions & 4 deletions subaligner/subaligner_convert/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,9 @@ def main():
parser.add_argument("-ver", "--version", action="version", version=__version__)
FLAGS, unparsed = parser.parse_known_args()

from aeneas.language import Language
from subaligner.utils import Utils
if FLAGS.languages:
for line in Language.CODE_TO_HUMAN_LIST:
print(line.replace("\t", " "))
print("\n".join(Utils.get_language_table()))
sys.exit(0)
if FLAGS.input_subtitle_path == "":
print("--input_subtitle_path was not passed in")
Expand All @@ -102,7 +101,6 @@ def main():
from subaligner.subtitle import Subtitle
from subaligner.translator import Translator
from subaligner.exception import UnsupportedFormatException, TerminalException
from subaligner.utils import Utils

try:
if FLAGS.input_subtitle_path.lower().startswith("http"):
Expand Down
87 changes: 69 additions & 18 deletions subaligner/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
from captionstransformer.srt import Reader as SrtReader, Writer as SrtWriter
from captionstransformer.transcript import Reader as TranscriptReader, Writer as TranscriptWriter
from bs4 import BeautifulSoup
from typing import Optional, TextIO, BinaryIO, Union, Callable, Any, Tuple
from aeneas.language import Language
from datetime import datetime
from typing import Optional, TextIO, BinaryIO, Union, Callable, Any, Tuple, List, Dict
from .exception import TerminalException
from subaligner.lib.to_srt import STL, SRT

Expand Down Expand Up @@ -597,10 +599,59 @@ def detect_encoding(subtitle_file_path: str) -> str:
return detected["encoding"] if "encoding" in detected else None

@staticmethod
def get_file_root_and_extension(file_path: str) -> Tuple[str, str]:
    """Get the root path and the extension of the input file path.

    The extension is everything after the first extension separator found in
    the file name itself, e.g., "/path/to/root.ext1.ext2" gives
    ("/path/to/root", "ext1.ext2"). Separators that occur in parent
    directories are left untouched, and a file name without any separator
    gives an empty extension.

    Arguments:
        file_path {string} -- The (relative or absolute) path to the file.

    Returns:
        tuple -- the root path and the extension of the input file path.
    """

    abs_path = os.path.abspath(file_path)
    # Only the last path component is split: splitting the whole absolute path
    # would break on a dot inside any parent directory (or the working directory).
    file_name = os.path.basename(abs_path)
    # partition() always yields three parts, so a missing extension becomes ""
    # rather than an IndexError.
    stem, _, extension = file_name.partition(os.extsep)
    return abs_path[:len(abs_path) - len(file_name)] + stem, extension

@staticmethod
def get_stretch_language_codes() -> List[str]:
    """Get language codes used by stretch.

    Returns:
        list -- A new list of the language codes held in Language.ALLOWED_VALUES (derived from ISO 639-3).
    """
    # Return a copy so that callers cannot mutate the shared Language.ALLOWED_VALUES list.
    return list(Language.ALLOWED_VALUES)

@staticmethod
def get_misc_language_codes() -> List[str]:
    """Get all known language codes.

    Returns:
        list -- A new list holding Language.ALLOWED_VALUES followed by the extra codes.
    """
    return Language.ALLOWED_VALUES + Utils._get_extra_language_codes()

@staticmethod
def get_language_table() -> List[str]:
    """Get all known language codes and their human-readable versions.

    Returns:
        list -- The entries of Language.CODE_TO_HUMAN_LIST with tabs replaced by spaces,
                followed by the extra codes (which are listed as bare codes).
    """
    human_readable = [line.replace("\t", " ") for line in Language.CODE_TO_HUMAN_LIST]
    return human_readable + Utils._get_extra_language_codes()

@staticmethod
def _get_extra_language_codes() -> List[str]:
    """Get the extra codes appended after the ones provided by Language.

    Kept in one place so that get_misc_language_codes and get_language_table cannot drift apart.

    Returns:
        list -- A new list of the extra language (group) codes.
    """
    return ['CELTIC', 'NORTH_EU', 'NORWAY', 'ROMANCE', 'SAMI', 'SCANDINAVIA', 'aav', 'aed', 'afa', 'alv', 'art', 'ase',
            'bat', 'bcl', 'bem', 'ber', 'bnt', 'bzs', 'cau', 'ccs', 'ceb', 'cel', 'chk', 'cpf', 'cpp', 'crs', 'csg',
            'csn', 'cus', 'dra', 'efi', 'en_el_es_fi', 'euq', 'fi_nb_no_nn_ru_sv_en', 'fiu', 'fse', 'gaa', 'gem',
            'gil', 'gmq', 'gmw', 'grk', 'guw', 'hil', 'iir', 'ilo', 'inc', 'ine', 'iso', 'itc', 'jap', 'kab', 'kqn',
            'kwn', 'kwy', 'loz', 'lua', 'lue', 'lun', 'luo', 'lus', 'map', 'mfe', 'mfs', 'mkh', 'mos', 'mul', 'nic',
            'niu', 'nso', 'nyk', 'pag', 'phi', 'pis', 'pon', 'poz', 'pqe', 'pqw', 'prl', 'rnd', 'roa', 'run', 'sal',
            'sem', 'sit', 'sla', 'srn', 'ssp', 'swc', 'taw', 'tdt', 'tiv', 'tll', 'toi', 'tpi', 'trk', 'tum', 'tut',
            'tvl', 'tzo', 'umb', 'urj', 'vsl', 'wal', 'war', 'wls', 'yap', 'yua', 'zai', 'zle', 'zls', 'zlw', 'zne']

@staticmethod
def __convert_subtitle(source_file_path: str, source_ext: str, target_file_path: Optional[str], target_ext: str, format: str, frame_rate: Optional[float] = None) -> Tuple[str, str]:
encoding = Utils.detect_encoding(source_file_path)
Expand Down Expand Up @@ -639,30 +690,30 @@ def _run_command(command: str, timeout_secs: int, timeout_msg: str, error_msg: s
os.system("stty sane")

@staticmethod
def _set_text_patch(self, value):
self._text = value
def _set_text_patch(self_patched: Any, value: str) -> None:
self_patched._text = value

@staticmethod
def _read_patch(self):
self.rawcontent = self.fileobject.read()
self.text_to_captions()
return self.captions
def _read_patch(self_patched: Any) -> List:
self_patched.rawcontent = self_patched.fileobject.read()
self_patched.text_to_captions()
return self_patched.captions

@staticmethod
def _text_to_captions_patch(self):
soup = BeautifulSoup(self.rawcontent, features="lxml")
def _text_to_captions_patch(self_patched: Any) -> List:
soup = BeautifulSoup(self_patched.rawcontent, features="lxml")
texts = soup.find_all('text')
for text in texts:
caption = Caption()
caption.start = self.get_start(text)
caption.duration = self.get_duration(text)
caption.start = self_patched.get_start(text)
caption.duration = self_patched.get_duration(text)
caption.text = text.text
self.add_caption(caption)
self_patched.add_caption(caption)

return self.captions
return self_patched.captions

@staticmethod
def _get_utime_patch(self, dt):
def _get_utime_patch(_: Any, dt: datetime) -> str:
start = dt
start_seconds = 3600 * start.hour + 60 * start.minute + start.second
start_milliseconds = start.microsecond // 1000
Expand All @@ -675,9 +726,9 @@ def _get_utime_patch(self, dt):
return ustart

@staticmethod
def _format_time_patch(self, caption):
def _format_time_patch(self_patched: Any, caption: Any) -> Dict:
return {
"start": self.get_utime(caption.start),
"end": self.get_utime(caption.end),
"start": self_patched.get_utime(caption.start),
"end": self_patched.get_utime(caption.end),
"duration": caption.duration.total_seconds()
}
9 changes: 9 additions & 0 deletions tests/subaligner/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,15 @@ def test_get_file_root_and_extension(self):
self.assertEqual("/path/to/root", root)
self.assertEqual("ext1.ext2", extension)

# Pins the size of the list returned by get_stretch_language_codes (expected: 87 codes).
def test_get_stretch_language_codes(self):
self.assertEqual(87, len(Undertest.get_stretch_language_codes()))

# Pins the total number of known codes: the 87 stretch codes plus the 113 extra codes appended by
# get_misc_language_codes.
def test_get_misc_language_codes(self):
self.assertEqual(200, len(Undertest.get_misc_language_codes()))

# Pins the number of rows in the language table; it is expected to match the number of known codes (200).
def test_get_language_table(self):
self.assertEqual(200, len(Undertest.get_language_table()))

@patch("subprocess.Popen.communicate", return_value=1)
def test_throw_exception_on_srt2vtt_with_error_code(self, mock_communicate):
self._assert_exception_on_subproces(lambda: Undertest.srt2vtt(self.real_srt_path, "output"), mock_communicate)
Expand Down

0 comments on commit 38a7e0d

Please sign in to comment.