From ce7110737c25f55c4eeff0d1e7a4b175d9cb5a48 Mon Sep 17 00:00:00 2001 From: yanirmr Date: Mon, 25 Oct 2021 15:53:05 +0300 Subject: [PATCH] update deprecated regex expressions --- nltk_contrib/textgrid.py | 324 +++++++-------------------------------- 1 file changed, 59 insertions(+), 265 deletions(-) diff --git a/nltk_contrib/textgrid.py b/nltk_contrib/textgrid.py index 9cbd77c..51fa030 100644 --- a/nltk_contrib/textgrid.py +++ b/nltk_contrib/textgrid.py @@ -16,8 +16,8 @@ The textgrid corpus reader provides 4 data items and 1 function for each textgrid file. For each tier in the file, the reader provides 10 data items and 2 functions. - -For the full textgrid file: + +For the full textgrid file: - size The number of tiers in the file. @@ -75,7 +75,7 @@ List of (classid, nameid, xmin, xmax, size, transcript). - min_max() - A tuple of (xmin, xmax). + A tuple of (xmin, xmax). - time(non_speech_marker) Returns the utterance time of a given tier. @@ -110,7 +110,6 @@ """) - ################################################################# # TextGrid Class ################################################################# @@ -127,7 +126,7 @@ class TextGrid(object): def __init__(self, read_file): """ - Takes open read file as input, initializes attributes + Takes open read file as input, initializes attributes of the TextGrid file. @type read_file: An open TextGrid file, mode "r". @param size: Number of tiers. @@ -167,21 +166,21 @@ def load(file): def _load_tiers(self, header): """ Iterates over each tier and grabs tier information. - """ + """ tiers = [] if self.text_type == "ChronTextFile": m = re.compile(header) tier_headers = m.findall(self.read_file) - tier_re = " \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\"" + tier_re = r" \d+.?\d* \d+.?\d*[\r\n]+\"[^\"]*\"" for i in range(0, self.size): tier_info = [tier_headers[i]] + \ - re.findall(str(i + 1) + tier_re, self.read_file) + re.findall(str(i + 1) + tier_re, self.read_file) tier_info = "\n".join(tier_info) tiers.append(Tier(tier_info, self.text_type, self.t_time)) return tiers - tier_re = header + "[\s\S]+?(?=" + header + "|$$)" + tier_re = header + r"[\s\S]+?(?=" + header + "|$$)" m = re.compile(tier_re) tier_iter = m.finditer(self.read_file) for iterator in tier_iter: @@ -189,13 +188,13 @@ def _load_tiers(self, header): tier_info = self.read_file[begin:end] tiers.append(Tier(tier_info, self.text_type, self.t_time)) return tiers - + def _check_type(self): """ Figures out the TextGrid format. """ - m = re.match("(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file) + m = re.match(r"(.*)[\r\n](.*)[\r\n](.*)[\r\n](.*)", self.read_file) try: type_id = m.group(1).strip() except AttributeError: @@ -208,24 +207,24 @@ def _check_type(self): text_type = "ooTextFile" elif type_id == "\"Praat chronological TextGrid text file\"": text_type = "ChronTextFile" - else: - raise TypeError("Unknown format '(%s)'", (type_id)) + else: + raise TypeError("Unknown format '(%s)'", type_id) return text_type - + def _find_tiers(self): """ - Splits the textgrid file into substrings corresponding to tiers. + Splits the textgrid file into substrings corresponding to tiers. """ if self.text_type == "ooTextFile": m = OOTEXTFILE - header = " +item \[" + header = r" +item \[" elif self.text_type == "ChronTextFile": m = CHRONTEXTFILE - header = "\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*" + header = r"\"\S+\" \".*\" \d+\.?\d* \d+\.?\d*" elif self.text_type == "OldooTextFile": m = OLDOOTEXTFILE - header = "\".*\"[\r\n]+\".*\"" + header = r"\".*\"[\r\n]+\".*\"" file_info = m.findall(self.read_file)[0] self.xmin = float(file_info[0]) @@ -236,7 +235,7 @@ def _find_tiers(self): return tiers def to_chron(self): - """ + """ @return: String in Chronological TextGrid file format. """ @@ -253,16 +252,16 @@ def to_chron(self): chron_file += tier_header + "\n" transcript = tier.simple_transcript for (xmin, xmax, utt) in transcript: - chron_file += str(idx) + " " + str(xmin) - chron_file += " " + str(xmax) +"\n" + chron_file += str(idx) + " " + str(xmin) + chron_file += " " + str(xmax) + "\n" chron_file += "\"" + utt + "\"\n" return chron_file def to_oo(self): - """ + """ @return: A string in OoTextGrid file format. """ - + oo_file = "" oo_file += "File type = \"ooTextFile\"\n" oo_file += "Object class = \"TextGrid\"\n\n" @@ -293,14 +292,14 @@ def to_oo(self): ################################################################# class Tier(object): - """ + """ A container for each tier. """ def __init__(self, tier, text_type, t_time): """ Initializes attributes of the tier: class, name, xmin, xmax - size, transcript, total time. + size, transcript, total time. Utilizes text_type to guide how to parse the file. @type tier: a tier object; single item in the TextGrid list. @param text_type: TextGrid format @@ -329,69 +328,69 @@ def __init__(self, tier, text_type, t_time): self.mark_type = "intervals" else: self.mark_type = "points" - self.header = [("class", self.classid), ("name", self.nameid), \ - ("xmin", self.xmin), ("xmax", self.xmax), ("size", self.size)] + self.header = [("class", self.classid), ("name", self.nameid), + ("xmin", self.xmin), ("xmax", self.xmax), ("size", self.size)] def __iter__(self): return self - + def _make_info(self): """ Figures out most attributes of the tier object: class, name, xmin, xmax, transcript. """ - trans = "([\S\s]*)" + trans = r"([\S\s]*)" if self.text_type == "ChronTextFile": - classid = "\"(.*)\" +" - nameid = "\"(.*)\" +" - xmin = "(\d+\.?\d*) +" - xmax = "(\d+\.?\d*) *[\r\n]+" + classid = r"\"(.*)\" +" + nameid = r"\"(.*)\" +" + xmin = r"(\d+\.?\d*) +" + xmax = r"(\d+\.?\d*) *[\r\n]+" # No size values are given in the Chronological Text File format. self.size = None size = "" elif self.text_type == "ooTextFile": classid = " +class = \"(.*)\" *[\r\n]+" nameid = " +name = \"(.*)\" *[\r\n]+" - xmin = " +xmin = (\d+\.?\d*) *[\r\n]+" - xmax = " +xmax = (\d+\.?\d*) *[\r\n]+" - size = " +\S+: size = (\d+) *[\r\n]+" + xmin = r" +xmin = (\d+\.?\d*) *[\r\n]+" + xmax = r" +xmax = (\d+\.?\d*) *[\r\n]+" + size = r" +\S+: size = (\d+) *[\r\n]+" elif self.text_type == "OldooTextFile": - classid = "\"(.*)\" *[\r\n]+" - nameid = "\"(.*)\" *[\r\n]+" - xmin = "(\d+\.?\d*) *[\r\n]+" - xmax = "(\d+\.?\d*) *[\r\n]+" - size = "(\d+) *[\r\n]+" + classid = r"\"(.*)\" *[\r\n]+" + nameid = r"\"(.*)\" *[\r\n]+" + xmin = r"(\d+\.?\d*) *[\r\n]+" + xmax = r"(\d+\.?\d*) *[\r\n]+" + size = r"(\d+) *[\r\n]+" m = re.compile(classid + nameid + xmin + xmax + size + trans) self.tier_info = m.findall(self.tier)[0] self.classid = self.tier_info[0] self.nameid = self.tier_info[1] self.xmin = float(self.tier_info[2]) self.xmax = float(self.tier_info[3]) - if self.size != None: + if self.size is not None: self.size = int(self.tier_info[4]) self.transcript = self.tier_info[-1] - + def make_simple_transcript(self): - """ + """ @return: Transcript of the tier, in form [(start_time end_time label)] """ if self.text_type == "ChronTextFile": trans_head = "" - trans_xmin = " (\S+)" - trans_xmax = " (\S+)[\r\n]+" - trans_text = "\"([\S\s]*?)\"" + trans_xmin = r" (\S+)" + trans_xmax = r" (\S+)[\r\n]+" + trans_text = r"\"([\S\s]*?)\"" elif self.text_type == "ooTextFile": - trans_head = " +\S+ \[\d+\]: *[\r\n]+" - trans_xmin = " +\S+ = (\S+) *[\r\n]+" - trans_xmax = " +\S+ = (\S+) *[\r\n]+" - trans_text = " +\S+ = \"([^\"]*?)\"" + trans_head = r" +\S+ \[\d+\]: *[\r\n]+" + trans_xmin = r" +\S+ = (\S+) *[\r\n]+" + trans_xmax = r" +\S+ = (\S+) *[\r\n]+" + trans_text = r" +\S+ = \"([^\"]*?)\"" elif self.text_type == "OldooTextFile": trans_head = "" - trans_xmin = "(.*)[\r\n]+" - trans_xmax = "(.*)[\r\n]+" - trans_text = "\"([\S\s]*?)\"" + trans_xmin = r"(.*)[\r\n]+" + trans_xmax = r"(.*)[\r\n]+" + trans_text = r"\"([\S\s]*?)\"" if self.classid == TEXTTIER: trans_xmin = "" trans_m = re.compile(trans_head + trans_xmin + trans_xmax + trans_text) @@ -402,13 +401,13 @@ def transcript(self): """ @return: Transcript of the tier, as it appears in the file. """ - + return self.transcript def time(self, non_speech_char="."): """ @return: Utterance time of a given tier. - Screens out entries that begin with a non-speech marker. + Screens out entries that begin with a non-speech marker. """ total = 0.0 @@ -418,7 +417,7 @@ def time(self, non_speech_char="."): if utt and not utt[0] == ".": total += (float(time2) - float(time1)) return total - + def tier_name(self): """ @return: Tier name of a given tier. @@ -438,216 +437,11 @@ def min_max(self): @return: (xmin, xmax) tuple for a given tier. """ - return (self.xmin, self.xmax) + return self.xmin, self.xmax def __repr__(self): - return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % (self.classid, self.nameid, self.xmin, self.xmax, 100*self.time()/self.t_time) + return "<%s \"%s\" (%.2f, %.2f) %.2f%%>" % ( + self.classid, self.nameid, self.xmin, self.xmax, 100 * self.time() / self.t_time) def __str__(self): return self.__repr__() + "\n " + "\n ".join(" ".join(row) for row in self.simple_transcript) - -def demo_TextGrid(demo_data): - print("** Demo of the TextGrid class. **") - - fid = TextGrid(demo_data) - print("Tiers: %s" % (fid.size)) - - for i, tier in enumerate(fid): - print("\n***") - print("Tier: %s" % (i + 1)) - print(tier) - -def demo(): - # Each demo demonstrates different TextGrid formats. - print("Format 1") - demo_TextGrid(demo_data1) - print("\nFormat 2") - demo_TextGrid(demo_data2) - print("\nFormat 3") - demo_TextGrid(demo_data3) - - -demo_data1 = """File type = "ooTextFile" -Object class = "TextGrid" - -xmin = 0 -xmax = 2045.144149659864 -tiers? -size = 3 -item []: - item [1]: - class = "IntervalTier" - name = "utterances" - xmin = 0 - xmax = 2045.144149659864 - intervals: size = 5 - intervals [1]: - xmin = 0 - xmax = 2041.4217474125382 - text = "" - intervals [2]: - xmin = 2041.4217474125382 - xmax = 2041.968276643991 - text = "this" - intervals [3]: - xmin = 2041.968276643991 - xmax = 2042.5281632653062 - text = "is" - intervals [4]: - xmin = 2042.5281632653062 - xmax = 2044.0487352585324 - text = "a" - intervals [5]: - xmin = 2044.0487352585324 - xmax = 2045.144149659864 - text = "demo" - item [2]: - class = "TextTier" - name = "notes" - xmin = 0 - xmax = 2045.144149659864 - points: size = 3 - points [1]: - time = 2041.4217474125382 - mark = ".begin_demo" - points [2]: - time = 2043.8338291031832 - mark = "voice gets quiet here" - points [3]: - time = 2045.144149659864 - mark = ".end_demo" - item [3]: - class = "IntervalTier" - name = "phones" - xmin = 0 - xmax = 2045.144149659864 - intervals: size = 12 - intervals [1]: - xmin = 0 - xmax = 2041.4217474125382 - text = "" - intervals [2]: - xmin = 2041.4217474125382 - xmax = 2041.5438290324326 - text = "D" - intervals [3]: - xmin = 2041.5438290324326 - xmax = 2041.7321032910372 - text = "I" - intervals [4]: - xmin = 2041.7321032910372 - xmax = 2041.968276643991 - text = "s" - intervals [5]: - xmin = 2041.968276643991 - xmax = 2042.232189031843 - text = "I" - intervals [6]: - xmin = 2042.232189031843 - xmax = 2042.5281632653062 - text = "z" - intervals [7]: - xmin = 2042.5281632653062 - xmax = 2044.0487352585324 - text = "eI" - intervals [8]: - xmin = 2044.0487352585324 - xmax = 2044.2487352585324 - text = "dc" - intervals [9]: - xmin = 2044.2487352585324 - xmax = 2044.3102321849011 - text = "d" - intervals [10]: - xmin = 2044.3102321849011 - xmax = 2044.5748932104329 - text = "E" - intervals [11]: - xmin = 2044.5748932104329 - xmax = 2044.8329108578437 - text = "m" - intervals [12]: - xmin = 2044.8329108578437 - xmax = 2045.144149659864 - text = "oU" -""" - -demo_data2 = """File type = "ooTextFile" -Object class = "TextGrid" - -0 -2.8 - -2 -"IntervalTier" -"utterances" -0 -2.8 -3 -0 -1.6229213249309031 -"" -1.6229213249309031 -2.341428074708195 -"demo" -2.341428074708195 -2.8 -"" -"IntervalTier" -"phones" -0 -2.8 -6 -0 -1.6229213249309031 -"" -1.6229213249309031 -1.6428291382019483 -"dc" -1.6428291382019483 -1.65372183721983721 -"d" -1.65372183721983721 -1.94372874328943728 -"E" -1.94372874328943728 -2.13821938291038210 -"m" -2.13821938291038210 -2.341428074708195 -"oU" -2.341428074708195 -2.8 -"" -""" - -demo_data3 = """"Praat chronological TextGrid text file" -0 2.8 ! Time domain. -2 ! Number of tiers. -"IntervalTier" "utterances" 0 2.8 -"IntervalTier" "utterances" 0 2.8 -1 0 1.6229213249309031 -"" -2 0 1.6229213249309031 -"" -2 1.6229213249309031 1.6428291382019483 -"dc" -2 1.6428291382019483 1.65372183721983721 -"d" -2 1.65372183721983721 1.94372874328943728 -"E" -2 1.94372874328943728 2.13821938291038210 -"m" -2 2.13821938291038210 2.341428074708195 -"oU" -1 1.6229213249309031 2.341428074708195 -"demo" -1 2.341428074708195 2.8 -"" -2 2.341428074708195 2.8 -"" -""" - -if __name__ == "__main__": - demo() -