Skip to content

Commit

Permalink
platelminto#64: Add support for parsing non-english chars along with …
Browse files Browse the repository at this point in the history
…english title
  • Loading branch information
mhdzumair committed May 27, 2024
1 parent e6daab6 commit c91e1c1
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 8 deletions.
31 changes: 24 additions & 7 deletions PTN/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,27 @@ def _part(self, name, match_slice, clean, overwrite=False):
self.match_slices.append(match_slice)

@staticmethod
def _clean_string(string):
clean = re.sub(r"^( -|\(|\[)", "", string)
if clean.find(" ") == -1 and clean.find(".") != -1:
def _clean_dots(string: str) -> str:
if string.find(" ") == -1 and string.find(".") != -1:
# 4 dots likely means we want an ellipsis and a space
clean = re.sub(r"\.{4,}", "... ", clean)
string = re.sub(r"\.{4,}", "... ", string)

# Replace any instances of less than 3 dots with a space
# Lookarounds are used to prevent the 3-dots (ellipses) from being replaced
clean = re.sub(r"(?<!\.)\.\.(?!\.)", " ", clean)
clean = re.sub(r"(?<!\.)\.(?!\.\.)", " ", clean)
string = re.sub(r"(?<!\.)\.\.(?!\.)", " ", string)
string = re.sub(r"(?<!\.)\.(?!\.\.)", " ", string)
return string

def _clean_string(self, string):
    """Tidy a raw matched fragment into a presentable string.

    The steps run in this order:

    1. drop one leading ``" -"``, ``"("`` or ``"["``;
    2. normalise dots through ``self._clean_dots``;
    3. turn every underscore into a space;
    4. drop one trailing ``"["``, ``")"``, ``"_"``, ``"]"`` or ``"- "``;
    5. trim whitespace, then trim spaces, underscores and hyphens from
       both ends;
    6. normalise dots a second time and trim whitespace again.

    :param string: the raw text to clean.
    :return: the cleaned text.
    """
    # Leading separator / opening bracket left over from the match boundary.
    text = re.sub(r"^( -|\(|\[)", "", string)
    text = self._clean_dots(text)

    # Underscores are treated as word separators.
    text = re.sub(r"_", " ", text)

    # Trailing bracket or dangling "- ", followed by the two trimming passes.
    text = re.sub(r"([\[)_\]]|- )$", "", text)
    text = text.strip().strip(" _-")

    # The dot clean-up is skipped while the text still contains a space, so
    # it is repeated here: titles with non-English characters may only
    # become space-free after the substitutions above.
    return self._clean_dots(text).strip()

def parse(self, name, standardise, coherent_types):
Expand Down Expand Up @@ -358,7 +364,7 @@ def process_title(self):
relative_title_start = m.end()
raw = raw[relative_title_start:]
title_start = relative_title_start + title_start
clean = self._clean_string(raw)
clean = self._clean_string(self.clean_title(raw))
# Re-add title_start to unrelative the index from raw to self.torrent_name
self._part("title", (title_start, title_end), clean)
else:
Expand Down Expand Up @@ -433,3 +439,14 @@ def clean_unmatched(self):
):
filtered.append(extra)
return filtered

@staticmethod
def clean_title(raw_title):
    """Strip non-title decorations from a raw title candidate.

    Each step works on the output of the previous one:

    1. remove a ``[movie]`` / ``(movie)`` marker (any letter case);
    2. replace every ``RUSSIAN_CAST_REGEX`` match with a single space;
    3. keep only capture group 1 of ``RELEASE_GROUP_REGEX_START``, then of
       ``RELEASE_GROUP_REGEX_END``;
    4. delete every ``ALT_TITLES_REGEX`` match;
    5. delete every ``NOT_ONLY_NON_ENGLISH_REGEX`` match.

    No trimming is done here; the result may carry leading or trailing
    whitespace.

    :param raw_title: the title text to clean.
    :return: the title with the decorations above removed.
    """
    # The marker is a regular expression, so it has to go through re.sub:
    # str.replace() searches for the pattern text literally and therefore
    # never removed anything. The "[" inside the character class is escaped
    # to avoid Python's "possible nested set" FutureWarning.
    cleaned_title = re.sub(r"[\[(]movie[)\]]", "", raw_title, flags=re.IGNORECASE)

    # Clear Russian cast information.
    cleaned_title = re.sub(patterns["RUSSIAN_CAST_REGEX"], " ", cleaned_title)

    # Remove a release-group marking section from the start, then an
    # unneeded marking section from the end; group 1 is the text to keep.
    cleaned_title = re.sub(patterns["RELEASE_GROUP_REGEX_START"], r"\1", cleaned_title)
    cleaned_title = re.sub(patterns["RELEASE_GROUP_REGEX_END"], r"\1", cleaned_title)

    # Remove alternative-language titles.
    cleaned_title = re.sub(patterns["ALT_TITLES_REGEX"], "", cleaned_title)

    # Remove non-English characters unless they are the only ones left.
    cleaned_title = re.sub(patterns["NOT_ONLY_NON_ENGLISH_REGEX"], "", cleaned_title)
    return cleaned_title
16 changes: 16 additions & 0 deletions PTN/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,19 @@
"remux": "boolean",
"internationalCut": "boolean",
}

# Character ranges (for use inside a regex character class) that the title
# clean-up treats as "non-English".
patterns["NON_ENGLISH_CHARS"] = "".join(
    (
        "\u3040-\u30ff",  # Japanese characters
        "\u3400-\u4dbf",  # Chinese characters
        "\u4e00-\u9fff",  # Chinese characters
        "\uf900-\ufaff",  # CJK Compatibility Ideographs
        "\uff66-\uff9f",  # Halfwidth Katakana Japanese characters
        "\u0400-\u04ff",  # Cyrillic characters (Russian)
        "\u0600-\u06ff",  # Arabic characters
    )
)

# A trailing "(...)" group containing a Cyrillic character, or everything
# from a "/" up to a trailing "(...)" group.
patterns["RUSSIAN_CAST_REGEX"] = r"\([^)]*[\u0400-\u04ff][^)]*\)$|\/.*\((.*)\)$"

# A "/"- or "|"-delimited segment that contains a non-English character.
patterns["ALT_TITLES_REGEX"] = f"[^/|(]*[{patterns['NON_ENGLISH_CHARS']}][^/|]*/|[/|][^/|(]*[{patterns['NON_ENGLISH_CHARS']}][^/|]*"

# A run that starts and ends with a non-English character and sits next to
# Latin-letter text (or at the start of the string).
patterns["NOT_ONLY_NON_ENGLISH_REGEX"] = rf"(?:[a-zA-Z][^{patterns['NON_ENGLISH_CHARS']}]+|^)[{patterns['NON_ENGLISH_CHARS']}].*[{patterns['NON_ENGLISH_CHARS']}]|[{patterns['NON_ENGLISH_CHARS']}].*[{patterns['NON_ENGLISH_CHARS']}](?=[^{patterns['NON_ENGLISH_CHARS']}]+[a-zA-Z])"

# Leading and trailing symbol runs.
patterns["NOT_ALLOWED_SYMBOLS_AT_START_AND_END"] = rf"^[^\w{patterns['NON_ENGLISH_CHARS']}#[【★]+|[ \-:/\\\[|{{(#$&^]+$"
patterns["REMAINING_NOT_ALLOWED_SYMBOLS_AT_START_AND_END"] = rf"^[^\w{patterns['NON_ENGLISH_CHARS']}#]+|]$"

# A bracketed or starred section at the start / end of the string; group 1
# captures the text outside that section.
patterns["RELEASE_GROUP_REGEX_START"] = r"^[\[【★].*[\]】★][ .]?(.+)"
patterns["RELEASE_GROUP_REGEX_END"] = r"(.+)[ .]?[\[【★].*[\]】★]$"
9 changes: 8 additions & 1 deletion tests/files/input.json
Original file line number Diff line number Diff line change
Expand Up @@ -414,5 +414,12 @@
"Atonement.2017.KOREAN.ENSUBBED.1080p.WEBRip.x264-VXTT",
"Fauda.S01.HEBREW.1080p.NF.WEBRip.DD5.1.x264-TrollHD[rartv]",
"Chinese Zodiac (2012) 1080p BrRip x264 - YIFY",
"Thai Massage (2022) 720p PDVDRip x264 AAC.mkv"
"Thai Massage (2022) 720p PDVDRip x264 AAC.mkv",
"\u6740\u624b\u4e4b\u738b [\u6e2f\u7248\u539f\u76d8/\u56fd\u7ca4\u53cc\u8bed\u4e2d\u5b57].Hitman.1998.1080p.HKG.Blu-ray.AVC.TrueHD.7.1-TAG",
"[www.arabp2p.net]_-_\u062a\u0631\u0643\u064a \u0645\u062a\u0631\u062c\u0645 \u0648\u0645\u062f\u0628\u0644\u062c Last.Call.for.Istanbul.2023.1080p.NF.WEB-DL.DDP5.1.H.264.MKV.torrent",
"\u0413\u043e\u043b\u0443\u0431\u0430\u044f \u0432\u043e\u043b\u043d\u0430 / Blue Crush (2002) DVDRip",
"\u3010\u55b5\u840c\u5976\u8336\u5c4b\u3011\u260501\u6708\u65b0\u756a\u2605[Rebirth][01][720p][\u7b80\u4f53][\u62db\u52df\u7ffb\u8bd1]",
"08.\u041f\u043b\u0430\u043d\u0435\u0442\u0430.\u043e\u0431\u0435\u0437\u044c\u044f\u043d.\u0420\u0435\u0432\u043e\u043b\u044e\u0446\u0438\u044f.2014.BDRip-HEVC.1080p.mkv",
"\u0413\u0440\u0435\u0447\u0435\u0441\u043a\u0430\u044f \u0441\u043c\u043e\u043a\u043e\u0432\u043d\u0438\u0446\u0430 / The fruit is ripe / Griechische Feigen (Siggi G\u00f6tz) [1976, \u0413\u0435\u0440\u043c\u0430\u043d\u0438\u044f, \u042d\u0440\u043e\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u043a\u043e\u043c\u0435\u0434\u0438\u044f, DVDRip]",
"\u041a\u043d\u0438\u0433\u043e\u043d\u043e\u0448\u0438 / \u041a\u043di\u0433\u0430\u043d\u043e\u0448\u044b (1987) TVRip \u043e\u0442 AND03AND | BLR"
]
53 changes: 53 additions & 0 deletions tests/files/output_raw.json
Original file line number Diff line number Diff line change
Expand Up @@ -3793,5 +3793,58 @@
"resolution": "720p",
"title": "Thai Massage",
"year": 2022
},
{
"audio": "TrueHD.7.1",
"codec": "AVC",
"encoder": "TAG",
"quality": "Blu-ray",
"resolution": "1080p",
"title": "] Hitman",
"year": 1998
},
{
"audio": "DDP5.1",
"codec": "H.264",
"encoder": "torrent",
"filetype": "MKV",
"network": "NF",
"quality": "WEB-DL",
"resolution": "1080p",
"site": "www.arabp2p.net",
"title": "Last Call for Istanbul",
"year": 2023
},
{
"quality": "DVDRip",
"title": "Blue Crush",
"year": 2002
},
{
"encoder": "]",
"resolution": "720p",
"site": "简体][招募翻译",
"title": "Rebirth"
},
{
"codec": "HEVC",
"filetype": "mkv",
"quality": "BDRip",
"resolution": "1080p",
"title": "08 Планета обезьян Революция",
"year": 2014
},
{
"encoder": "комедия",
"quality": "DVDRip",
"title": "The fruit is ripe / Griechische Feigen",
"year": 1976
},
{
"encoder": "|",
"quality": "TVRip",
"site": "BLR",
"title": "Кнiганошы",
"year": 1987
}
]
29 changes: 29 additions & 0 deletions tests/files/output_standard.json
Original file line number Diff line number Diff line change
Expand Up @@ -2231,5 +2231,34 @@
"codec": "H.264",
"filetype": "MKV",
"title": "Thai Massage"
},
{
"audio": "Dolby TrueHD 7.1",
"codec": "H.264",
"title": "] Hitman"
},
{
"audio": "Dolby Digital Plus 5.1",
"network": "Netflix",
"title": "Last Call for Istanbul"
},
{
"quality": "DVD-Rip",
"title": "Blue Crush"
},
{
"title": "Rebirth"
},
{
"codec": "H.265",
"filetype": "MKV",
"title": "08 Планета обезьян Революция"
},
{
"quality": "DVD-Rip",
"title": "The fruit is ripe / Griechische Feigen"
},
{
"title": "Кнiганошы"
}
]

0 comments on commit c91e1c1

Please sign in to comment.