Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for parsing non-English chars along with English title & more language patterns #66

Open
wants to merge 27 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
d3fea07
Improvements
platelminto Oct 27, 2023
93192ac
Merge branch 'dev'
platelminto Oct 27, 2023
0d1e05e
Update python-publish.yml
platelminto Oct 27, 2023
d326bb7
Delete .github/workflows directory
platelminto Oct 27, 2023
60ee833
Fix only-title torrent names
platelminto Oct 30, 2023
6fe2362
Remove complete series words
platelminto Dec 30, 2023
8ece810
Fix overlapping stuff for seasons
platelminto Dec 31, 2023
6615cba
Improve season range support when many are listed
platelminto Dec 31, 2023
ca89c7b
Improve French subtitle support
platelminto Dec 31, 2023
ee7a3b6
Improve site matching at beginning of title
platelminto Dec 31, 2023
16835f0
Bump version
platelminto Dec 31, 2023
20ae328
added standard resolution types
mhdzumair Jan 4, 2024
74de856
reorder the pattern from highest to lowest.
mhdzumair Jan 4, 2024
c3002b7
Added new test title
mhdzumair Jan 4, 2024
3f399fe
Merge branch 'master' of https://github.com/platelminto/parse-torrent…
mhdzumair Jan 25, 2024
d9a21d2
fix torrent name and site parsing
mhdzumair Jan 25, 2024
a90609f
Merge branch 'dev' of https://github.com/platelminto/parse-torrent-title
mhdzumair Feb 15, 2024
5f4c12b
Add site regex description
mhdzumair Feb 15, 2024
1e4137d
Add more language patterns
mhdzumair May 27, 2024
e6daab6
rename test data generator to not trigger unit test by default
mhdzumair May 27, 2024
c91e1c1
#64: Add support for parsing non-english chars along with english title
mhdzumair May 27, 2024
1c61386
Merge branch 'dev' into master
mhdzumair May 27, 2024
99be918
Standardize PTN parsers
mhdzumair Jul 12, 2024
4997a6f
Refactor & Cleanup & Remove support for python 2
mhdzumair Jul 13, 2024
9800516
Further improvements
mhdzumair Jul 13, 2024
9e259e9
Compile the regex for Improve performance & doc strings
mhdzumair Jul 13, 2024
2caa376
Merge remote-tracking branch 'origin/master'
mhdzumair Jul 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 13 additions & 12 deletions PTN/__init__.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,23 @@
#!/usr/bin/env python

import pkgutil
import sys

# Regex in python 2 is very slow so we check if the faster 'regex' library is available.
faster_regex = pkgutil.find_loader("regex")
if faster_regex is not None and sys.version_info[0] < 3:
re = faster_regex.load_module("regex")
else:
re = pkgutil.find_loader("re").load_module("re")

from .parse import PTN

__author__ = "Giorgio Momigliano"
__email__ = "[email protected]"
__version__ = "2.8.2"
__license__ = "MIT"

# Singleton instance of PTN
_ptn_instance = PTN()


def parse(name: str, standardise: bool = True, coherent_types: bool = False) -> dict:
    """
    Parse the torrent title into its components.

    Delegates to the module-level ``_ptn_instance`` so that a new ``PTN``
    object is not constructed on every call.

    :param name: The torrent name to parse.
    :param standardise: Whether to standardise the parsed values.
    :param coherent_types: Whether to ensure coherent types in the parsed results.
    :return: A dictionary of parsed components.
    """
    return _ptn_instance.parse(name, standardise, coherent_types)
172 changes: 76 additions & 96 deletions PTN/extras.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,66 @@
#!/usr/bin/env python
from typing import List, Tuple, Union

# Helper functions and constants for patterns.py

# Character class of the separator characters; it is interpolated into other
# patterns in this module. Kept as a raw string so the regex escapes are not
# treated as (invalid) Python string escapes.
delimiters = r"[\.\s\-\+_\/(),]"

# (regex, standardised language name) pairs.
# Order is significant: the English pattern can match a lone "e", so it must
# stay the last entry.
langs = [
    (r"rus(?:sian)?|russo", "Russian"),
    (r"(?:True)?fre?(?:nch)?|fr(?:ench|a|e|anc[eê]s)?", "French"),
    (r"(?:nu)?ita(?:liano?)?", "Italian"),
    (r"castellano|spa(?:nish)?|esp?", "Spanish"),
    (r"swedish", "Swedish"),
    (r"dk|dan(?:ish)?", "Danish"),
    (r"ger(?:man)?|deu(?:tsch)?|alem[aã]o", "German"),
    (r"nordic", "Nordic"),
    (r"exyu", "ExYu"),
    (r"chs|chi(?:nese)?|(?:mand[ae]rin|ch[sn])|chin[eê]s|zh-hans", "Chinese"),
    (r"hin(?:di)?", "Hindi"),
    (r"polish|poland|pl", "Polish"),
    (r"kor(?:ean)?|coreano", "Korean"),
    (r"ben(?:gali)?|bangla", "Bengali"),
    (r"kan(?:nada)?", "Kannada"),
    (r"t[aâ]m(?:il)?", "Tamil"),
    (r"tel(?:ugu)?", "Telugu"),
    (r"mar(?:athi)?", "Marathi"),
    (r"mal(?:ayalam)?", "Malayalam"),
    (r"guj(?:arati)?", "Gujarati"),
    (r"pun(?:jabi)?", "Punjabi"),
    (r"ori(?:ya)?", "Oriya"),
    (r"japanese|ja?p|jpn|japon[eê]s", "Japanese"),
    (r"interslavic", "Interslavic"),
    (r"ara(?:bic)?", "Arabic"),
    (r"urdu", "Urdu"),
    (r"tur(?:kish)?|tr", "Turkish"),
    (r"tailand[eê]s|thai?", "Thai"),
    (r"tagalog", "Tagalog"),
    (r"ind(?:onesian)?", "Indonesian"),
    (r"vie(?:tnamese)?", "Vietnamese"),
    (r"heb(?:rew)?", "Hebrew"),
    (r"gre(?:ek)?", "Greek"),
    (r"cz(?:ech)?", "Czech"),
    (r"hun(?:garian)?", "Hungarian"),
    (r"ukr(?:ainian)?", "Ukrainian"),
    (r"fin(?:nish)?", "Finnish"),
    (r"nor(?:wegian)?", "Norwegian"),
    (r"sin(?:hala)?", "Sinhala"),
    (r"dutch|nl", "Dutch"),
    # NOTE(review): second Punjabi entry; it also accepts the "pan" spelling and
    # overlaps the earlier "pun(?:jabi)?" one. Confirm whether both are needed.
    (r"p[ua]n(?:jabi)?", "Punjabi"),
    (r"por(?:tuguese)?|portugu[eèê]s[ea]?|p[rt]|port?", "Portuguese"),
    (r"alb(?:anian?)?|albanais", "Albanian"),
    (r"egypt(?:ian)?|egy", "Egyptian"),
    (r"en?(?:g(?:lish)?)?|ing(?:l[eéê]s)?", "English"),  # Must be at end, matches just an 'e'
]

# (regex, standardised genre name) pairs.
genres = [
    (r"Sci-?Fi", "Sci-Fi"),
    (r"Drama", "Drama"),
    (r"Comedy", "Comedy"),
    (r"West(?:\.|ern)?", "Western"),
    (r"Action", "Action"),
    (r"Adventure", "Adventure"),
    (r"Thriller", "Thriller"),
]

# Match strings like "complete series" for tv seasons/series, matching within the final title string.
Expand Down Expand Up @@ -78,45 +95,36 @@
# or a season. So if we have a language in the title it won't cause issues by getting matched.
# Empty list indicates to always do so, as opposed to matching specific regexes.
# Maps a pattern key to the regexes this rule applies to; an empty list means
# the rule applies to every match for that key.
patterns_ignore_title = {
    "languages": [],
    "audio": [r"LiNE"],
    "network": [r"Hallmark"],
    "untouched": [],
    "internal": [],
    "limited": [],
    "proper": [],
    # "EXTENDED" followed by a delimiter, unless "CUT" or "EDITION(S)" comes next.
    "extended": [rf"(EXTENDED{delimiters}(?!(?:CUT|EDITIONS?)))"],
}


channels = [(1, 0), (2, 0), (5, 0), (5, 1), (6, 1), (7, 1)]


def get_channel_audio_options(patterns_with_names: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """
    Expand audio patterns into channel-specific variants plus a channel-less one.

    For each ``(audio_pattern, name)`` pair, one option is emitted per entry of
    the module-level ``channels`` list (regex with the channel layout appended,
    name suffixed with ``"<speakers>.<subwoofers>"``), followed by one option
    that matches the audio pattern on its own.

    :param patterns_with_names: ``(regex, name)`` pairs for the audio formats.
    :return: A list of ``(regex, name)`` tuples; an empty input gives ``[]``.
    """
    options = []
    for audio_pattern, name in patterns_with_names:
        for speakers, subwoofers in channels:
            options.append(
                (
                    rf"((?:{audio_pattern}){delimiters}*{speakers}[. \-]?{subwoofers}(?:ch)?)",
                    f"{name} {speakers}.{subwoofers}",
                )
            )
        # Appended after the channel variants: placed before them, the bare
        # pattern would match first.
        options.append((rf"({audio_pattern})", name))
    return options


def prefix_pattern_with(prefixes, pattern_options, between="", optional=False):
if optional:
optional_char = "?"
else:
optional_char = ""
def prefix_pattern_with(prefixes: Union[str, List[str]], pattern_options: Union[str, List[Union[str, Tuple]]], between: str = "", optional: bool = False) -> List[Union[str, Tuple]]:
optional_char = "?" if optional else ""
options = []
if not isinstance(prefixes, list):
prefixes = [prefixes]
Expand All @@ -126,28 +134,19 @@ def prefix_pattern_with(prefixes, pattern_options, between="", optional=False):
for pattern_option in pattern_options:
if isinstance(pattern_option, str):
options.append(
"(?:{}){}(?:{})?({})".format(
prefix, optional_char, between, pattern_option
)
rf"(?:{prefix}){optional_char}(?:{between})?({pattern_option})"
)
else:
options.append(
(
"(?:{}){}(?:{})?({})".format(
prefix, optional_char, between, pattern_option[0]
),
)
+ pattern_option[1:]
rf"(?:{prefix}){optional_char}(?:{between})?({pattern_option[0]})",
) + pattern_option[1:]
)

return options


def suffix_pattern_with(suffixes, pattern_options, between="", optional=False):
if optional:
optional_char = "?"
else:
optional_char = ""
def suffix_pattern_with(suffixes: Union[str, List[str]], pattern_options: Union[str, List[Union[str, Tuple]]], between: str = "", optional: bool = False) -> List[Union[str, Tuple]]:
optional_char = "?" if optional else ""
options = []
if not isinstance(suffixes, list):
suffixes = [suffixes]
Expand All @@ -158,36 +157,17 @@ def suffix_pattern_with(suffixes, pattern_options, between="", optional=False):
if isinstance(pattern_option, tuple):
options.append(
(
"({})(?:{})?(?:{}){}".format(
pattern_option[0], between, suffix, optional_char
),
)
+ pattern_option[1:]
rf"({pattern_option[0]})(?:{between})?(?:{suffix}){optional_char}",
) + pattern_option[1:]
)
else:
options.append(
"({})(?:{})?(?:{}){}".format(
pattern_option, between, suffix, optional_char
)
rf"({pattern_option})(?:{between})?(?:{suffix}){optional_char}"
)

return options


def link_patterns(pattern_options: Union[str, List[Union[str, Tuple]]]) -> str:
    """
    Link a regex/tuple list into a single non-capturing alternation.

    This lets a pattern list be reused inside another regex while the list
    itself keeps its tuples (and with them the standardisation names).

    :param pattern_options: A single regex string, or a list whose items are
        regex strings or tuples with the regex as their first element.
    :return: The input unchanged when it is not a list, otherwise
        ``"(?:a|b|...)"`` built from the regex of every item.
    """
    if not isinstance(pattern_options, list):
        return pattern_options
    alternatives = [
        option[0] if isinstance(option, tuple) else option
        for option in pattern_options
    ]
    return "(?:" + "|".join(alternatives) + ")"
Loading