diff --git a/mnamer/endpoints.py b/mnamer/endpoints.py index f11b56ec..aac8a8a8 100644 --- a/mnamer/endpoints.py +++ b/mnamer/endpoints.py @@ -165,7 +165,7 @@ def tmdb_movies( def tmdb_search_movies( api_key: str, title: str, - year: int | str | None = None, + year: int | None = None, language: Language | None = None, region: str | None = None, adult: bool = False, diff --git a/mnamer/exceptions.py b/mnamer/exceptions.py index ccf40ca5..00e1ca06 100644 --- a/mnamer/exceptions.py +++ b/mnamer/exceptions.py @@ -19,3 +19,13 @@ class MnamerNetworkException(MnamerException): class MnamerNotFoundException(MnamerException): """Raised when a lookup or search works as expected yet yields no results.""" + + +class MnamerFailedLangGuesserInstantiation(MnamerException): + """ + Raised when a requested text language guesser failed to instantiate. + """ + + +class MnamerNoSuchLangGuesser(MnamerException): + """Raised when a requested text language guesser name does not match any known guessers.""" diff --git a/mnamer/metadata.py b/mnamer/metadata.py index df43382a..88910904 100644 --- a/mnamer/metadata.py +++ b/mnamer/metadata.py @@ -16,7 +16,6 @@ str_fix_padding, str_replace_slashes, str_title_case, - year_parse, ) @@ -44,6 +43,7 @@ class Metadata: language_sub: Language | None = None quality: str | None = None synopsis: str | None = None + original_filename: str | None = None @classmethod def to_media_type(cls) -> MediaType: @@ -111,21 +111,32 @@ class MetadataMovie(Metadata): """ name: str | None = None - year: str | None = None + date: dt.date | None = None id_imdb: str | None = None id_tmdb: str | None = None + def __post_init__(self): + if isinstance(self.date, str): + self.date = parse_date(self.date) + def __format__(self, format_spec: str | None): - default = "{name} ({year})" - re_pattern = r"({(\w+)(?:\[[\w:]+\])?(?:\:\d{1,2})?})" + default = "{name} ({date.year})" + re_pattern = r"({(\w+)(?:\[[\w:]+\]|\.\w+)?(?:\:\d{1,2})?})" + tname = '' + if ( format_spec is None 
or re.search("{(date.year|date)}", format_spec) is not None ) \ + and self.name is not None and self.date is not None \ + and self.name.endswith(f" ({self.date.year})"): + tname = f" ({self.date.year})" + self.name = self.name[:-len(tname)] s = re.sub(re_pattern, self._format_repl, format_spec or default) s = str_fix_padding(s) + self.name+=tname return s def __setattr__(self, key: str, value: Any): converter_map: dict[str, Callable] = { "name": fn_pipe(str_replace_slashes, str_title_case), - "year": year_parse, + "date": parse_date, } converter: Callable | None = converter_map.get(key) if value is not None and converter: @@ -141,6 +152,7 @@ class MetadataEpisode(Metadata): """ series: str | None = None + series_date: dt.date | None = None season: int | None = None episode: int | None = None date: dt.date | None = None @@ -155,12 +167,21 @@ def __post_init__(self): self.episode = int(self.episode) if isinstance(self.date, str): self.date = parse_date(self.date) + if isinstance(self.series_date, str): + self.series_date = parse_date(self.series_date) def __format__(self, format_spec: str | None): default = "{series} - {season:02}x{episode:02} - {title}" - re_pattern = r"({(\w+)(?:\[[\w:]+\])?(?:\:\d{1,2})?})" + re_pattern = r"({(\w+)(?:\[[\w:]+\]|\.\w+)?(?:\:\d{1,2})?})" + tseries = '' + if ( format_spec is None or re.search("{(series_date.year|series_date|date.year|date)}", format_spec) is not None ) \ + and self.series is not None and self.series_date is not None \ + and self.series.endswith(f" ({self.series_date.year})"): + tseries = f" ({self.series_date.year})" + self.series = self.series[:-len(tseries)] s = re.sub(re_pattern, self._format_repl, format_spec or default) s = str_fix_padding(s) + self.series+=tseries return s def __setattr__(self, key: str, value: Any): @@ -169,6 +190,7 @@ def __setattr__(self, key: str, value: Any): "episode": int, "season": int, "series": fn_pipe(str_replace_slashes, str_title_case), + "series_date": parse_date, "title": 
fn_pipe(str_replace_slashes, str_title_case), } converter: Callable | None = converter_map.get(key) diff --git a/mnamer/providers.py b/mnamer/providers.py index e4e83c04..ff37aaa7 100644 --- a/mnamer/providers.py +++ b/mnamer/providers.py @@ -28,7 +28,7 @@ from mnamer.metadata import Metadata, MetadataEpisode, MetadataMovie from mnamer.setting_store import SettingStore from mnamer.types import MediaType, ProviderType -from mnamer.utils import parse_date, year_range_parse +from mnamer.utils import parse_date class Provider(ABC): @@ -83,7 +83,7 @@ def search(self, query: MetadataMovie) -> Iterator[MetadataMovie]: if query.id_imdb: results = self._lookup_movie(query.id_imdb) elif query.name: - results = self._search_movie(query.name, query.year) + results = self._search_movie(query.name, None if query.date is None else query.date.year) else: raise MnamerNotFoundException yield from results @@ -94,15 +94,17 @@ def _lookup_movie(self, id_imdb: str) -> Iterator[MetadataMovie]: try: release_date = dt.datetime.strptime( response["Released"], "%d %b %Y" - ).strftime("%Y-%m-%d") + ) except (KeyError, ValueError): if response.get("Year") in (None, "N/A"): release_date = None else: - release_date = "{}-01-01".format(response["Year"]) + release_date = dt.datetime.strptime( + "{}-01-01".format(response["Year"]), "%Y-%m-%d" + ) meta = MetadataMovie( name=response["Title"], - year=release_date, + date=release_date, synopsis=response["Plot"], id_imdb=response["imdbID"], ) @@ -110,9 +112,9 @@ def _lookup_movie(self, id_imdb: str) -> Iterator[MetadataMovie]: meta.synopsis = None yield meta - def _search_movie(self, name: str, year: str | None) -> Iterator[MetadataMovie]: + def _search_movie(self, name: str, year: int | None) -> Iterator[MetadataMovie]: assert self.api_key - year_from, year_to = year_range_parse(year, 5) + year_from, year_to = (1900 - 5, dt.datetime.now().year + 5) if year is None else (year - 5, year + 5) found = False page = 1 page_max = 10 # each page yields a maximum of 10 results @@ -153,7 +155,7 @@ def search(self, query:
MetadataMovie) -> Iterator[MetadataMovie]: if query.id_tmdb: results = self._search_id(query.id_tmdb, query.language) elif query.name: - results = self._search_name(query.name, query.year, query.language) + results = self._search_name(query.name, None if query.date is None else query.date.year, query.language) else: raise MnamerNotFoundException yield from results @@ -166,13 +168,13 @@ def _search_id( yield MetadataMovie( name=response["title"], language=language, - year=response["release_date"], + date=response["release_date"], synopsis=response["overview"], id_tmdb=response["id"], id_imdb=response["imdb_id"], ) - def _search_name(self, name: str, year: str | None, language: Language | None): + def _search_name(self, name: str, year: int | None, language: Language | None): assert self.api_key page = 1 page_max = 5 # each page yields a maximum of 20 results @@ -193,9 +195,9 @@ def _search_name(self, name: str, year: str | None, language: Language | None): name=entry["title"], language=language, synopsis=entry["overview"], - year=entry["release_date"], + date=entry["release_date"], ) - if not meta.year: + if not meta.date: continue yield meta found = True @@ -229,13 +231,17 @@ def search(self, query: MetadataEpisode) -> Iterator[MetadataEpisode]: if not self.token: self.token = self._login() if query.id_tvdb and query.date: - results = self._search_tvdb_date(query.id_tvdb, query.date, query.language) + results = self._search_tvdb_date( + query.id_tvdb, query.date, query.language, query.season, query.episode + ) elif query.id_tvdb: results = self._search_id( query.id_tvdb, query.season, query.episode, query.language ) elif query.series and query.date: - results = self._search_series_date(query.series, query.date, query.language) + results = self._search_series_date( + query.series, query.date, query.language, query.season, query.episode + ) elif query.series: results = self._search_series( query.series, query.season, query.episode, query.language @@ -275,6 +281,7 @@ 
def _search_id( id_tvdb=id_tvdb, season=entry["airedSeason"], series=series_data["data"]["seriesName"], + series_date=series_data["data"]["firstAired"], language=language, synopsis=(entry["overview"] or "") .replace("\r\n", "") @@ -316,19 +323,41 @@ def _search_series( raise MnamerNotFoundException def _search_tvdb_date( - self, id_tvdb: str, release_date: dt.date, language: Language | None + self, + id_tvdb: str, + release_date: dt.date, + language: Language | None, + season: int | None = None, + episode: int | None = None ): release_date = parse_date(release_date) found = False for meta in self._search_id(id_tvdb, language=language): - if meta.date and meta.date == release_date: - found = True - yield meta + if meta.date: + if season is not None and season == meta.season \ + and episode is not None and episode == meta.episode: + if meta.date == release_date: + found = True + yield meta + elif release_date.month == 1 and release_date.day == 1 and \ + ( meta.date.year == release_date.year or \ + ( meta.series_date is not None and meta.series_date.year == release_date.year ) ): + found = True + yield meta + else: + if meta.date == release_date: + found = True + yield meta if not found: raise MnamerNotFoundException def _search_series_date( - self, series: str, release_date: dt.date, language: Language | None + self, + series: str, + release_date: dt.date, + language: Language | None, + season: int | None = None, + episode: int | None = None ): release_date = parse_date(release_date) series_data = tvdb_search_series( @@ -338,7 +367,7 @@ def _search_series_date( found = False for tvdb_id in tvdb_ids: try: - yield from self._search_tvdb_date(tvdb_id, release_date, language) + yield from self._search_tvdb_date(tvdb_id, release_date, language, season, episode) found = True except MnamerNotFoundException: continue @@ -480,6 +509,7 @@ def _transform_meta( id_tvmaze=id_tvmaze or None, season=episode_entry["season"], series=series_entry["name"], + series_date=series_entry["premiered"],
synopsis=episode_entry["summary"] or None, title=episode_entry["name"] or None, ) diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index 8981d335..919bef80 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -1,5 +1,6 @@ import dataclasses import json +from functools import cached_property from pathlib import Path from typing import Any, Callable @@ -11,6 +12,7 @@ from mnamer.setting_spec import SettingSpec from mnamer.types import MediaType, ProviderType, SettingType from mnamer.utils import crawl_out, json_loads, normalize_containers +from mnamer import text_lang_guesser @dataclasses.dataclass @@ -106,6 +108,15 @@ class SettingStore: help="--language=: specify the search language", ).as_dict(), ) + subtitle_lang_guesser: Language | None = dataclasses.field( + default=None, + metadata=SettingSpec( + flags=["--subtitle-lang-guesser"], + group=SettingType.PARAMETER, + choices=list(text_lang_guesser.available_guessers), + help="--subtitle-lang-guesser=: subtitle file text language guesser (must be installed)", + ).as_dict(), + ) mask: list[str] = dataclasses.field( default_factory=lambda: [ "avi", @@ -177,7 +188,7 @@ class SettingStore: ).as_dict(), ) movie_format: str = dataclasses.field( - default="{name} ({year}).{extension}", + default="{name} ({date.year}).{extension}", metadata=SettingSpec( dest="movie_format", flags=["--movie_format", "--movie-format", "--movieformat"], @@ -217,6 +228,15 @@ class SettingStore: help="--episode-format: set episode renaming format specification", ).as_dict(), ) + symlink: bool = dataclasses.field( + default=False, + metadata=SettingSpec( + action="store_true", + flags=["--symlink"], + group=SettingType.PARAMETER, + help="--symlink: leaves a trailing symlink", + ).as_dict(), + ) # directive attributes ----------------------------------------------------- @@ -322,7 +342,7 @@ class SettingStore: default=False, metadata=SettingSpec( action="store_true", - flags=["--test"], + flags=["--test", 
"--dry-run", "--dryrun"], group=SettingType.DIRECTIVE, help="--test: mocks the renaming and moving of files", ).as_dict(), @@ -367,6 +387,12 @@ def specifications(cls) -> list[SettingSpec]: def _resolve_path(path: str | Path) -> Path: return Path(path).resolve() + @cached_property + def text_lang_guesser(self): + if not self.subtitle_lang_guesser: + return None + return text_lang_guesser.guesser(self.subtitle_lang_guesser, Language.all()) + def __setattr__(self, key: str, value: Any): converter_map: dict[str, Callable] = { "episode_api": ProviderType, diff --git a/mnamer/target.py b/mnamer/target.py index cd9927e2..2a5b8789 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -2,6 +2,7 @@ import datetime as dt from os import path +from os import symlink from pathlib import Path from shutil import move from typing import Any, ClassVar, Type @@ -116,7 +117,8 @@ def destination(self) -> Path: def _parse(self, file_path: Path): path_data: dict[str, Any] = {"language": self._settings.language} - if is_subtitle(self.source): + source_is_subtitle = is_subtitle(self.source) + if source_is_subtitle: try: path_data["language"] = Language.parse(self.source.stem[-2:]) file_path = Path(self.source.parent, self.source.stem[:-2]) @@ -148,6 +150,7 @@ def _parse(self, file_path: Path): None: Metadata, }[media_type] self.metadata = meta_cls() + self.metadata.original_filename = self.source.name self.metadata.quality = ( " ".join( path_data[key] @@ -167,6 +170,10 @@ def _parse(self, file_path: Path): self.metadata.language = path_data.get("language") self.metadata.group = path_data.get("release_group") self.metadata.container = file_path.suffix or None + if "date" in path_data: + self.metadata.date = path_data.get("date") + elif "year" in path_data: + self.metadata.date = "{}-01-01".format(path_data.get("year")) if not self.metadata.language: try: self.metadata.language = path_data.get("language") @@ -176,11 +183,20 @@ def _parse(self, file_path: Path): self.metadata.language_sub 
= path_data.get("subtitle_language") except MnamerException: pass + if ( + source_is_subtitle + and not self.metadata.language_sub + and self._settings.subtitle_lang_guesser + ): + try: + self.metadata.language_sub = ( + self._settings.text_lang_guesser.guess_language(self.source) + ) + except MnamerException: + pass if isinstance(self.metadata, MetadataMovie): self.metadata.name = path_data.get("title") - self.metadata.year = path_data.get("year") elif isinstance(self.metadata, MetadataEpisode): - self.metadata.date = path_data.get("date") self.metadata.episode = path_data.get("episode") self.metadata.season = path_data.get("season") self.metadata.series = path_data.get("title") @@ -242,7 +258,14 @@ def relocate(self) -> None: """Performs the action of renaming and/or moving a file.""" destination_path = Path(self.destination).resolve() destination_path.parent.mkdir(parents=True, exist_ok=True) + if path.islink(destination_path) == True: + print("Skipped symlink") + return try: move(str(self.source), destination_path) + if self._settings.symlink: + if path.islink(destination_path) == False: + symlink(destination_path, str(self.source)) + except OSError: # pragma: no cover raise MnamerException diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py new file mode 100644 index 00000000..fe86c1d9 --- /dev/null +++ b/mnamer/text_lang_guesser/__init__.py @@ -0,0 +1,54 @@ +import logging +from typing import Dict +from mnamer.exceptions import ( + MnamerFailedLangGuesserInstantiation, + MnamerNoSuchLangGuesser, +) +from mnamer.language import Language +from importlib import import_module + + +def _import_module(dotted_module_name: str): + try: + return import_module(dotted_module_name) + except ImportError as e: + logging.debug(f"Failed to import {dotted_module_name}: {e}", exc_info=e) + return None + + +possible_guessers = ( + ("lingua", "mnamer.text_lang_guesser.lingua.LinguaGuesser"), + ("langdetect", 
"mnamer.text_lang_guesser.langdetect.LangdetectGuesser"), + ("fasttext", "mnamer.text_lang_guesser.fasttext.FasttextGuesser"), + ("langid", "mnamer.text_lang_guesser.langid.LangidGuesser"), +) + +available_guessers = {} +for name, module_class in possible_guessers: + module_name, classname = module_class.rsplit(".", 1) + mod = _import_module(module_name) + if mod: + try: + cls = getattr(mod, classname) + except AttributeError as e: + logging.debug( + f"Failed to load class {classname} from module {mod}: {e}", exc_info=e + ) + continue + available_guessers[name] = cls + + +def guesser(name: str, guess_languages: Dict[str, Language]): + if name not in available_guessers: + raise MnamerNoSuchLangGuesser("Unrecognized language guesser") + try: + return available_guessers[name](guess_languages=guess_languages) + except Exception as e: + class_name = available_guessers[name].__name__ + logging.debug( + f"Error trying to instantiate {class_name}", + exc_info=e, + ) + raise MnamerFailedLangGuesserInstantiation( + f"Failed creating guesser {class_name}" + ) diff --git a/mnamer/text_lang_guesser/base.py b/mnamer/text_lang_guesser/base.py new file mode 100644 index 00000000..ea70560c --- /dev/null +++ b/mnamer/text_lang_guesser/base.py @@ -0,0 +1,174 @@ +from abc import ABC, abstractmethod +from pathlib import Path +import logging +import os +import re +from typing import List, Optional +from chardet.universaldetector import UniversalDetector +from mnamer.language import Language + + +class TextLanguageGuesser(ABC): + def __init__(self, guess_languages: List[Language], min_probability: float = 0.9): + self.guess_languages = guess_languages + self.language_map = self._language_map(guess_languages) + self.min_probability = min_probability + self.identifier = self._initialize_identifier() + + exp_only_nums = r"^\d+$" + exp_timeframe = r"^[\s0-9:.,>-]+$" + skip_patterns = [exp_only_nums, exp_timeframe] + self.skip_line_expressions_str = [re.compile(exp) for exp in skip_patterns] 
+ self.skip_line_expressions_bytes = [ + re.compile(exp.encode("ascii")) for exp in skip_patterns + ] + self.encoding_detector = UniversalDetector() + + @abstractmethod + def guess_language_from_text(self, text: str) -> Optional[str]: + """ + Guess the language, based on the text in the file. + """ + pass + + def _language_map(self, lang_list: List[Language]): + """ + Returns a dict that will be used to map an identification result to a Language. + """ + return {lang.a2: lang for lang in lang_list} + + @abstractmethod + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + """ + Set up the language identifier, and return it. + It will be available in self.identifier. + + If restrict_to_langs is present, the identifier should restrict + its identification efforts to the given languages. + + Note that restricting the languages used is usually not a good idea + because it increases the possibility of false positives. + + :param restrict_to_langs: a list of two-letter language codes. + """ + pass + + def _skip_line(self, line, skip_expressions) -> bool: + stripped = line.strip() + if not stripped: + return True + for exp in skip_expressions: + if exp.match(stripped): + return True + return False + + def _detect_file_encoding(self, filepath) -> dict: + """ + Tries to guess the encoding (utf-8, iso-8859-1, etc). 
+ + The returned dict has these fields of interest: + { + "encoding": str, + "confidence": float between 0 and 1 + } + """ + self.encoding_detector.reset() + for line in open(filepath, "rb"): + if self._skip_line(line, self.skip_line_expressions_bytes): + continue + self.encoding_detector.feed(line) + if self.encoding_detector.done: + break + self.encoding_detector.close() + + result = dict(self.encoding_detector.result) + if result["encoding"] == "ascii": + result["encoding"] = "utf-8" + return result + + def _read_lines_from_file( + self, filepath, encoding: str, lines=200, skip_first_lines=10 + ) -> str: + """ + Read a certain number of lines from the file, returning a unicode string. + + Lines that are subtitle control lines (only numbers, or time ranges) + are filtered out, and do not count towards the number of lines. + + By default, the 10 first lines are skipped. The reasoning behind + that is that perhaps the first lines contain subtitle credits + (e.g. a little advertisement for the subtitle creator), which may + not correspond to the principal language of the file. + """ + stop_count = lines + skip_first_lines + text = "" + i = 0 + for line in open(filepath, mode="r", encoding=encoding): + if self._skip_line(line, self.skip_line_expressions_str): + continue + + i += 1 + if i <= skip_first_lines: + continue + + text += line + if i > stop_count: + break + return text + + def _get_file_text(self, filepath) -> Optional[str]: + """ + Tries to determine the file encoding and read some lines from the file. + + If the confidence for the encoding is not high enough, or an error + occurs while reading lines from the file, the return value is None. + """ + encoding = self._detect_file_encoding(filepath) + text = None + if encoding["confidence"] >= 0.6: + try: + text = self._read_lines_from_file( + filepath, encoding=encoding["encoding"] + ) + except Exception as e: + logging.warning( + f"Unable to read file {filepath} with encoding {encoding['encoding']}. 
" + f"Error: {e}" + ) + return text + + @staticmethod + def boolean_env_var(env_var, default=None) -> Optional[bool]: + value = os.getenv(env_var) + if value is None: + return default + value = value.strip().lower() + if value in ["true", "yes", "1"]: + return True + return False + + def guess_language(self, filepath: Path) -> Optional[Language]: + """ + Reads text from the file and passes it the implementation-specific + guess_language_from_text() method. + + If a matching mnamer.Language exists, it is returned, otherwise None. + """ + text = self._get_file_text(filepath) + + if not text: + return None + + guessed_language = None + try: + guessed_language = self.guess_language_from_text(text) + except Exception as e: + logging.warning( + "Unexpected error while guessing language from file text. " + f"File: {filepath}, Error: {e}" + ) + + if not guessed_language: + return None + + return self.language_map.get(guessed_language, None) diff --git a/mnamer/text_lang_guesser/fasttext.py b/mnamer/text_lang_guesser/fasttext.py new file mode 100644 index 00000000..a17c7db4 --- /dev/null +++ b/mnamer/text_lang_guesser/fasttext.py @@ -0,0 +1,35 @@ +from typing import Optional, Dict, Union +from ftlangdetect.detect import get_or_load_model +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +class FasttextGuesser(TextLanguageGuesser): + """ + Installation note: a modern g++ version is required for building fasttext. + """ + + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + # Note: It seems there is no way to restrict languages for fasttext. + low_memory = self.boolean_env_var("FASTTEXT_LOW_MEMORY", False) + return get_or_load_model(low_memory=low_memory) + + def detect(self, text: str) -> Optional[Dict[str, Union[str, float]]]: + """ + Modified version of ftlangdetect.detect.detect, that specifies the threshold. 
+ """ + labels, scores = self.identifier.predict(text, threshold=self.min_probability) + if not labels: + return None + label = labels[0].replace("__label__", "") + score = min(float(scores[0]), 1.0) + return { + "lang": label, + "score": score, + } + + def guess_language_from_text(self, text: str) -> Optional[str]: + text = text.replace("\n", " ").replace("\r", "") + guessed_language = self.detect(text) + if not guessed_language: + return None + return guessed_language["lang"] diff --git a/mnamer/text_lang_guesser/langdetect.py b/mnamer/text_lang_guesser/langdetect.py new file mode 100644 index 00000000..9eb3a724 --- /dev/null +++ b/mnamer/text_lang_guesser/langdetect.py @@ -0,0 +1,47 @@ +import logging +from pathlib import Path +from typing import Optional, List +from langdetect.detector_factory import DetectorFactory, PROFILES_DIRECTORY +from mnamer.language import Language +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +class LangdetectGuesser(TextLanguageGuesser): + def _language_map(self, lang_list: List[Language]): + lang_map = super()._language_map(lang_list) + zh = lang_map.pop("zh", None) + if zh: + # lang-detect has zh-cn and zh-tw. Map them both to mnamer's zh. + lang_map["zh-cn"] = zh + lang_map["zh-tw"] = zh + return lang_map + + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + # Be deterministic. Without this, langdetect could guess different + # languages for the same short text. 
+ DetectorFactory.seed = 0 + + identifier = DetectorFactory() + if restrict_to_langs: + profiles_root = Path(PROFILES_DIRECTORY) + json_profiles = [] + for lang in self.language_map: + profile = profiles_root / lang + if profile.is_file(): + json_profiles.append(profile.read_text(encoding="utf-8")) + else: + logging.warning(f"Language profile not found for language '{lang}'") + identifier.load_json_profile(json_profiles) + else: + identifier.load_profile(PROFILES_DIRECTORY) + return identifier + + def guess_language_from_text(self, text: str) -> Optional[str]: + detector = self.identifier.create() + detector.append(text) + guessed_languages = detector.get_probabilities() + if not guessed_languages: + return None + lang = guessed_languages[0] + if lang.prob >= self.min_probability: + return lang.lang diff --git a/mnamer/text_lang_guesser/langid.py b/mnamer/text_lang_guesser/langid.py new file mode 100644 index 00000000..c1c7671c --- /dev/null +++ b/mnamer/text_lang_guesser/langid.py @@ -0,0 +1,17 @@ +from typing import Optional +from py3langid.langid import LanguageIdentifier, MODEL_FILE +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +class LangidGuesser(TextLanguageGuesser): + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) + if restrict_to_langs: + identifier.set_languages(restrict_to_langs) + return identifier + + def guess_language_from_text(self, text: str) -> Optional[str]: + guessed_language = self.identifier.classify(text) + if not guessed_language or guessed_language[1] < self.min_probability: + return None + return guessed_language[0] diff --git a/mnamer/text_lang_guesser/lingua.py b/mnamer/text_lang_guesser/lingua.py new file mode 100644 index 00000000..794b912f --- /dev/null +++ b/mnamer/text_lang_guesser/lingua.py @@ -0,0 +1,34 @@ +from typing import List, Optional +from lingua import LanguageDetectorBuilder +from lingua 
import Language as LinguaLanguage +from mnamer.language import Language +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +class LinguaGuesser(TextLanguageGuesser): + def _language_map(self, lang_list: List[Language]): + """ + Returns a dict that will be used to map an identification result to a Language. + """ + upcase_map = {lang.name.upper(): lang for lang in lang_list} + + return { + lang: upcase_map[lang.name] + for lang in LinguaLanguage.all() + if lang.name in upcase_map + } + + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + if restrict_to_langs: + language_list = self.language_map.keys() + else: + language_list = LinguaLanguage.all() + + return ( + LanguageDetectorBuilder.from_languages(*language_list) + .with_minimum_relative_distance(self.min_probability) + .build() + ) + + def guess_language_from_text(self, text: str) -> Optional[str]: + return self.identifier.detect_language_of(text) diff --git a/mnamer/utils.py b/mnamer/utils.py index 84df7870..177b3366 100644 --- a/mnamer/utils.py +++ b/mnamer/utils.py @@ -141,6 +141,8 @@ def is_subtitle(container: str | Path | None) -> bool: return False return str(container).endswith(tuple(SUBTITLE_CONTAINERS)) +def is_symlink(path: Path) -> bool: + return os.path.islink(str(path)) def get_session() -> requests_cache.CachedSession: """Convenience function that returns request-cache session singleton.""" @@ -476,30 +478,3 @@ def str_title_case(s: str) -> str: s = s[:pos] + exception.upper() + s[pos + word_length :] return s - - -def year_parse(s: str) -> int | None: - """Parses a year from a string.""" - regex = r"((?:19|20)\d{2})(?:$|[-/]\d{2}[-/]\d{2})" - try: - return int(re.findall(regex, str(s))[0]) - except IndexError: - return None - - -def year_range_parse(years: str | int | None, tolerance: int = 1) -> tuple[int, int]: - """Parses a year or dash-delimited year range.""" - regex = r"^((?:19|20)\d{2})?([-,: ]*)?((?:19|20)\d{2})?$" - default_start = 1900 - 
default_end = CURRENT_YEAR - try: - start, dash, end = re.match(regex, str(years).strip()).groups() # type: ignore - except AttributeError: - start, end, dash = None, None, True - if not start and not end: - start, end, dash = None, None, True - start = int(start or default_start) - end = int(end or default_end) - if not dash: - end = start - return start - tolerance, end + tolerance diff --git a/pyproject.toml b/pyproject.toml index 1d58cc21..5d3c61b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,11 @@ dependencies = { file = "requirements.txt" } [tool.setuptools.dynamic.optional-dependencies] dev = { file = "requirements-dev.txt" } +guess_langid = { file = "requirements-guess-langid.txt" } +guess_lingua = { file = "requirements-guess-lingua.txt" } +guess_fasttext = { file = "requirements-guess-fasttext.txt" } +guess_langdetect = { file = "requirements-guess-langdetect.txt" } +guess_all = { file = "requirements-guess-all.txt" } [build-system] requires = ["setuptools >= 61.0.0", "setuptools_scm[toml] >= 6.2", "wheel"] diff --git a/requirements-dev.txt b/requirements-dev.txt index 314f0072..fe3bcc2f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,13 +2,13 @@ black ~= 23.7.0 build ~= 0.10.0 codecov ~= 2.1.13 isort ~= 5.11.4 -mypy ~= 0.991 +mypy ~= 1.7 pyflakes ~= 3.0.1 -pylint ~= 2.15.10 +pylint ~= 3.0.2 pytest ~= 7.4.0 pytest-cov ~= 4.1.0 pytest-rerunfailures ~= 12.0 -setuptools ~= 68.1.0 +setuptools ~= 68.2.2 twine ~= 4.0.2 -types-requests ~= 2.28 +types-requests ~= 2.31 wheel ~= 0.38.4 diff --git a/requirements-guess-all.txt b/requirements-guess-all.txt new file mode 100644 index 00000000..f90cce91 --- /dev/null +++ b/requirements-guess-all.txt @@ -0,0 +1,5 @@ +chardet >= 5.2.0 +py3langid ~= 0.2.2 +lingua-language-detector ~= 2.0.2 +fasttext-langdetect ~= 1.0.5 +langdetect ~= 1.0.9 diff --git a/requirements-guess-fasttext.txt b/requirements-guess-fasttext.txt new file mode 100644 index 00000000..b04d0dca --- /dev/null +++ 
b/requirements-guess-fasttext.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +fasttext-langdetect ~= 1.0.5 \ No newline at end of file diff --git a/requirements-guess-langdetect.txt b/requirements-guess-langdetect.txt new file mode 100644 index 00000000..7cfaec7e --- /dev/null +++ b/requirements-guess-langdetect.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +langdetect ~= 1.0.9 \ No newline at end of file diff --git a/requirements-guess-langid.txt b/requirements-guess-langid.txt new file mode 100644 index 00000000..e140d37a --- /dev/null +++ b/requirements-guess-langid.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +py3langid ~= 0.2.2 \ No newline at end of file diff --git a/requirements-guess-lingua.txt b/requirements-guess-lingua.txt new file mode 100644 index 00000000..197bd646 --- /dev/null +++ b/requirements-guess-lingua.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +lingua-language-detector ~= 2.0.2 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index be464c26..37d36bac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ appdirs ~= 1.4.4 babelfish ~= 0.6.0 -guessit ~= 3.7.1 +guessit ~= 3.8.0 requests == 2.* requests_cache ~= 0.9.7 setuptools_scm ~= 7.1.0 diff --git a/tests/e2e/test_moving.py b/tests/e2e/test_moving.py index 7b8ccfb6..f6cb1324 100644 --- a/tests/e2e/test_moving.py +++ b/tests/e2e/test_moving.py @@ -211,3 +211,15 @@ def test_ambiguous_language_deletction(e2e_run, setup_test_files): ) result = e2e_run("--batch", ".") assert result.code == 0 + + +@pytest.mark.usefixtures("setup_test_dir") +def test_original_filename(e2e_run, setup_test_files): + setup_test_files("archer.2009.s10e07.webrip.x264-lucidtv.mp4") + result = e2e_run( + "--batch", + "--episode-format='{original_filename}'", + ".", + ) + assert result.code == 0 + assert "archer.2009.s10e07.webrip.x264-lucidtv.mp4" in result.out