From 9f19b5f5eff0f58dffb864bfe84e0ee4963e2ae9 Mon Sep 17 00:00:00 2001 From: zebdo Date: Tue, 31 Jan 2023 22:20:07 +1100 Subject: [PATCH 01/29] added --symlink --- mnamer/setting_store.py | 9 +++++++++ mnamer/target.py | 3 +++ 2 files changed, 12 insertions(+) diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index 8981d335..29c11582 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -217,6 +217,15 @@ class SettingStore: help="--episode-format: set episode renaming format specification", ).as_dict(), ) + symlink: bool = dataclasses.field( + default=False, + metadata=SettingSpec( + action="store_true", + flags=["--symlink"], + group=SettingType.PARAMETER, + help="--symlink: leaves a trailing symlink", + ).as_dict(), + ) # directive attributes ----------------------------------------------------- diff --git a/mnamer/target.py b/mnamer/target.py index cd9927e2..01a15643 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -244,5 +244,8 @@ def relocate(self) -> None: destination_path.parent.mkdir(parents=True, exist_ok=True) try: move(str(self.source), destination_path) + if self._settings.symlink: + symlink(destination_path, str(self.source)) + except OSError: # pragma: no cover raise MnamerException From ff308d824bb723010234415a613d765045e2d48c Mon Sep 17 00:00:00 2001 From: zebdo Date: Tue, 31 Jan 2023 22:25:35 +1100 Subject: [PATCH 02/29] add symlink import from os --- mnamer/target.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mnamer/target.py b/mnamer/target.py index 01a15643..d1eca8f0 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -2,6 +2,7 @@ import datetime as dt from os import path +from os import symlink from pathlib import Path from shutil import move from typing import Any, ClassVar, Type From 490d2763dd0dbfdeebb3357f6bfe9bf700e9bdea Mon Sep 17 00:00:00 2001 From: zebdo Date: Tue, 31 Jan 2023 22:34:18 +1100 Subject: [PATCH 03/29] Update target.py --- mnamer/target.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mnamer/target.py b/mnamer/target.py index d1eca8f0..5e1cc103 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -247,6 +247,5 @@ def relocate(self) -> None: move(str(self.source), destination_path) if self._settings.symlink: symlink(destination_path, str(self.source)) - except OSError: # pragma: no cover raise MnamerException From 8c42383a8a99493a6f65d57d3c92ad3fa3d04444 Mon Sep 17 00:00:00 2001 From: zebdo Date: Sat, 25 Feb 2023 12:06:35 +1100 Subject: [PATCH 04/29] added an os.path.islink check --- mnamer/target.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mnamer/target.py b/mnamer/target.py index 5e1cc103..cf44a494 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -246,6 +246,8 @@ def relocate(self) -> None: try: move(str(self.source), destination_path) if self._settings.symlink: - symlink(destination_path, str(self.source)) + if path.islink(destination_path): + symlink(destination_path, str(self.source)) + except OSError: # pragma: no cover raise MnamerException From bfb1d7cf175cf138d2bc922e4f83b2bf47b435b8 Mon Sep 17 00:00:00 2001 From: zebdo Date: Sat, 25 Feb 2023 12:14:41 +1100 Subject: [PATCH 05/29] Update target.py --- mnamer/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mnamer/target.py b/mnamer/target.py index cf44a494..61b659f7 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -246,7 +246,7 @@ def relocate(self) -> None: try: move(str(self.source), destination_path) if self._settings.symlink: - if path.islink(destination_path): + if !path.islink(destination_path): symlink(destination_path, str(self.source)) except OSError: # pragma: no cover From 4068b3493b7b0142cdc68f425a24ac373d5e9e22 Mon Sep 17 00:00:00 2001 From: zebdo Date: Sat, 25 Feb 2023 12:16:42 +1100 Subject: [PATCH 06/29] zz --- mnamer/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mnamer/target.py b/mnamer/target.py index 61b659f7..2f7c98bd 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -246,7 +246,7 @@ def relocate(self) -> None: try: move(str(self.source), destination_path) if self._settings.symlink: - if !path.islink(destination_path): + if path.islink(destination_path) == False: symlink(destination_path, str(self.source)) except OSError: # pragma: no cover From 1474f9509292b8ef475a708a7f0c25541969a4a3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 14:52:55 +0000 Subject: [PATCH 07/29] Update types-requests requirement from ~=2.28 to ~=2.31 Updates the requirements on [types-requests](https://github.com/python/typeshed) to permit the latest version. - [Commits](https://github.com/python/typeshed/commits) --- updated-dependencies: - dependency-name: types-requests dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 314f0072..c85fafe8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,5 +10,5 @@ pytest-cov ~= 4.1.0 pytest-rerunfailures ~= 12.0 setuptools ~= 68.1.0 twine ~= 4.0.2 -types-requests ~= 2.28 +types-requests ~= 2.31 wheel ~= 0.38.4 From e3d7c4f9a3ff685a745abf6b5cada2aacc836ec5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Sep 2023 14:13:07 +0000 Subject: [PATCH 08/29] Update setuptools requirement from ~=68.1.0 to ~=68.2.2 Updates the requirements on [setuptools](https://github.com/pypa/setuptools) to permit the latest version. - [Release notes](https://github.com/pypa/setuptools/releases) - [Changelog](https://github.com/pypa/setuptools/blob/main/NEWS.rst) - [Commits](https://github.com/pypa/setuptools/compare/v68.1.0...v68.2.2) --- updated-dependencies: - dependency-name: setuptools dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 314f0072..ce552691 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,7 +8,7 @@ pylint ~= 2.15.10 pytest ~= 7.4.0 pytest-cov ~= 4.1.0 pytest-rerunfailures ~= 12.0 -setuptools ~= 68.1.0 +setuptools ~= 68.2.2 twine ~= 4.0.2 types-requests ~= 2.28 wheel ~= 0.38.4 From e41f6cca993fc96115ef625fa07606994b23ad49 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Oct 2023 14:58:31 +0000 Subject: [PATCH 09/29] Update pylint requirement from ~=2.15.10 to ~=3.0.2 Updates the requirements on [pylint](https://github.com/pylint-dev/pylint) to permit the latest version. - [Release notes](https://github.com/pylint-dev/pylint/releases) - [Commits](https://github.com/pylint-dev/pylint/compare/v2.15.10...v3.0.2) --- updated-dependencies: - dependency-name: pylint dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 314f0072..dfa4861a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,7 +4,7 @@ codecov ~= 2.1.13 isort ~= 5.11.4 mypy ~= 0.991 pyflakes ~= 3.0.1 -pylint ~= 2.15.10 +pylint ~= 3.0.2 pytest ~= 7.4.0 pytest-cov ~= 4.1.0 pytest-rerunfailures ~= 12.0 From 1c581b5558193424adcac9228a15689c6bec55fe Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Nov 2023 14:27:01 +0000 Subject: [PATCH 10/29] Update mypy requirement from ~=0.991 to ~=1.7 Updates the requirements on [mypy](https://github.com/python/mypy) to permit the latest version. - [Changelog](https://github.com/python/mypy/blob/master/CHANGELOG.md) - [Commits](https://github.com/python/mypy/compare/v0.991...v1.7.0) --- updated-dependencies: - dependency-name: mypy dependency-type: direct:development ... Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 314f0072..eb2f13af 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,7 +2,7 @@ black ~= 23.7.0 build ~= 0.10.0 codecov ~= 2.1.13 isort ~= 5.11.4 -mypy ~= 0.991 +mypy ~= 1.7 pyflakes ~= 3.0.1 pylint ~= 2.15.10 pytest ~= 7.4.0 From a2dc1d0968215a637707888d4292f7c45dd36439 Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 11/29] Guess text with lingua --- mnamer/exceptions.py | 9 ++++ mnamer/setting_store.py | 17 +++++++ mnamer/target.py | 8 +++- mnamer/text_lang_guesser/__init__.py | 28 ++++++++++++ mnamer/text_lang_guesser/base.py | 66 ++++++++++++++++++++++++++++ mnamer/text_lang_guesser/lingua.py | 57 ++++++++++++++++++++++++ 6 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 mnamer/text_lang_guesser/__init__.py create mode 100644 mnamer/text_lang_guesser/base.py create mode 100644 mnamer/text_lang_guesser/lingua.py diff --git a/mnamer/exceptions.py b/mnamer/exceptions.py index ccf40ca5..a25cbe3e 100644 --- a/mnamer/exceptions.py +++ b/mnamer/exceptions.py @@ -19,3 +19,12 @@ class MnamerNetworkException(MnamerException): class MnamerNotFoundException(MnamerException): """Raised when a lookup or search works as expected yet yields no results.""" + +class MnamerFailedLangGuesserImport(MnamerException): + """ + Raised when a requested text language guesser failed to import, probably + because the python packages that the guesser depends on are not installed. + """ + +class MnamerNoSuchLangGuesser(MnamerException): + """Raised when a requested text language guesser name does not match any known guessers.""" diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index 8981d335..acac7914 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -1,5 +1,6 @@ import dataclasses import json +from functools import cached_property from pathlib import Path from typing import Any, Callable @@ -11,6 +12,7 @@ from mnamer.setting_spec import SettingSpec from mnamer.types import MediaType, ProviderType, SettingType from mnamer.utils import crawl_out, json_loads, normalize_containers +from mnamer import text_lang_guesser @dataclasses.dataclass @@ -106,6 +108,15 @@ class SettingStore: help="--language=: specify the search language", ).as_dict(), ) + subtitle_lang_guesser: Language | None = dataclasses.field( + default=None, + metadata=SettingSpec( + flags=["--subtitle-lang-guesser"], + group=SettingType.PARAMETER, + choices=['lingua', 'langdetect'], + help="--subtitle-lang-guesser=: subtitle file text language guesser (must be installed)", + ).as_dict(), + ) mask: list[str] = dataclasses.field( default_factory=lambda: [ "avi", @@ -367,6 +378,12 @@ def specifications(cls) -> list[SettingSpec]: def _resolve_path(path: str | Path) -> Path: return Path(path).resolve() + @cached_property + def text_lang_guesser(self): + if not self.subtitle_lang_guesser: + return None + return text_lang_guesser.guesser(self.subtitle_lang_guesser, Language.all()) + def __setattr__(self, key: str, value: Any): converter_map: dict[str, Callable] = { "episode_api": ProviderType, diff --git a/mnamer/target.py b/mnamer/target.py index cd9927e2..9caaac68 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -116,7 +116,8 @@ def destination(self) -> Path: def _parse(self, file_path: Path): path_data: dict[str, Any] = {"language": self._settings.language} - if is_subtitle(self.source): + source_is_subtitle = is_subtitle(self.source) + if source_is_subtitle: try: path_data["language"] = Language.parse(self.source.stem[-2:]) file_path = Path(self.source.parent, self.source.stem[:-2]) @@ -176,6 +177,11 @@ def _parse(self, file_path: Path): self.metadata.language_sub = path_data.get("subtitle_language") except MnamerException: pass + if source_is_subtitle and not self.metadata.language_sub and self._settings.subtitle_lang_guesser: + try: + self.metadata.language_sub = self._settings.text_lang_guesser.guess_language(self.source) + except MnamerException: + pass if isinstance(self.metadata, MetadataMovie): self.metadata.name = path_data.get("title") self.metadata.year = path_data.get("year") diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py new file mode 100644 index 00000000..fe26a9d1 --- /dev/null +++ b/mnamer/text_lang_guesser/__init__.py @@ -0,0 +1,28 @@ +import logging +from typing import Dict +from mnamer.exceptions import MnamerFailedLangGuesserImport, MnamerNoSuchLangGuesser +from mnamer.language import Language + + +known_guessers = ['lingua', 'langdetect'] + +def guesser(name: str, guess_languages: Dict[str, Language]): + lower_name = name.lower() + try: + if lower_name == 'lingua': + from mnamer.text_lang_guesser.lingua import LinguaGuesser + guesser_cls = LinguaGuesser + elif lower_name == 'langdetect': + from mnamer.text_lang_guesser.langdetect import LangdetectGuesser + guesser_cls = LangdetectGuesser + else: + raise MnamerNoSuchLangGuesser("Unrecognized language guesser") + except ImportError as e: + logging.debug(f"Failed to import text language guesser '{name}'", exc_info=e) + raise MnamerFailedLangGuesserImport(f"Failed to import text language guesser '{name}': {e}") + + try: + return guesser_cls(guess_languages) + except Exception as e: + logging.debug(f"Error trying to instantiate {guesser_cls.__name__}", exc_info=e) + raise e diff --git a/mnamer/text_lang_guesser/base.py b/mnamer/text_lang_guesser/base.py new file mode 100644 index 00000000..e7a3284a --- /dev/null +++ b/mnamer/text_lang_guesser/base.py @@ -0,0 +1,66 @@ +from abc import ABC, abstractmethod +from pathlib import Path +import re +from typing import List, Optional +from chardet.universaldetector import UniversalDetector +from mnamer.language import Language + + +class TextLanguageGuesser(ABC): + def __init__(self, guess_languages: List[Language]): + self.guess_languages = guess_languages + exp_only_nums = r"^\d+$" + exp_timeframe = r"^[\s0-9:.,>-]+$" + skip_patterns = [exp_only_nums, exp_timeframe] + self.skip_line_expressions_str = [re.compile(exp) for exp in skip_patterns] + self.skip_line_expressions_bytes = [ + re.compile(exp.encode("ascii")) for exp in skip_patterns + ] + self.encoding_detector = UniversalDetector() + + @abstractmethod + def guess_language(self, filepath: Path) -> Optional[Language]: + pass + + def _skip_line(self, line, skip_expressions) -> bool: + stripped = line.strip() + if not stripped: + return True + for exp in skip_expressions: + if exp.match(stripped): + return True + return False + + def _detect_file_encoding(self, filepath): + self.encoding_detector.reset() + for line in open(filepath, "rb"): + if self._skip_line(line, self.skip_line_expressions_bytes): + continue + self.encoding_detector.feed(line) + if self.encoding_detector.done: + break + self.encoding_detector.close() + + result = dict(self.encoding_detector.result) + if result["encoding"] == "ascii": + result["encoding"] = "utf-8" + return result + + def _read_lines_from_file( + self, filepath, encoding: str, lines=100, skip_first_lines=10 + ) -> str: + stop_count = lines + skip_first_lines + text = "" + i = 0 + for line in open(filepath, mode="r", encoding=encoding): + if self._skip_line(line, self.skip_line_expressions_str): + continue + + i += 1 + if i <= skip_first_lines: + continue + + text += line + if i > stop_count: + break + return text diff --git a/mnamer/text_lang_guesser/lingua.py b/mnamer/text_lang_guesser/lingua.py new file mode 100644 index 00000000..09d81ae7 --- /dev/null +++ b/mnamer/text_lang_guesser/lingua.py @@ -0,0 +1,57 @@ +import logging +from pathlib import Path +from typing import Optional +from lingua import LanguageDetectorBuilder +from lingua import Language as LinguaLanguage +from mnamer.language import Language +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +class LinguaGuesser(TextLanguageGuesser): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + guess_upper = { + lang.name.upper(): lang for lang in self.guess_languages + } + self.search_langs = { + lang: guess_upper[lang.name] + for lang in LinguaLanguage.all() + if lang.name in guess_upper + } + + self.detector = ( + LanguageDetectorBuilder.from_languages(*self.search_langs.keys()) + .with_minimum_relative_distance(0.9) + .build() + ) + + def guess_language(self, filepath: Path) -> Optional[Language]: + encoding = self._detect_file_encoding(filepath) + text = None + if encoding["confidence"] >= 0.6: + try: + text = self._read_lines_from_file( + filepath, encoding=encoding["encoding"] + ) + except Exception as e: + logging.warning( + f"Unable to read file {filepath} with encoding {encoding['encoding']}. " + f"Error: {e}" + ) + + if not text: + return None + + guessed_language = None + try: + guessed_language = self.detector.detect_language_of(text) + except Exception as e: + logging.warning( + "Unexpected error while guessing language from file text. " + f"File: {filepath}, Error: {e}" + ) + + if not guessed_language: + return None + + return self.search_langs[guessed_language] From cd05a0a276b8a1adde55ca5ff0d57851dae54f5f Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 12/29] Add langdetect --- mnamer/exceptions.py | 6 ++- mnamer/setting_store.py | 2 +- mnamer/target.py | 10 ++++- mnamer/text_lang_guesser/__init__.py | 14 +++--- mnamer/text_lang_guesser/base.py | 19 +++++++- mnamer/text_lang_guesser/langdetect.py | 60 ++++++++++++++++++++++++++ mnamer/text_lang_guesser/lingua.py | 25 +++-------- 7 files changed, 106 insertions(+), 30 deletions(-) create mode 100644 mnamer/text_lang_guesser/langdetect.py diff --git a/mnamer/exceptions.py b/mnamer/exceptions.py index a25cbe3e..343923cc 100644 --- a/mnamer/exceptions.py +++ b/mnamer/exceptions.py @@ -20,11 +20,13 @@ class MnamerNetworkException(MnamerException): class MnamerNotFoundException(MnamerException): """Raised when a lookup or search works as expected yet yields no results.""" + class MnamerFailedLangGuesserImport(MnamerException): """ - Raised when a requested text language guesser failed to import, probably - because the python packages that the guesser depends on are not installed. + Raised when a requested text language guesser failed to import, probably + because the python packages that the guesser depends on are not installed. """ + class MnamerNoSuchLangGuesser(MnamerException): """Raised when a requested text language guesser name does not match any known guessers.""" diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index acac7914..b49493fb 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -113,7 +113,7 @@ class SettingStore: metadata=SettingSpec( flags=["--subtitle-lang-guesser"], group=SettingType.PARAMETER, - choices=['lingua', 'langdetect'], + choices=["lingua", "langdetect"], help="--subtitle-lang-guesser=: subtitle file text language guesser (must be installed)", ).as_dict(), ) diff --git a/mnamer/target.py b/mnamer/target.py index 9caaac68..d22f3f11 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -177,9 +177,15 @@ def _parse(self, file_path: Path): self.metadata.language_sub = path_data.get("subtitle_language") except MnamerException: pass - if source_is_subtitle and not self.metadata.language_sub and self._settings.subtitle_lang_guesser: + if ( + source_is_subtitle + and not self.metadata.language_sub + and self._settings.subtitle_lang_guesser + ): try: - self.metadata.language_sub = self._settings.text_lang_guesser.guess_language(self.source) + self.metadata.language_sub = ( + self._settings.text_lang_guesser.guess_language(self.source) + ) except MnamerException: pass if isinstance(self.metadata, MetadataMovie): diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py index fe26a9d1..d60fe458 100644 --- a/mnamer/text_lang_guesser/__init__.py +++ b/mnamer/text_lang_guesser/__init__.py @@ -4,25 +4,27 @@ from mnamer.language import Language -known_guessers = ['lingua', 'langdetect'] - def guesser(name: str, guess_languages: Dict[str, Language]): lower_name = name.lower() try: - if lower_name == 'lingua': + if lower_name == "lingua": from mnamer.text_lang_guesser.lingua import LinguaGuesser + guesser_cls = LinguaGuesser - elif lower_name == 'langdetect': + elif lower_name == "langdetect": from mnamer.text_lang_guesser.langdetect import LangdetectGuesser + guesser_cls = LangdetectGuesser else: raise MnamerNoSuchLangGuesser("Unrecognized language guesser") except ImportError as e: logging.debug(f"Failed to import text language guesser '{name}'", exc_info=e) - raise MnamerFailedLangGuesserImport(f"Failed to import text language guesser '{name}': {e}") + raise MnamerFailedLangGuesserImport( + f"Failed to import text language guesser '{name}': {e}" + ) try: - return guesser_cls(guess_languages) + return guesser_cls(guess_languages=guess_languages) except Exception as e: logging.debug(f"Error trying to instantiate {guesser_cls.__name__}", exc_info=e) raise e diff --git a/mnamer/text_lang_guesser/base.py b/mnamer/text_lang_guesser/base.py index e7a3284a..e4bec368 100644 --- a/mnamer/text_lang_guesser/base.py +++ b/mnamer/text_lang_guesser/base.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from pathlib import Path +import logging import re from typing import List, Optional from chardet.universaldetector import UniversalDetector @@ -7,8 +8,9 @@ class TextLanguageGuesser(ABC): - def __init__(self, guess_languages: List[Language]): + def __init__(self, guess_languages: List[Language], min_confidence: float = 0.9): self.guess_languages = guess_languages + self.min_confidence = min_confidence exp_only_nums = r"^\d+$" exp_timeframe = r"^[\s0-9:.,>-]+$" skip_patterns = [exp_only_nums, exp_timeframe] @@ -64,3 +66,18 @@ def _read_lines_from_file( if i > stop_count: break return text + + def _get_file_text(self, filepath): + encoding = self._detect_file_encoding(filepath) + text = None + if encoding["confidence"] >= 0.6: + try: + text = self._read_lines_from_file( + filepath, encoding=encoding["encoding"] + ) + except Exception as e: + logging.warning( + f"Unable to read file {filepath} with encoding {encoding['encoding']}. " + f"Error: {e}" + ) + return text diff --git a/mnamer/text_lang_guesser/langdetect.py b/mnamer/text_lang_guesser/langdetect.py new file mode 100644 index 00000000..313c4ef3 --- /dev/null +++ b/mnamer/text_lang_guesser/langdetect.py @@ -0,0 +1,60 @@ +import logging +from pathlib import Path +from typing import Optional +from langdetect.detector_factory import DetectorFactory, PROFILES_DIRECTORY +from mnamer.language import Language +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +# Be deterministic. Without this, langdetect could guess different +# languages for the same short text. +DetectorFactory.seed = 0 + + +class LangdetectGuesser(TextLanguageGuesser): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.lang_map = {lang.a2: lang for lang in self.guess_languages} + zh = self.lang_map.pop("zh", None) + if zh: + # lang-detect has zh-cn and zh-tw. Map them both to mnamer's zh. + self.lang_map["zh-cn"] = zh + self.lang_map["zh-tw"] = zh + + profiles_root = Path(PROFILES_DIRECTORY) + json_profiles = [] + for lang in self.lang_map: + profile = profiles_root / lang + if profile.is_file(): + json_profiles.append(profile.read_text(encoding="utf-8")) + else: + logging.warning(f"Language profile not found for language '{lang}'") + + self.langdetect_factory = DetectorFactory() + self.langdetect_factory.load_json_profile(json_profiles) + + def guess_language(self, filepath: Path) -> Optional[Language]: + text = self._get_file_text(filepath) + + if not text: + return None + + guessed_languages = [] + try: + detector = self.langdetect_factory.create() + detector.append(text) + guessed_languages = detector.get_probabilities() + except Exception as e: + logging.warning( + "Unexpected error while guessing language from file text. " + f"File: {filepath}, Error: {e}" + ) + + if not guessed_languages: + return None + + lang = guessed_languages[0] + if lang.prob >= self.min_confidence: + return self.lang_map[lang.lang] + + return None diff --git a/mnamer/text_lang_guesser/lingua.py b/mnamer/text_lang_guesser/lingua.py index 09d81ae7..9710af30 100644 --- a/mnamer/text_lang_guesser/lingua.py +++ b/mnamer/text_lang_guesser/lingua.py @@ -8,11 +8,11 @@ class LinguaGuesser(TextLanguageGuesser): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - guess_upper = { - lang.name.upper(): lang for lang in self.guess_languages - } + def __init__(self, **kwargs): + super().__init__(**kwargs) + guess_upper = {lang.name.upper(): lang for lang in self.guess_languages} + + # Limit the languages that lingua will evaluate to ones known to mnamer. self.search_langs = { lang: guess_upper[lang.name] for lang in LinguaLanguage.all() @@ -21,23 +21,12 @@ def __init__(self, *args, **kwargs): self.detector = ( LanguageDetectorBuilder.from_languages(*self.search_langs.keys()) - .with_minimum_relative_distance(0.9) + .with_minimum_relative_distance(self.min_confidence) .build() ) def guess_language(self, filepath: Path) -> Optional[Language]: - encoding = self._detect_file_encoding(filepath) - text = None - if encoding["confidence"] >= 0.6: - try: - text = self._read_lines_from_file( - filepath, encoding=encoding["encoding"] - ) - except Exception as e: - logging.warning( - f"Unable to read file {filepath} with encoding {encoding['encoding']}. " - f"Error: {e}" - ) + text = self._get_file_text(filepath) if not text: return None From 50c4e82833d296a69ebda68a52741b4930fb2844 Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 13/29] Add fasttext --- mnamer/setting_store.py | 2 +- mnamer/text_lang_guesser/__init__.py | 9 ++--- mnamer/text_lang_guesser/fasttext.py | 49 ++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 5 deletions(-) create mode 100644 mnamer/text_lang_guesser/fasttext.py diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index b49493fb..5b2c27b1 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -113,7 +113,7 @@ class SettingStore: metadata=SettingSpec( flags=["--subtitle-lang-guesser"], group=SettingType.PARAMETER, - choices=["lingua", "langdetect"], + choices=["lingua", "langdetect", "fasttext"], help="--subtitle-lang-guesser=: subtitle file text language guesser (must be installed)", ).as_dict(), ) diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py index d60fe458..c557a9ef 100644 --- a/mnamer/text_lang_guesser/__init__.py +++ b/mnamer/text_lang_guesser/__init__.py @@ -8,13 +8,14 @@ def guesser(name: str, guess_languages: Dict[str, Language]): lower_name = name.lower() try: if lower_name == "lingua": - from mnamer.text_lang_guesser.lingua import LinguaGuesser - + from mnamer.text_lang_guesser.lingua import LinguaGuesser # noqa guesser_cls = LinguaGuesser elif lower_name == "langdetect": - from mnamer.text_lang_guesser.langdetect import LangdetectGuesser - + from mnamer.text_lang_guesser.langdetect import LangdetectGuesser # noqa guesser_cls = LangdetectGuesser + elif lower_name == "langdetect": + from mnamer.text_lang_guesser.fasttext import FasttextGuesser # noqa + guesser_cls = FasttextGuesser else: raise MnamerNoSuchLangGuesser("Unrecognized language guesser") except ImportError as e: diff --git a/mnamer/text_lang_guesser/fasttext.py b/mnamer/text_lang_guesser/fasttext.py new file mode 100644 index 00000000..00554e6e --- /dev/null +++ b/mnamer/text_lang_guesser/fasttext.py @@ -0,0 +1,49 @@ +import logging +from pathlib import Path +from typing import Optional, Dict, Union +from ftlangdetect.detect import get_or_load_model +from mnamer.language import Language +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +class FasttextGuesser(TextLanguageGuesser): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.search_langs = {lang.a2: lang for lang in self.guess_languages} + + def detect(self, text: str, low_memory=False) -> Optional[Dict[str, Union[str, float]]]: + """ + Modified version of ftlangdetect.detect.detect, that specifies the threshold. + """ + model = get_or_load_model(low_memory) + labels, scores = model.predict(text, threshold=self.min_confidence) + if not labels: + return None + label = labels[0].replace("__label__", '') + score = min(float(scores[0]), 1.0) + return { + "lang": label, + "score": score, + } + + def guess_language(self, filepath: Path) -> Optional[Language]: + text = self._get_file_text(filepath) + + if not text: + return None + + text = text.replace('\n', ' ').replace('\r', '') + + guessed_language = None + try: + guessed_language = self.detect(text) + except Exception as e: + logging.warning( + "Unexpected error while guessing language from file text. " + f"File: {filepath}, Error: {e}" + ) + + if not guessed_language: + return None + + return self.search_langs.get(guessed_language['lang'], None) From acd58909d52a165df6a505fcc359ee9347aca4d8 Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 14/29] Linted --- mnamer/text_lang_guesser/__init__.py | 7 +++++-- mnamer/text_lang_guesser/fasttext.py | 10 ++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py index c557a9ef..d4dc26c2 100644 --- a/mnamer/text_lang_guesser/__init__.py +++ b/mnamer/text_lang_guesser/__init__.py @@ -7,14 +7,17 @@ def guesser(name: str, guess_languages: Dict[str, Language]): lower_name = name.lower() try: - if lower_name == "lingua": + if lower_name == "lingua": # noqa from mnamer.text_lang_guesser.lingua import LinguaGuesser # noqa - guesser_cls = LinguaGuesser + + guesser_cls = LinguaGuesser # noqa elif lower_name == "langdetect": from mnamer.text_lang_guesser.langdetect import LangdetectGuesser # noqa + guesser_cls = LangdetectGuesser elif lower_name == "langdetect": from mnamer.text_lang_guesser.fasttext import FasttextGuesser # noqa + guesser_cls = FasttextGuesser else: raise MnamerNoSuchLangGuesser("Unrecognized language guesser") diff --git a/mnamer/text_lang_guesser/fasttext.py b/mnamer/text_lang_guesser/fasttext.py index 00554e6e..ac6ba84d 100644 --- a/mnamer/text_lang_guesser/fasttext.py +++ b/mnamer/text_lang_guesser/fasttext.py @@ -11,7 +11,9 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.search_langs = {lang.a2: lang for lang in self.guess_languages} - def detect(self, text: str, low_memory=False) -> Optional[Dict[str, Union[str, float]]]: + def detect( + self, text: str, low_memory=False + ) -> Optional[Dict[str, Union[str, float]]]: """ Modified version of ftlangdetect.detect.detect, that specifies the threshold. """ @@ -19,7 +21,7 @@ def detect(self, text: str, low_memory=False) -> Optional[Dict[str, Union[str, f labels, scores = model.predict(text, threshold=self.min_confidence) if not labels: return None - label = labels[0].replace("__label__", '') + label = labels[0].replace("__label__", "") score = min(float(scores[0]), 1.0) return { "lang": label, @@ -32,7 +34,7 @@ def guess_language(self, filepath: Path) -> Optional[Language]: if not text: return None - text = text.replace('\n', ' ').replace('\r', '') + text = text.replace("\n", " ").replace("\r", "") guessed_language = None try: @@ -46,4 +48,4 @@ def guess_language(self, filepath: Path) -> Optional[Language]: if not guessed_language: return None - return self.search_langs.get(guessed_language['lang'], None) + return self.search_langs.get(guessed_language["lang"], None) From afed58701cae5581088370095bff29f8e7630068 Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 15/29] Add langid, and refactor common code to base class --- mnamer/setting_store.py | 2 +- mnamer/text_lang_guesser/__init__.py | 10 +++- mnamer/text_lang_guesser/base.py | 61 +++++++++++++++++++- mnamer/text_lang_guesser/fasttext.py | 38 +++---------- mnamer/text_lang_guesser/langdetect.py | 79 +++++++++++--------------- mnamer/text_lang_guesser/langid.py | 24 ++++++++ mnamer/text_lang_guesser/lingua.py | 52 +++++++---------- 7 files changed, 154 insertions(+), 112 deletions(-) create mode 100644 mnamer/text_lang_guesser/langid.py diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index 5b2c27b1..2e90ae38 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -113,7 +113,7 @@ class SettingStore: metadata=SettingSpec( flags=["--subtitle-lang-guesser"], group=SettingType.PARAMETER, - choices=["lingua", "langdetect", "fasttext"], + choices=["lingua", "langdetect", "fasttext", "langid"], help="--subtitle-lang-guesser=: subtitle file text language guesser (must be installed)", ).as_dict(), ) diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py index d4dc26c2..ffa57d94 100644 --- a/mnamer/text_lang_guesser/__init__.py +++ b/mnamer/text_lang_guesser/__init__.py @@ -8,17 +8,21 @@ def guesser(name: str, guess_languages: Dict[str, Language]): lower_name = name.lower() try: if lower_name == "lingua": # noqa - from mnamer.text_lang_guesser.lingua import LinguaGuesser # noqa + from mnamer.text_lang_guesser.lingua import LinguaGuesser guesser_cls = LinguaGuesser # noqa elif lower_name == "langdetect": - from mnamer.text_lang_guesser.langdetect import LangdetectGuesser # noqa + from mnamer.text_lang_guesser.langdetect import LangdetectGuesser guesser_cls = LangdetectGuesser elif lower_name == "langdetect": - from mnamer.text_lang_guesser.fasttext import FasttextGuesser # noqa + from mnamer.text_lang_guesser.fasttext import FasttextGuesser guesser_cls = FasttextGuesser + elif lower_name == "langid": + from mnamer.text_lang_guesser.langid import LangidGuesser + + guesser_cls = LangidGuesser else: raise MnamerNoSuchLangGuesser("Unrecognized language guesser") except ImportError as e: diff --git a/mnamer/text_lang_guesser/base.py b/mnamer/text_lang_guesser/base.py index e4bec368..b3241b02 100644 --- a/mnamer/text_lang_guesser/base.py +++ b/mnamer/text_lang_guesser/base.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from pathlib import Path import logging +import os import re from typing import List, Optional from chardet.universaldetector import UniversalDetector @@ -10,7 +11,10 @@ class TextLanguageGuesser(ABC): def __init__(self, guess_languages: List[Language], min_confidence: float = 0.9): self.guess_languages = guess_languages + self.language_map = self._language_map(guess_languages) self.min_confidence = min_confidence + self.identifier = self._initialize_identifier() + exp_only_nums = r"^\d+$" exp_timeframe = r"^[\s0-9:.,>-]+$" skip_patterns = [exp_only_nums, exp_timeframe] @@ -21,7 +25,32 @@ def __init__(self, guess_languages: List[Language], min_confidence: float = 0.9) self.encoding_detector = UniversalDetector() @abstractmethod - def guess_language(self, filepath: Path) -> Optional[Language]: + def guess_language_from_text(self, text: str) -> Optional[str]: + """ + Guess the language, based on the text in the file. + """ + pass + + def _language_map(self, lang_list: List[Language]): + """ + Returns a dict that will be used to map an identification result to a Language. + """ + return {lang.a2: lang for lang in lang_list} + + @abstractmethod + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + """ + Set up the language identifier, and return it. + It will be available in self.identifier. + + If restrict_to_langs is present, the identifier should restrict + its identification efforts to the given languages. + + Note that restricting the languages used is usually not a good idea + because it increases the possibility of false positives. + + :param restrict_to_langs: a list of two-letter language codes. + """ pass def _skip_line(self, line, skip_expressions) -> bool: @@ -81,3 +110,33 @@ def _get_file_text(self, filepath): f"Error: {e}" ) return text + + @staticmethod + def boolean_env_var(env_var, default=None) -> Optional[bool]: + value = os.getenv(env_var) + if value is None: + return default + value = value.strip().lower() + if value in ["true", "yes", "1"]: + return True + return False + + def guess_language(self, filepath: Path) -> Optional[Language]: + text = self._get_file_text(filepath) + + if not text: + return None + + guessed_language = None + try: + guessed_language = self.guess_language_from_text(text) + except Exception as e: + logging.warning( + "Unexpected error while guessing language from file text. " + f"File: {filepath}, Error: {e}" + ) + + if not guessed_language: + return None + + return self.language_map.get(guessed_language, None) diff --git a/mnamer/text_lang_guesser/fasttext.py b/mnamer/text_lang_guesser/fasttext.py index ac6ba84d..c0f88f58 100644 --- a/mnamer/text_lang_guesser/fasttext.py +++ b/mnamer/text_lang_guesser/fasttext.py @@ -1,24 +1,19 @@ -import logging -from pathlib import Path from typing import Optional, Dict, Union from ftlangdetect.detect import get_or_load_model -from mnamer.language import Language from mnamer.text_lang_guesser.base import TextLanguageGuesser class FasttextGuesser(TextLanguageGuesser): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.search_langs = {lang.a2: lang for lang in self.guess_languages} + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + # Note: It seems there is no way to restrict languages for fasttext. + low_memory = self.boolean_env_var("FASTTEXT_LOW_MEMORY", False) + return get_or_load_model(low_memory=low_memory) - def detect( - self, text: str, low_memory=False - ) -> Optional[Dict[str, Union[str, float]]]: + def detect(self, text: str) -> Optional[Dict[str, Union[str, float]]]: """ Modified version of ftlangdetect.detect.detect, that specifies the threshold. """ - model = get_or_load_model(low_memory) - labels, scores = model.predict(text, threshold=self.min_confidence) + labels, scores = self.identifier.predict(text, threshold=self.min_confidence) if not labels: return None label = labels[0].replace("__label__", "") @@ -28,24 +23,9 @@ def detect( "score": score, } - def guess_language(self, filepath: Path) -> Optional[Language]: - text = self._get_file_text(filepath) - - if not text: - return None - + def guess_language_from_text(self, text: str) -> Optional[str]: text = text.replace("\n", " ").replace("\r", "") - - guessed_language = None - try: - guessed_language = self.detect(text) - except Exception as e: - logging.warning( - "Unexpected error while guessing language from file text. " - f"File: {filepath}, Error: {e}" - ) - + guessed_language = self.detect(text) if not guessed_language: return None - - return self.search_langs.get(guessed_language["lang"], None) + return guessed_language["lang"] diff --git a/mnamer/text_lang_guesser/langdetect.py b/mnamer/text_lang_guesser/langdetect.py index 313c4ef3..ce893d1a 100644 --- a/mnamer/text_lang_guesser/langdetect.py +++ b/mnamer/text_lang_guesser/langdetect.py @@ -1,60 +1,47 @@ import logging from pathlib import Path -from typing import Optional +from typing import Optional, List from langdetect.detector_factory import DetectorFactory, PROFILES_DIRECTORY from mnamer.language import Language from mnamer.text_lang_guesser.base import TextLanguageGuesser -# Be deterministic. Without this, langdetect could guess different -# languages for the same short text. -DetectorFactory.seed = 0 - - class LangdetectGuesser(TextLanguageGuesser): - def __init__(self, **kwargs): - super().__init__(**kwargs) - self.lang_map = {lang.a2: lang for lang in self.guess_languages} - zh = self.lang_map.pop("zh", None) + def _language_map(self, lang_list: List[Language]): + lang_map = super()._language_map(lang_list) + zh = lang_map.pop("zh", None) if zh: # lang-detect has zh-cn and zh-tw. Map them both to mnamer's zh. - self.lang_map["zh-cn"] = zh - self.lang_map["zh-tw"] = zh - - profiles_root = Path(PROFILES_DIRECTORY) - json_profiles = [] - for lang in self.lang_map: - profile = profiles_root / lang - if profile.is_file(): - json_profiles.append(profile.read_text(encoding="utf-8")) - else: - logging.warning(f"Language profile not found for language '{lang}'") - - self.langdetect_factory = DetectorFactory() - self.langdetect_factory.load_json_profile(json_profiles) - - def guess_language(self, filepath: Path) -> Optional[Language]: - text = self._get_file_text(filepath) - - if not text: - return None - - guessed_languages = [] - try: - detector = self.langdetect_factory.create() - detector.append(text) - guessed_languages = detector.get_probabilities() - except Exception as e: - logging.warning( - "Unexpected error while guessing language from file text. " - f"File: {filepath}, Error: {e}" - ) - + lang_map["zh-cn"] = zh + lang_map["zh-tw"] = zh + return lang_map + + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + # Be deterministic. Without this, langdetect could guess different + # languages for the same short text. + DetectorFactory.seed = 0 + + identifier = DetectorFactory() + if restrict_to_langs: + profiles_root = Path(PROFILES_DIRECTORY) + json_profiles = [] + for lang in self.language_map: + profile = profiles_root / lang + if profile.is_file(): + json_profiles.append(profile.read_text(encoding="utf-8")) + else: + logging.warning(f"Language profile not found for language '{lang}'") + identifier.load_json_profile(json_profiles) + else: + identifier.load_profile(PROFILES_DIRECTORY) + return identifier + + def guess_language_from_text(self, text: str) -> Optional[str]: + detector = self.identifier.create() + detector.append(text) + guessed_languages = detector.get_probabilities() if not guessed_languages: return None - lang = guessed_languages[0] if lang.prob >= self.min_confidence: - return self.lang_map[lang.lang] - - return None + return lang.lang diff --git a/mnamer/text_lang_guesser/langid.py b/mnamer/text_lang_guesser/langid.py new file mode 100644 index 00000000..ab0007f4 --- /dev/null +++ b/mnamer/text_lang_guesser/langid.py @@ -0,0 +1,24 @@ +from typing import Optional +from langid.langid import LanguageIdentifier, model +from mnamer.text_lang_guesser.base import TextLanguageGuesser + + +class LangidGuesser(TextLanguageGuesser): + default_min_confidence = 0.9999 + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.min_confidence = self.default_min_confidence + + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) + if restrict_to_langs: + identifier.set_languages(restrict_to_langs) + return identifier + + def guess_language_from_text(self, text: str) -> Optional[str]: + guessed_language = self.identifier.classify(text) + if not guessed_language or guessed_language[1] < self.min_confidence: + return None + return guessed_language[0] diff --git a/mnamer/text_lang_guesser/lingua.py b/mnamer/text_lang_guesser/lingua.py index 9710af30..213b2322 100644 --- a/mnamer/text_lang_guesser/lingua.py +++ b/mnamer/text_lang_guesser/lingua.py @@ -1,6 +1,4 @@ -import logging -from pathlib import Path -from typing import Optional +from typing import List, Optional from lingua import LanguageDetectorBuilder from lingua import Language as LinguaLanguage from mnamer.language import Language @@ -8,39 +6,29 @@ class LinguaGuesser(TextLanguageGuesser): - def __init__(self, **kwargs): - super().__init__(**kwargs) - guess_upper = {lang.name.upper(): lang for lang in self.guess_languages} - - # Limit the languages that lingua will evaluate to ones known to mnamer. - self.search_langs = { - lang: guess_upper[lang.name] + def _language_map(self, lang_list: List[Language]): + """ + Returns a dict that will be used to map an identification result to a Language. + """ + upcase_map = {lang.name.upper(): lang for lang in lang_list} + + return { + lang: upcase_map[lang.name] for lang in LinguaLanguage.all() - if lang.name in guess_upper + if lang.name in upcase_map } - self.detector = ( - LanguageDetectorBuilder.from_languages(*self.search_langs.keys()) + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): + if restrict_to_langs: + language_list = self.language_map.keys() + else: + language_list = LinguaLanguage.all() + + return ( + LanguageDetectorBuilder.from_languages(*language_list) .with_minimum_relative_distance(self.min_confidence) .build() ) - def guess_language(self, filepath: Path) -> Optional[Language]: - text = self._get_file_text(filepath) - - if not text: - return None - - guessed_language = None - try: - guessed_language = self.detector.detect_language_of(text) - except Exception as e: - logging.warning( - "Unexpected error while guessing language from file text. " - f"File: {filepath}, Error: {e}" - ) - - if not guessed_language: - return None - - return self.search_langs[guessed_language] + def guess_language_from_text(self, text: str) -> Optional[str]: + return self.identifier.detect_language_of(text) From 2ae96e58c69df42890dbf5bd849abe9b6ee8724d Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 16/29] Use py3langid instead of langid --- mnamer/text_lang_guesser/langid.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mnamer/text_lang_guesser/langid.py b/mnamer/text_lang_guesser/langid.py index ab0007f4..2255e4d3 100644 --- a/mnamer/text_lang_guesser/langid.py +++ b/mnamer/text_lang_guesser/langid.py @@ -1,5 +1,5 @@ from typing import Optional -from langid.langid import LanguageIdentifier, model +from py3langid.langid import LanguageIdentifier, MODEL_FILE from mnamer.text_lang_guesser.base import TextLanguageGuesser @@ -12,7 +12,7 @@ def __init__(self, **kwargs): self.min_confidence = self.default_min_confidence def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): - identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) + identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) if restrict_to_langs: identifier.set_languages(restrict_to_langs) return identifier From 4ba7985cdc2208054387974237209880adbb81a3 Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 17/29] Rename min_confidence -> min_probability --- mnamer/text_lang_guesser/base.py | 6 +++--- mnamer/text_lang_guesser/fasttext.py | 2 +- mnamer/text_lang_guesser/langdetect.py | 2 +- mnamer/text_lang_guesser/langid.py | 9 +-------- mnamer/text_lang_guesser/lingua.py | 2 +- 5 files changed, 7 insertions(+), 14 deletions(-) diff --git a/mnamer/text_lang_guesser/base.py b/mnamer/text_lang_guesser/base.py index b3241b02..dc1465c3 100644 --- a/mnamer/text_lang_guesser/base.py +++ b/mnamer/text_lang_guesser/base.py @@ -9,10 +9,10 @@ class TextLanguageGuesser(ABC): - def __init__(self, guess_languages: List[Language], min_confidence: float = 0.9): + def __init__(self, guess_languages: List[Language], min_probability: float = 0.9): self.guess_languages = guess_languages self.language_map = self._language_map(guess_languages) - self.min_confidence = min_confidence + self.min_probability = min_probability self.identifier = self._initialize_identifier() exp_only_nums = r"^\d+$" @@ -78,7 +78,7 @@ def _detect_file_encoding(self, filepath): return result def _read_lines_from_file( - self, filepath, encoding: str, lines=100, skip_first_lines=10 + self, filepath, encoding: str, lines=200, skip_first_lines=10 ) -> str: stop_count = lines + skip_first_lines text = "" diff --git a/mnamer/text_lang_guesser/fasttext.py b/mnamer/text_lang_guesser/fasttext.py index c0f88f58..d7d00987 100644 --- a/mnamer/text_lang_guesser/fasttext.py +++ b/mnamer/text_lang_guesser/fasttext.py @@ -13,7 +13,7 @@ def detect(self, text: str) -> Optional[Dict[str, Union[str, float]]]: """ Modified version of ftlangdetect.detect.detect, that specifies the threshold. """ - labels, scores = self.identifier.predict(text, threshold=self.min_confidence) + labels, scores = self.identifier.predict(text, threshold=self.min_probability) if not labels: return None label = labels[0].replace("__label__", "") diff --git a/mnamer/text_lang_guesser/langdetect.py b/mnamer/text_lang_guesser/langdetect.py index ce893d1a..9eb3a724 100644 --- a/mnamer/text_lang_guesser/langdetect.py +++ b/mnamer/text_lang_guesser/langdetect.py @@ -43,5 +43,5 @@ def guess_language_from_text(self, text: str) -> Optional[str]: if not guessed_languages: return None lang = guessed_languages[0] - if lang.prob >= self.min_confidence: + if lang.prob >= self.min_probability: return lang.lang diff --git a/mnamer/text_lang_guesser/langid.py b/mnamer/text_lang_guesser/langid.py index 2255e4d3..c1c7671c 100644 --- a/mnamer/text_lang_guesser/langid.py +++ b/mnamer/text_lang_guesser/langid.py @@ -4,13 +4,6 @@ class LangidGuesser(TextLanguageGuesser): - default_min_confidence = 0.9999 - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - self.min_confidence = self.default_min_confidence - def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) if restrict_to_langs: @@ -19,6 +12,6 @@ def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): def guess_language_from_text(self, text: str) -> Optional[str]: guessed_language = self.identifier.classify(text) - if not guessed_language or guessed_language[1] < self.min_confidence: + if not guessed_language or guessed_language[1] < self.min_probability: return None return guessed_language[0] diff --git a/mnamer/text_lang_guesser/lingua.py b/mnamer/text_lang_guesser/lingua.py index 213b2322..794b912f 100644 --- a/mnamer/text_lang_guesser/lingua.py +++ b/mnamer/text_lang_guesser/lingua.py @@ -26,7 +26,7 @@ def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): return ( LanguageDetectorBuilder.from_languages(*language_list) - .with_minimum_relative_distance(self.min_confidence) + .with_minimum_relative_distance(self.min_probability) .build() ) From bd0c42a3d8ae8c72fc1d2c6545dffab7b406c09d Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 18/29] Add docstrings --- mnamer/text_lang_guesser/base.py | 36 ++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/mnamer/text_lang_guesser/base.py b/mnamer/text_lang_guesser/base.py index dc1465c3..ea70560c 100644 --- a/mnamer/text_lang_guesser/base.py +++ b/mnamer/text_lang_guesser/base.py @@ -62,7 +62,16 @@ def _skip_line(self, line, skip_expressions) -> bool: return True return False - def _detect_file_encoding(self, filepath): + def _detect_file_encoding(self, filepath) -> dict: + """ + Tries to guess the encoding (utf-8, iso-8859-1, etc). + + The returned dict has these fields of interest: + { + "encoding": str, + "confidence": float between 0 and 1 + } + """ self.encoding_detector.reset() for line in open(filepath, "rb"): if self._skip_line(line, self.skip_line_expressions_bytes): @@ -80,6 +89,17 @@ def _detect_file_encoding(self, filepath): def _read_lines_from_file( self, filepath, encoding: str, lines=200, skip_first_lines=10 ) -> str: + """ + Read a certain number of lines from the file, returning a unicode string. + + Lines that are subtitle control lines (only numbers, or time ranges) + are filtered out, and do not count towards the number of lines. + + By default, the 10 first lines are skipped. The reasoning behind + that is that perhaps the first lines contain subtitle credits + (e.g. a little advertisement for the subtitle creator), which may + not correspond to the principal language of the file. + """ stop_count = lines + skip_first_lines text = "" i = 0 @@ -96,7 +116,13 @@ def _read_lines_from_file( break return text - def _get_file_text(self, filepath): + def _get_file_text(self, filepath) -> Optional[str]: + """ + Tries to determine the file encoding and read some lines from the file. + + If the confidence for the encoding is not high enough, or an error + occurs while reading lines from the file, the return value is None. + """ encoding = self._detect_file_encoding(filepath) text = None if encoding["confidence"] >= 0.6: @@ -122,6 +148,12 @@ def boolean_env_var(env_var, default=None) -> Optional[bool]: return False def guess_language(self, filepath: Path) -> Optional[Language]: + """ + Reads text from the file and passes it the implementation-specific + guess_language_from_text() method. + + If a matching mnamer.Language exists, it is returned, otherwise None. + """ text = self._get_file_text(filepath) if not text: From 5e0924473d8b8e3646f1f66b66a6f23279b3e5cc Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:57 +0100 Subject: [PATCH 19/29] Get available guessers from text_lang_guesser module --- mnamer/exceptions.py | 5 +-- mnamer/setting_store.py | 2 +- mnamer/text_lang_guesser/__init__.py | 66 +++++++++++++++++----------- 3 files changed, 44 insertions(+), 29 deletions(-) diff --git a/mnamer/exceptions.py b/mnamer/exceptions.py index 343923cc..00e1ca06 100644 --- a/mnamer/exceptions.py +++ b/mnamer/exceptions.py @@ -21,10 +21,9 @@ class MnamerNotFoundException(MnamerException): """Raised when a lookup or search works as expected yet yields no results.""" -class MnamerFailedLangGuesserImport(MnamerException): +class MnamerFailedLangGuesserInstantiation(MnamerException): """ - Raised when a requested text language guesser failed to import, probably - because the python packages that the guesser depends on are not installed. + Raised when a requested text language guesser failed to instantiate. """ diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index 2e90ae38..40282a7a 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -113,7 +113,7 @@ class SettingStore: metadata=SettingSpec( flags=["--subtitle-lang-guesser"], group=SettingType.PARAMETER, - choices=["lingua", "langdetect", "fasttext", "langid"], + choices=list(text_lang_guesser.available_guessers), help="--subtitle-lang-guesser=: subtitle file text language guesser (must be installed)", ).as_dict(), ) diff --git a/mnamer/text_lang_guesser/__init__.py b/mnamer/text_lang_guesser/__init__.py index ffa57d94..fe86c1d9 100644 --- a/mnamer/text_lang_guesser/__init__.py +++ b/mnamer/text_lang_guesser/__init__.py @@ -1,38 +1,54 @@ import logging from typing import Dict -from mnamer.exceptions import MnamerFailedLangGuesserImport, MnamerNoSuchLangGuesser +from mnamer.exceptions import ( + MnamerFailedLangGuesserInstantiation, + MnamerNoSuchLangGuesser, +) from mnamer.language import Language +from importlib import import_module -def guesser(name: str, guess_languages: Dict[str, Language]): - lower_name = name.lower() +def _import_module(dotted_module_name: str): try: - if lower_name == "lingua": # noqa - from mnamer.text_lang_guesser.lingua import LinguaGuesser + return import_module(dotted_module_name) + except ImportError as e: + logging.debug(f"Failed to import {dotted_module_name}: {e}", exc_info=e) + return None - guesser_cls = LinguaGuesser # noqa - elif lower_name == "langdetect": - from mnamer.text_lang_guesser.langdetect import LangdetectGuesser - guesser_cls = LangdetectGuesser - elif lower_name == "langdetect": - from mnamer.text_lang_guesser.fasttext import FasttextGuesser +possible_guessers = ( + ("lingua", "mnamer.text_lang_guesser.lingua.LinguaGuesser"), + ("langdetect", "mnamer.text_lang_guesser.langdetect.LangdetectGuesser"), + ("fasttext", "mnamer.text_lang_guesser.fasttext.FasttextGuesser"), + ("langid", "mnamer.text_lang_guesser.langid.LangidGuesser"), +) - guesser_cls = FasttextGuesser - elif lower_name == "langid": - from mnamer.text_lang_guesser.langid import LangidGuesser +available_guessers = {} +for name, module_class in possible_guessers: + module_name, classname = module_class.rsplit(".", 1) + mod = _import_module(module_name) + if mod: + try: + cls = getattr(mod, classname) + except AttributeError as e: + logging.debug( + f"Failed to load class {classname} from module {mod}: {e}", exc_info=e + ) + continue + available_guessers[name] = cls - guesser_cls = LangidGuesser - else: - raise MnamerNoSuchLangGuesser("Unrecognized language guesser") - except ImportError as e: - logging.debug(f"Failed to import text language guesser '{name}'", exc_info=e) - raise MnamerFailedLangGuesserImport( - f"Failed to import text language guesser '{name}': {e}" - ) +def guesser(name: str, guess_languages: Dict[str, Language]): + if name not in available_guessers: + raise MnamerNoSuchLangGuesser("Unrecognized language guesser") try: - return guesser_cls(guess_languages=guess_languages) + return available_guessers[name](guess_languages=guess_languages) except Exception as e: - logging.debug(f"Error trying to instantiate {guesser_cls.__name__}", exc_info=e) - raise e + class_name = available_guessers[name].__name__ + logging.debug( + f"Error trying to instantiate {class_name}", + exc_info=e, + ) + raise MnamerFailedLangGuesserInstantiation( + f"Failed creating guesser {class_name}" + ) From 947de8faa16e14d181182bd0cca79b9ecae6f809 Mon Sep 17 00:00:00 2001 From: Big Eater Date: Sun, 31 Dec 2023 15:37:58 +0100 Subject: [PATCH 20/29] Configure optional dependencies --- mnamer/text_lang_guesser/fasttext.py | 4 ++++ pyproject.toml | 5 +++++ requirements-guess-all.txt | 5 +++++ requirements-guess-fasttext.txt | 2 ++ requirements-guess-langdetect.txt | 2 ++ requirements-guess-langid.txt | 2 ++ requirements-guess-lingua.txt | 2 ++ 7 files changed, 22 insertions(+) create mode 100644 requirements-guess-all.txt create mode 100644 requirements-guess-fasttext.txt create mode 100644 requirements-guess-langdetect.txt create mode 100644 requirements-guess-langid.txt create mode 100644 requirements-guess-lingua.txt diff --git a/mnamer/text_lang_guesser/fasttext.py b/mnamer/text_lang_guesser/fasttext.py index d7d00987..a17c7db4 100644 --- a/mnamer/text_lang_guesser/fasttext.py +++ b/mnamer/text_lang_guesser/fasttext.py @@ -4,6 +4,10 @@ class FasttextGuesser(TextLanguageGuesser): + """ + Installation note: a modern g++ version is required for building fasttext. + """ + def _initialize_identifier(self, restrict_to_langs: Optional[list[str]] = None): # Note: It seems there is no way to restrict languages for fasttext. low_memory = self.boolean_env_var("FASTTEXT_LOW_MEMORY", False) diff --git a/pyproject.toml b/pyproject.toml index 1d58cc21..5d3c61b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,11 @@ dependencies = { file = "requirements.txt" } [tool.setuptools.dynamic.optional-dependencies] dev = { file = "requirements-dev.txt" } +guess_langid = { file = "requirements-guess-langid.txt" } +guess_lingua = { file = "requirements-guess-lingua.txt" } +guess_fasttext = { file = "requirements-guess-fasttext.txt" } +guess_langdetect = { file = "requirements-guess-langdetect.txt" } +guess_all = { file = "requirements-guess-all.txt" } [build-system] requires = ["setuptools >= 61.0.0", "setuptools_scm[toml] >= 6.2", "wheel"] diff --git a/requirements-guess-all.txt b/requirements-guess-all.txt new file mode 100644 index 00000000..f90cce91 --- /dev/null +++ b/requirements-guess-all.txt @@ -0,0 +1,5 @@ +chardet >= 5.2.0 +py3langid ~= 0.2.2 +lingua-language-detector ~= 2.0.2 +fasttext-langdetect ~= 1.0.5 +langdetect ~= 1.0.9 diff --git a/requirements-guess-fasttext.txt b/requirements-guess-fasttext.txt new file mode 100644 index 00000000..b04d0dca --- /dev/null +++ b/requirements-guess-fasttext.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +fasttext-langdetect ~= 1.0.5 \ No newline at end of file diff --git a/requirements-guess-langdetect.txt b/requirements-guess-langdetect.txt new file mode 100644 index 00000000..7cfaec7e --- /dev/null +++ b/requirements-guess-langdetect.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +langdetect ~= 1.0.9 \ No newline at end of file diff --git a/requirements-guess-langid.txt b/requirements-guess-langid.txt new file mode 100644 index 00000000..e140d37a --- /dev/null +++ b/requirements-guess-langid.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +py3langid ~= 0.2.2 \ No newline at end of file diff --git a/requirements-guess-lingua.txt b/requirements-guess-lingua.txt new file mode 100644 index 00000000..197bd646 --- /dev/null +++ b/requirements-guess-lingua.txt @@ -0,0 +1,2 @@ +chardet >= 5.2.0 +lingua-language-detector ~= 2.0.2 \ No newline at end of file From 184ddd23f898a226a5d94d9ec5c4a0539f6ac7a3 Mon Sep 17 00:00:00 2001 From: zebdo Date: Mon, 22 Jan 2024 03:13:26 +1100 Subject: [PATCH 21/29] add symlink checks when attempting to move a file --- mnamer/target.py | 3 +++ mnamer/utils.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/mnamer/target.py b/mnamer/target.py index 2f7c98bd..f012eda0 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -243,6 +243,9 @@ def relocate(self) -> None: """Performs the action of renaming and/or moving a file.""" destination_path = Path(self.destination).resolve() destination_path.parent.mkdir(parents=True, exist_ok=True) + if path.isLink(destination_path) == True: + print("Skipped symlink") + return try: move(str(self.source), destination_path) if self._settings.symlink: diff --git a/mnamer/utils.py b/mnamer/utils.py index 84df7870..a77c4ed4 100644 --- a/mnamer/utils.py +++ b/mnamer/utils.py @@ -141,6 +141,8 @@ def is_subtitle(container: str | Path | None) -> bool: return False return str(container).endswith(tuple(SUBTITLE_CONTAINERS)) +def is_symlink(path: Path) -> str: + return os.path.islink(str(Path)) def get_session() -> requests_cache.CachedSession: """Convenience function that returns request-cache session singleton.""" From b41177ad274194c66c79cd89ba4601f63e7c959b Mon Sep 17 00:00:00 2001 From: Dennis Keitzel Date: Mon, 21 Oct 2024 08:55:04 +0200 Subject: [PATCH 22/29] Python 3.13 compatibility We need guessit 3.8.0 for Python 3.13 compatibility. --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index be464c26..37d36bac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ appdirs ~= 1.4.4 babelfish ~= 0.6.0 -guessit ~= 3.7.1 +guessit ~= 3.8.0 requests == 2.* requests_cache ~= 0.9.7 setuptools_scm ~= 7.1.0 From 790b23b630d87f3948cffb5695c3b6dff819ee79 Mon Sep 17 00:00:00 2001 From: meitham Date: Wed, 25 Dec 2024 06:55:04 +0000 Subject: [PATCH 23/29] original filename --- mnamer/metadata.py | 1 + mnamer/setting_store.py | 2 +- mnamer/target.py | 1 + tests/e2e/test_moving.py | 12 ++++++++++++ 4 files changed, 15 insertions(+), 1 deletion(-) diff --git a/mnamer/metadata.py b/mnamer/metadata.py index df43382a..19380d2d 100644 --- a/mnamer/metadata.py +++ b/mnamer/metadata.py @@ -44,6 +44,7 @@ class Metadata: language_sub: Language | None = None quality: str | None = None synopsis: str | None = None + original_filename: str | None = None @classmethod def to_media_type(cls) -> MediaType: diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index 8981d335..164a2b3c 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -322,7 +322,7 @@ class SettingStore: default=False, metadata=SettingSpec( action="store_true", - flags=["--test"], + flags=["--test", "--dry-run", "--dryrun"], group=SettingType.DIRECTIVE, help="--test: mocks the renaming and moving of files", ).as_dict(), diff --git a/mnamer/target.py b/mnamer/target.py index cd9927e2..1284c496 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -148,6 +148,7 @@ def _parse(self, file_path: Path): None: Metadata, }[media_type] self.metadata = meta_cls() + self.metadata.original_filename = self.source.name self.metadata.quality = ( " ".join( path_data[key] diff --git a/tests/e2e/test_moving.py b/tests/e2e/test_moving.py index 7b8ccfb6..f6cb1324 100644 --- a/tests/e2e/test_moving.py +++ b/tests/e2e/test_moving.py @@ -211,3 +211,15 @@ def test_ambiguous_language_deletction(e2e_run, setup_test_files): ) result = e2e_run("--batch", ".") assert result.code == 0 + + +@pytest.mark.usefixtures("setup_test_dir") +def test_original_filename(e2e_run, setup_test_files): + setup_test_files("archer.2009.s10e07.webrip.x264-lucidtv.mp4") + result = e2e_run( + "--batch", + "--episode-format='{original_filename}'", + ".", + ) + assert result.code == 0 + assert "archer.2009.s10e07.webrip.x264-lucidtv.mp4" in result.out From d1c9b965ab29f609b219b6ed3abe906328f10510 Mon Sep 17 00:00:00 2001 From: caesar Date: Wed, 5 Feb 2025 21:00:33 +0100 Subject: [PATCH 24/29] allow attributing for shows, such as date.year, date.hour --- mnamer/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mnamer/metadata.py b/mnamer/metadata.py index df43382a..b76d1195 100644 --- a/mnamer/metadata.py +++ b/mnamer/metadata.py @@ -158,7 +158,7 @@ def __post_init__(self): def __format__(self, format_spec: str | None): default = "{series} - {season:02}x{episode:02} - {title}" - re_pattern = r"({(\w+)(?:\[[\w:]+\])?(?:\:\d{1,2})?})" + re_pattern = r"({(\w+)(?:\[[\w:]+\]|\.\w+)?(?:\:\d{1,2})?})" s = re.sub(re_pattern, self._format_repl, format_spec or default) s = str_fix_padding(s) return s From 1c0d53f74c700f3f1bcbd8c1ec0ad1c813dd4e48 Mon Sep 17 00:00:00 2001 From: caesar Date: Fri, 7 Feb 2025 23:58:36 +0100 Subject: [PATCH 25/29] date no years --- mnamer/endpoints.py | 2 +- mnamer/metadata.py | 17 ++++++++++++----- mnamer/providers.py | 26 +++++++++++++++----------- mnamer/setting_store.py | 2 +- mnamer/target.py | 2 +- mnamer/utils.py | 27 --------------------------- 6 files changed, 30 insertions(+), 46 deletions(-) diff --git a/mnamer/endpoints.py b/mnamer/endpoints.py index f11b56ec..aac8a8a8 100644 --- a/mnamer/endpoints.py +++ b/mnamer/endpoints.py @@ -165,7 +165,7 @@ def tmdb_movies( def tmdb_search_movies( api_key: str, title: str, - year: int | str | None = None, + year: int | None = None, language: Language | None = None, region: str | None = None, adult: bool = False, diff --git a/mnamer/metadata.py b/mnamer/metadata.py index b76d1195..03212e09 100644 --- a/mnamer/metadata.py +++ b/mnamer/metadata.py @@ -16,7 +16,6 @@ str_fix_padding, str_replace_slashes, str_title_case, - year_parse, ) @@ -111,13 +110,17 @@ class MetadataMovie(Metadata): """ name: str | None = None - year: str | None = None + date: dt.date | None = None id_imdb: str | None = None id_tmdb: str | None = None + def __post_init__(self): + if isinstance(self.date, str): + self.date = parse_date(self.date) + def __format__(self, format_spec: str | None): - default = "{name} ({year})" - re_pattern = r"({(\w+)(?:\[[\w:]+\])?(?:\:\d{1,2})?})" + default = "{name} ({date.year})" + re_pattern = r"({(\w+)(?:\[[\w:]+\]|\.\w+)?(?:\:\d{1,2})?})" s = re.sub(re_pattern, self._format_repl, format_spec or default) s = str_fix_padding(s) return s @@ -125,7 +128,7 @@ def __format__(self, format_spec: str | None): def __setattr__(self, key: str, value: Any): converter_map: dict[str, Callable] = { "name": fn_pipe(str_replace_slashes, str_title_case), - "year": year_parse, + "date": parse_date, } converter: Callable | None = converter_map.get(key) if value is not None and converter: @@ -141,6 +144,7 @@ class MetadataEpisode(Metadata): """ series: str | None = None + series_date: dt.date | None = None season: int | None = None episode: int | None = None date: dt.date | None = None @@ -155,6 +159,8 @@ def __post_init__(self): self.episode = int(self.episode) if isinstance(self.date, str): self.date = parse_date(self.date) + if isinstance(self.series_date, str): + self.series_date = parse_date(self.series_date) def __format__(self, format_spec: str | None): default = "{series} - {season:02}x{episode:02} - {title}" @@ -169,6 +175,7 @@ def __setattr__(self, key: str, value: Any): "episode": int, "season": int, "series": fn_pipe(str_replace_slashes, str_title_case), + "series_date": fn_pipe(str_replace_slashes, str_title_case), "title": fn_pipe(str_replace_slashes, str_title_case), } converter: Callable | None = converter_map.get(key) diff --git a/mnamer/providers.py b/mnamer/providers.py index e4e83c04..61a53d4c 100644 --- a/mnamer/providers.py +++ b/mnamer/providers.py @@ -28,7 +28,7 @@ from mnamer.metadata import Metadata, MetadataEpisode, MetadataMovie from mnamer.setting_store import SettingStore from mnamer.types import MediaType, ProviderType -from mnamer.utils import parse_date, year_range_parse +from mnamer.utils import parse_date class Provider(ABC): @@ -83,7 +83,7 @@ def search(self, query: MetadataMovie) -> Iterator[MetadataMovie]: if query.id_imdb: results = self._lookup_movie(query.id_imdb) elif query.name: - results = self._search_movie(query.name, query.year) + results = self._search_movie(query.name, query.date.year) else: raise MnamerNotFoundException yield from results @@ -94,15 +94,17 @@ def _lookup_movie(self, id_imdb: str) -> Iterator[MetadataMovie]: try: release_date = dt.datetime.strptime( response["Released"], "%d %b %Y" - ).strftime("%Y-%m-%d") + ) except (KeyError, ValueError): if response.get("Year") in (None, "N/A"): release_date = None else: - release_date = "{}-01-01".format(response["Year"]) + release_date = dt.datetime.strptime( + "{}-01-01".format(response["Year"]), "%Y-%m-%d" + ) meta = MetadataMovie( name=response["Title"], - year=release_date, + date=release_date, synopsis=response["Plot"], id_imdb=response["imdbID"], ) @@ -110,9 +112,9 @@ def _lookup_movie(self, id_imdb: str) -> Iterator[MetadataMovie]: meta.synopsis = None yield meta - def _search_movie(self, name: str, year: str | None) -> Iterator[MetadataMovie]: + def _search_movie(self, name: str, year: int | None) -> Iterator[MetadataMovie]: assert self.api_key - year_from, year_to = year_range_parse(year, 5) + year_from, year_to = year - 5, year + 5 found = False page = 1 page_max = 10 # each page yields a maximum of 10 results @@ -153,7 +155,7 @@ def search(self, query: MetadataMovie) -> Iterator[MetadataMovie]: if query.id_tmdb: results = self._search_id(query.id_tmdb, query.language) elif query.name: - results = self._search_name(query.name, query.year, query.language) + results = self._search_name(query.name, query.date.year, query.language) else: raise MnamerNotFoundException yield from results @@ -166,13 +168,13 @@ def _search_id( yield MetadataMovie( name=response["title"], language=language, - year=response["release_date"], + date=response["release_date"], synopsis=response["overview"], id_tmdb=response["id"], id_imdb=response["imdb_id"], ) - def _search_name(self, name: str, year: str | None, language: Language | None): + def _search_name(self, name: str, year: int | None, language: Language | None): assert self.api_key page = 1 page_max = 5 # each page yields a maximum of 20 results @@ -193,7 +195,7 @@ def _search_name(self, name: str, year: str | None, language: Language | None): name=entry["title"], language=language, synopsis=entry["overview"], - year=entry["release_date"], + date=entry["release_date"], ) if not meta.year: continue @@ -275,6 +277,7 @@ def _search_id( id_tvdb=id_tvdb, season=entry["airedSeason"], series=series_data["data"]["seriesName"], + series_date=series_data["data"]["firstAired"], language=language, synopsis=(entry["overview"] or "") .replace("\r\n", "") @@ -480,6 +483,7 @@ def _transform_meta( id_tvmaze=id_tvmaze or None, season=episode_entry["season"], series=series_entry["name"], + series_date=series_entry["premiered"], synopsis=episode_entry["summary"] or None, title=episode_entry["name"] or None, ) diff --git a/mnamer/setting_store.py b/mnamer/setting_store.py index 8981d335..b255fd2d 100644 --- a/mnamer/setting_store.py +++ b/mnamer/setting_store.py @@ -177,7 +177,7 @@ class SettingStore: ).as_dict(), ) movie_format: str = dataclasses.field( - default="{name} ({year}).{extension}", + default="{name} ({date.year}).{extension}", metadata=SettingSpec( dest="movie_format", flags=["--movie_format", "--movie-format", "--movieformat"], diff --git a/mnamer/target.py b/mnamer/target.py index cd9927e2..bf38d080 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -178,7 +178,7 @@ def _parse(self, file_path: Path): pass if isinstance(self.metadata, MetadataMovie): self.metadata.name = path_data.get("title") - self.metadata.year = path_data.get("year") + self.metadata.date = path_data.get("date") elif isinstance(self.metadata, MetadataEpisode): self.metadata.date = path_data.get("date") self.metadata.episode = path_data.get("episode") diff --git a/mnamer/utils.py b/mnamer/utils.py index 84df7870..1a3b03f0 100644 --- a/mnamer/utils.py +++ b/mnamer/utils.py @@ -476,30 +476,3 @@ def str_title_case(s: str) -> str: s = s[:pos] + exception.upper() + s[pos + word_length :] return s - - -def year_parse(s: str) -> int | None: - """Parses a year from a string.""" - regex = r"((?:19|20)\d{2})(?:$|[-/]\d{2}[-/]\d{2})" - try: - return int(re.findall(regex, str(s))[0]) - except IndexError: - return None - - -def year_range_parse(years: str | int | None, tolerance: int = 1) -> tuple[int, int]: - """Parses a year or dash-delimited year range.""" - regex = r"^((?:19|20)\d{2})?([-,: ]*)?((?:19|20)\d{2})?$" - default_start = 1900 - default_end = CURRENT_YEAR - try: - start, dash, end = re.match(regex, str(years).strip()).groups() # type: ignore - except AttributeError: - start, end, dash = None, None, True - if not start and not end: - start, end, dash = None, None, True - start = int(start or default_start) - end = int(end or default_end) - if not dash: - end = start - return start - tolerance, end + tolerance From 0cbb6f59814058de7e9d8f035cadbc4b2a053e04 Mon Sep 17 00:00:00 2001 From: caesar Date: Sat, 8 Feb 2025 20:40:29 +0100 Subject: [PATCH 26/29] dating --- mnamer/metadata.py | 2 +- mnamer/providers.py | 6 +++--- mnamer/target.py | 6 ++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mnamer/metadata.py b/mnamer/metadata.py index 03212e09..b5aaff09 100644 --- a/mnamer/metadata.py +++ b/mnamer/metadata.py @@ -175,7 +175,7 @@ def __setattr__(self, key: str, value: Any): "episode": int, "season": int, "series": fn_pipe(str_replace_slashes, str_title_case), - "series_date": fn_pipe(str_replace_slashes, str_title_case), + "series_date": parse_date, "title": fn_pipe(str_replace_slashes, str_title_case), } converter: Callable | None = converter_map.get(key) diff --git a/mnamer/providers.py b/mnamer/providers.py index 61a53d4c..4ecb28cf 100644 --- a/mnamer/providers.py +++ b/mnamer/providers.py @@ -83,7 +83,7 @@ def search(self, query: MetadataMovie) -> Iterator[MetadataMovie]: if query.id_imdb: results = self._lookup_movie(query.id_imdb) elif query.name: - results = self._search_movie(query.name, query.date.year) + results = self._search_movie(query.name, None if query.date is None else query.date.year) else: raise MnamerNotFoundException yield from results @@ -155,7 +155,7 @@ def search(self, query: MetadataMovie) -> Iterator[MetadataMovie]: if query.id_tmdb: results = self._search_id(query.id_tmdb, query.language) elif query.name: - results = self._search_name(query.name, query.date.year, query.language) + results = self._search_name(query.name, None if query.date is None else query.date.year, query.language) else: raise MnamerNotFoundException yield from results @@ -197,7 +197,7 @@ def _search_name(self, name: str, year: int | None, language: Language | None): synopsis=entry["overview"], date=entry["release_date"], ) - if not meta.year: + if not meta.date: continue yield meta found = True diff --git a/mnamer/target.py b/mnamer/target.py index bf38d080..e6ab5f41 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -167,6 +167,10 @@ def _parse(self, file_path: Path): self.metadata.language = path_data.get("language") self.metadata.group = path_data.get("release_group") self.metadata.container = file_path.suffix or None + if "date" in path_data: + self.metadata.date = path_data.get("date") + elif "year" in path_data: + self.metadata.date = "{}-01-01".format(path_data.get("year")) if not self.metadata.language: try: self.metadata.language = path_data.get("language") @@ -178,9 +182,7 @@ def _parse(self, file_path: Path): pass if isinstance(self.metadata, MetadataMovie): self.metadata.name = path_data.get("title") - self.metadata.date = path_data.get("date") elif isinstance(self.metadata, MetadataEpisode): - self.metadata.date = path_data.get("date") self.metadata.episode = path_data.get("episode") self.metadata.season = path_data.get("season") self.metadata.series = path_data.get("title") From 3a674f572e233a6f606769b4c5822a251383f895 Mon Sep 17 00:00:00 2001 From: matt wilkie Date: Sun, 29 Jun 2025 20:24:52 -0700 Subject: [PATCH 27/29] fix islink typo (close #313) score +1 for accurate error message! it was indeed a just a typo. --- mnamer/target.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mnamer/target.py b/mnamer/target.py index 2c5b48e2..2a5b8789 100644 --- a/mnamer/target.py +++ b/mnamer/target.py @@ -258,7 +258,7 @@ def relocate(self) -> None: """Performs the action of renaming and/or moving a file.""" destination_path = Path(self.destination).resolve() destination_path.parent.mkdir(parents=True, exist_ok=True) - if path.isLink(destination_path) == True: + if path.islink(destination_path) == True: print("Skipped symlink") return try: From 2add6eb44552488a1503c154764a92dc69f14141 Mon Sep 17 00:00:00 2001 From: caesar Date: Tue, 8 Jul 2025 21:17:13 +0200 Subject: [PATCH 28/29] additional date search logic --- mnamer/providers.py | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/mnamer/providers.py b/mnamer/providers.py index 4ecb28cf..b2813aa2 100644 --- a/mnamer/providers.py +++ b/mnamer/providers.py @@ -231,13 +231,17 @@ def search(self, query: MetadataEpisode) -> Iterator[MetadataEpisode]: if not self.token: self.token = self._login() if query.id_tvdb and query.date: - results = self._search_tvdb_date(query.id_tvdb, query.date, query.language) + results = self._search_tvdb_date( + query.id_tvdb, query.date, query.language, query.season, query.episode + ) elif query.id_tvdb: results = self._search_id( query.id_tvdb, query.season, query.episode, query.language ) elif query.series and query.date: - results = self._search_series_date(query.series, query.date, query.language) + results = self._search_series_date( + query.series, query.date, query.language, query.season, query.episode + ) elif query.series: results = self._search_series( query.series, query.season, query.episode, query.language @@ -319,19 +323,39 @@ def _search_series( raise MnamerNotFoundException def _search_tvdb_date( - self, id_tvdb: str, release_date: dt.date, language: Language | None + self, + id_tvdb: str, + release_date: dt.date, + language: Language | None, + season: int | None = None, + episode: int | None = None ): release_date = parse_date(release_date) found = False for meta in self._search_id(id_tvdb, language=language): - if meta.date and meta.date == release_date: - found = True - yield meta + if meta.date: + if season is not None and season == meta.season and episode is not None and episode == meta.episode: + if meta.date == release_date: + found = True + yield meta + elif release_date.month == 1 and release_date.month == 1: + if meta.date.year == release_date.year or meta.series_date.year == release_date.year: + found = True + yield meta + else: + if meta.date == release_date: + found = True + yield meta if not found: raise MnamerNotFoundException def _search_series_date( - self, series: str, release_date: dt.date, language: Language | None + self, + series: str, + release_date: dt.date, + language: Language | None, + season: int | None = None, + episode: int | None = None ): release_date = parse_date(release_date) series_data = tvdb_search_series( @@ -341,7 +365,7 @@ def _search_series_date( found = False for tvdb_id in tvdb_ids: try: - yield from self._search_tvdb_date(tvdb_id, release_date, language) + yield from self._search_tvdb_date(tvdb_id, release_date, language, season, episode) found = True except MnamerNotFoundException: continue From c499e0222fd14c247fbeb83c81f3033cf205c429 Mon Sep 17 00:00:00 2001 From: caesar Date: Wed, 9 Jul 2025 01:33:10 +0200 Subject: [PATCH 29/29] extra logic to avoid extra year tagging, and finding the right match in dates --- mnamer/metadata.py | 14 ++++++++++++++ mnamer/providers.py | 12 +++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/mnamer/metadata.py b/mnamer/metadata.py index b5aaff09..685e8653 100644 --- a/mnamer/metadata.py +++ b/mnamer/metadata.py @@ -121,8 +121,15 @@ def __post_init__(self): def __format__(self, format_spec: str | None): default = "{name} ({date.year})" re_pattern = r"({(\w+)(?:\[[\w:]+\]|\.\w+)?(?:\:\d{1,2})?})" + tname = '' + if ( format_spec is None or re.search("{(date.year|date)}", format_spec) is not None ) \ + and self.name is not None and self.date is not None \ + and self.name.endswith(f" ({self.date.year})"): + tname = f" ({self.date.year})" + self.name = self.name[:-len(tname)] s = re.sub(re_pattern, self._format_repl, format_spec or default) s = str_fix_padding(s) + self.name+=tname return s def __setattr__(self, key: str, value: Any): @@ -165,8 +172,15 @@ def __post_init__(self): def __format__(self, format_spec: str | None): default = "{series} - {season:02}x{episode:02} - {title}" re_pattern = r"({(\w+)(?:\[[\w:]+\]|\.\w+)?(?:\:\d{1,2})?})" + tseries = '' + if ( format_spec is None or re.search("{(series_date.year|series_date|date.year|date)}", format_spec) is not None ) \ + and self.series is not None and self.series_date is not None \ + and self.series.endswith(f" ({self.series_date.year})"): + tseries = f" ({self.series_date.year})" + self.series = self.series[:-len(tseries)] s = re.sub(re_pattern, self._format_repl, format_spec or default) s = str_fix_padding(s) + self.series+=tseries return s def __setattr__(self, key: str, value: Any): diff --git a/mnamer/providers.py b/mnamer/providers.py index b2813aa2..ff37aaa7 100644 --- a/mnamer/providers.py +++ b/mnamer/providers.py @@ -334,14 +334,16 @@ def _search_tvdb_date( found = False for meta in self._search_id(id_tvdb, language=language): if meta.date: - if season is not None and season == meta.season and episode is not None and episode == meta.episode: + if season is not None and season == meta.season \ + and episode is not None and episode == meta.episode: if meta.date == release_date: found = True yield meta - elif release_date.month == 1 and release_date.month == 1: - if meta.date.year == release_date.year or meta.series_date.year == release_date.year: - found = True - yield meta + elif release_date.month == 1 and release_date.month == 1 and \ + ( meta.date.year == release_date.year or \ + meta.series_date.year == release_date.year ): + found = True + yield meta else: if meta.date == release_date: found = True