diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e5c7c08 --- /dev/null +++ b/.gitignore @@ -0,0 +1,228 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos +# Edit at https://www.toptal.com/developers/gitignore?templates=python,visualstudiocode,macos + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+
+# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode,macos
diff --git a/README.md b/README.md
index e4e83d7..339ca94 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,54 @@
-# linkura-localization-template
\ No newline at end of file
+# linkura-localization-template
+
+
+## translation progress
+
+![translation zh-CN](https://img.shields.io/badge/translation_zh--CN-2%2F9-blue)
+![translation en](https://img.shields.io/badge/translation_en-0%2F9-blue)
+
+---
+
+## About
+
+This is a linkura translation template for quickly bootstrapping a translation project.
+
+The template provides a basic file structure and tooling to make translation work easier. You can modify and extend it to fit your needs.
+
+The translation workflow consists of three main stages.
+
+### gentodo
+
+Generates intermediate files from the raw files.
+
+It is **recommended** to generate the formatted raw-text JSON files under the `raw` folder at this stage, as this helps with tracking translation progress.
+
+### translate
+
+A scriptable automated translation tool.
+
+It provides a basic LLM API client and prompt, which you can modify and extend as needed.
+
+Basic usage is as follows:
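+
+A minimal sketch using the bundled helper in `src/translate/api.py` (this assumes `LLM_API_KEY` is set in the environment; provider and model fall back to `LLM_PROVIDER` / `LLM_MODEL`, defaulting to OpenAI):
+
+```python
+from src.translate.api import translate_text, batch_translate
+
+# Translate a single string into zh-CN
+print(translate_text("こんにちは、世界!", target_language="zh-CN"))
+
+# Batch translation; entries that fail keep their original text
+print(batch_translate(["月", "火", "水"], target_language="zh-CN"))
+```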
+
+### generate
+
+Generates the final translation files under the `data` directory:
+
+```typescript data/*.json format
+type i18n = 'zh-CN' | 'en' // extend as more locales are added
+
+type I18n = {
+  [key in i18n]: {
+    text: string,
+    author: string,
+  }
+}
+
+type TranslatedItem = {
+  raw: string,
+  translation: I18n,
+}
+
+type Data = Array<TranslatedItem>
+```
+
+It also records the translation progress.
diff --git a/data/example.json b/data/example.json
new file mode 100644
index 0000000..917b31a
--- /dev/null
+++ b/data/example.json
@@ -0,0 +1,24 @@
+[
+    {
+        "raw": "こんにちは、世界!",
+        "translation": {
+            "zh-CN": {
+                "text": "你好,世界!",
+                "author": "copilot"
+            },
+            "en": {
+                "text": "Hello, world!",
+                "author": "copilot"
+            }
+        }
+    },
+    {
+        "raw": "蓮ノ空",
+        "translation": {
+            "zh-CN": {
+                "text": "莲之空",
+                "author": "copilot"
+            }
+        }
+    }
+]
\ No newline at end of file
diff --git a/example.json b/example.json
new file mode 100644
index 0000000..817df24
--- /dev/null
+++ b/example.json
@@ -0,0 +1,9 @@
+[
+    "月",
+    "火",
+    "水",
+    "木",
+    "金",
+    "土",
+    "日"
+]
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..524166a
--- /dev/null
+++ b/main.py
@@ -0,0 +1,97 @@
+import argparse
+import sys
+from pathlib import Path
+from src.generate import analyze
+
+i18n = ["zh-CN", "en"]
+OUTPUT_DIR = Path("data")
+RAW_DIR = Path("raw")
+README_FILE = Path("README.md")
+
+
+def command_gentodo(args):
+    """Generate translation todo files from the raw files"""
+    print("TODO")
+    print("We recommend generating formatted raw files in the raw directory; this will help you keep track of your translation progress.")
+    return 0
+
+def command_translate(args):
+    """
+    Translation tool providing a basic LLM API translation interface
+
+    Entries can also be translated by hand
+    """
+    print("TODO")
+    return 0
+
+def command_generate(args):
+    """Generate translated files and progress reports"""
+    (total, translated) = analyze.analyze_translation_progress(RAW_DIR, OUTPUT_DIR, locale=args.locale)
+    analyze.write_translation_progress(README_FILE, total, translated, locale=args.locale)
+    return 0
+
+def main():
+    """Main function"""
+    parser = argparse.ArgumentParser(
+        description="Linkura Translation Tool Template",
+        formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+
+    parser.add_argument(
+        '--locale', '-l',
+        default='zh-CN',
+        choices=i18n,
+        help='Translation locale'
+    )
+
+    subparsers = parser.add_subparsers(
+        dest='command',
+        metavar='COMMAND'
+    )
+    # gentodo
+    parser_gentodo = subparsers.add_parser(
+        'gentodo',
+        help='Generate translation todo files from the raw files',
+    )
+    parser_gentodo.add_argument(
+        '--about', '-a',
+        help='Example of a subcommand argument'
+    )
+    parser_gentodo.set_defaults(func=command_gentodo)
+
+    # translate
+    parser_translate = subparsers.add_parser(
+        'translate',
+        help='Translation tool providing a basic LLM API translation interface',
+    )
+    parser_translate.add_argument(
+        '--about', '-a',
+        help='Example of a subcommand argument'
+    )
+    parser_translate.set_defaults(func=command_translate)
+
+    # generate
+    parser_generate = subparsers.add_parser(
+        'generate',
+        help='Generate translated files and progress reports',
+    )
+    parser_generate.add_argument(
+        '--about', '-a',
+        help='Example of a subcommand argument'
+    )
+    parser_generate.set_defaults(func=command_generate)
+
+    args = parser.parse_args()
+
+    if not args.command:
+        parser.print_help()
+        return 1
+
+    try:
+        return args.func(args)
+    except Exception as e:
+        print(f"Error occurred while executing command: {e}", file=sys.stderr)
+        return 1
+
+if __name__ == "__main__":
+    sys.exit(main())
\ No newline at end of file
diff --git a/raw/example.json b/raw/example.json
new file mode 100644
index 0000000..817df24
--- /dev/null
+++ b/raw/example.json
@@ -0,0 +1,9 @@
+[
"月", + "火", + "水", + "木", + "金", + "土", + "日" +] \ No newline at end of file diff --git a/src/generate/analyze.py b/src/generate/analyze.py new file mode 100644 index 0000000..df73b2c --- /dev/null +++ b/src/generate/analyze.py @@ -0,0 +1,125 @@ +from typing import Set, Tuple +from pathlib import Path +import os +import json + + +def load_json_as_sets(file_path: Path, key = "raw") -> Set[str]: + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + if not isinstance(data, list): + raise ValueError("Invalid `data` / `raw` json format") + res = set() + for item in data: + if isinstance(item, str): + res.add(item) + elif isinstance(item, dict): + res.add(item[key]) + return res + +def load_locale_count(file_path: Path, locale = "zh-CN") -> int: + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + if not isinstance(data, list): + raise ValueError("Invalid locale json format") + count = 0 + for item in data: + if not isinstance(item, dict): + continue + if locale in item["translation"] and item["translation"][locale]: + count += 1 + return count + +def analyze_translation_progress(raw_dir: Path, output_dir: Path, locale: str = "zh-CN")-> Tuple[int, int]: + raw_strings = set() + # read raw_dir/*.json & output_dir/*_locale.json + for filename in os.listdir(raw_dir): + if filename.endswith(".json"): + file_path = os.path.join(raw_dir, filename) + raw_strings.update(load_json_as_sets(file_path, "raw")) + translated_strings = 0 + for filename in os.listdir(output_dir): + if filename.endswith(f".json"): + file_path = os.path.join(output_dir, filename) + raw_strings.update(load_json_as_sets(file_path, "raw")) + translated_strings += load_locale_count(file_path, locale) + total = len(raw_strings) + translated = translated_strings + return (total, translated) + +def write_translation_progress(readme_file: Path, total: int, translated: int, locale: str = "zh-CN"): + """ + 更新README.md中的翻译进度badge + + Args: + total: 总字符串数 + translated: 已翻译字符串数 + locale: 语言代码 + """ + # 生成新的badge URL + sheilds_locale = locale.replace('-', '--') + badge_url = f"![translation {locale}](https://img.shields.io/badge/translation_{sheilds_locale}-{translated}%2F{total}-blue)" + + try: + with open(readme_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + except FileNotFoundError: + # 如果README.md不存在,创建一个基本的结构 + lines = [ + "# translation\n", + "\n", + "---\n", + "\n", + "## translation progress\n", + "\n", + "---\n" + ] + + # 查找translation progress section + start_idx = None + end_idx = None + + for i, line in enumerate(lines): + if "## translation progress" in line.lower(): + start_idx = i + elif start_idx is not None and line.strip() == "---": + end_idx = i + break + + if start_idx is None: + # 如果没有找到translation progress section,在文件末尾添加 + lines.extend([ + "\n", + "## translation progress\n", + "\n", + f"{badge_url}\n", + "\n", + "---\n" + ]) + else: + # 在translation progress section中查找现有的badge + locale_found = False + badge_pattern = f"translation_{sheilds_locale}" + + # 搜索现有的locale badge并替换 + for i in range(start_idx + 1, end_idx if end_idx else len(lines)): + if badge_pattern in lines[i]: + lines[i] = f"{badge_url}\n" + locale_found = True + break + + # 如果没有找到现有的locale badge,添加新的 + if not locale_found: + if end_idx is not None: + # 在 --- 之前插入新的badge + lines.insert(end_idx, f"{badge_url}\n") + else: + # 如果没有结束标记,在section末尾添加 + lines.append(f"{badge_url}\n") + lines.append("---\n") + + # 写回文件 + with open(readme_file, 'w', encoding='utf-8') as f: + f.writelines(lines) + + 
+    # Guard against division by zero when no raw strings were found
+    percent = (translated / total * 100) if total else 0.0
+    print(f"Update {locale} translation progress: {translated}/{total} ({percent:.1f}%)")
diff --git a/src/translate/api.py b/src/translate/api.py
new file mode 100644
index 0000000..09da26f
--- /dev/null
+++ b/src/translate/api.py
@@ -0,0 +1,429 @@
+"""
+LLM API Interface for Translation
+Supports OpenAI-compatible, Gemini, and Claude LLM API calls
+"""
+
+import os
+import json
+import requests
+from typing import Optional, Dict, List, Any
+from dataclasses import dataclass
+from enum import Enum
+
+
+class APIProvider(Enum):
+    """API provider enumeration"""
+    OPENAI = "openai"
+    GEMINI = "gemini"
+    CLAUDE = "claude"
+    CUSTOM = "custom"
+
+
+@dataclass
+class APIConfig:
+    """API configuration class"""
+    provider: APIProvider
+    api_key: str
+    base_url: str
+    model: str
+    temperature: float = 0.7
+    max_tokens: int = 2048
+    timeout: int = 30
+
+
+class LLMAPIClient:
+    """LLM API client, supports multiple API formats"""
+
+    def __init__(self, config: Optional[APIConfig] = None):
+        """
+        Initialize API client
+
+        Args:
+            config: API configuration, if None load from environment variables
+        """
+        if config is None:
+            config = self._load_config_from_env()
+        self.config = config
+
+    def _load_config_from_env(self) -> APIConfig:
+        """Load configuration from environment variables"""
+        provider_str = os.getenv('LLM_PROVIDER', 'openai').lower()
+        provider = APIProvider(provider_str)
+
+        api_key = os.getenv('LLM_API_KEY')
+        if not api_key:
+            raise ValueError("LLM_API_KEY environment variable is required")
+
+        base_url = os.getenv('LLM_BASE_URL', self._get_default_base_url(provider))
+        model = os.getenv('LLM_MODEL', self._get_default_model(provider))
+        temperature = float(os.getenv('LLM_TEMPERATURE', '0.7'))
+        max_tokens = int(os.getenv('LLM_MAX_TOKENS', '2048'))
+        timeout = int(os.getenv('LLM_TIMEOUT', '30'))
+
+        return APIConfig(
+            provider=provider,
+            api_key=api_key,
+            base_url=base_url,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            timeout=timeout
+        )
+
+    def _get_default_base_url(self, provider: APIProvider) -> str:
+        """Get default base URL"""
+        defaults = {
+            APIProvider.OPENAI: "https://api.openai.com/v1",
+            APIProvider.GEMINI: "https://generativelanguage.googleapis.com/v1",
+            APIProvider.CLAUDE: "https://api.anthropic.com",
+            APIProvider.CUSTOM: "http://localhost:8000"
+        }
+        return defaults.get(provider, "https://api.openai.com/v1")
+
+    def _get_default_model(self, provider: APIProvider) -> str:
+        """Get default model name"""
+        defaults = {
+            APIProvider.OPENAI: "gpt-3.5-turbo",
+            APIProvider.GEMINI: "gemini-pro",
+            APIProvider.CLAUDE: "claude-3-sonnet-20240229",
+            APIProvider.CUSTOM: "default"
+        }
+        return defaults.get(provider, "gpt-3.5-turbo")
+
+    def chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Send chat completion request
+
+        Args:
+            messages: Message list, format: [{"role": "user", "content": "..."}]
+            temperature: Temperature parameter
+            max_tokens: Maximum token count
+            **kwargs: Other parameters
+
+        Returns:
+            API response result
+        """
+        if self.config.provider == APIProvider.OPENAI:
+            return self._openai_chat_completion(messages, temperature, max_tokens, **kwargs)
+        elif self.config.provider == APIProvider.GEMINI:
+            return self._gemini_chat_completion(messages, temperature, max_tokens, **kwargs)
+        elif self.config.provider == APIProvider.CLAUDE:
+            return self._claude_chat_completion(messages, temperature, max_tokens, **kwargs)
+        else:
+            return self._custom_chat_completion(messages, temperature, max_tokens, **kwargs)
+
+    def _openai_chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """OpenAI format chat completion request"""
+        url = f"{self.config.base_url}/chat/completions"
+
+        headers = {
+            "Authorization": f"Bearer {self.config.api_key}",
+            "Content-Type": "application/json"
+        }
+
+        payload = {
+            "model": self.config.model,
+            "messages": messages,
+            "temperature": temperature or self.config.temperature,
+            "max_tokens": max_tokens or self.config.max_tokens,
+            **kwargs
+        }
+
+        response = requests.post(
+            url,
+            headers=headers,
+            json=payload,
+            timeout=self.config.timeout
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def _gemini_chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Gemini format chat completion request"""
+        url = f"{self.config.base_url}/models/{self.config.model}:generateContent"
+
+        headers = {
+            "Content-Type": "application/json",
+            "x-goog-api-key": self.config.api_key
+        }
+
+        # Convert message format to Gemini format
+        gemini_messages = self._convert_to_gemini_format(messages)
+
+        payload = {
+            "contents": gemini_messages,
+            "generationConfig": {
+                "temperature": temperature or self.config.temperature,
+                "maxOutputTokens": max_tokens or self.config.max_tokens,
+                **kwargs
+            }
+        }
+
+        response = requests.post(
+            url,
+            headers=headers,
+            json=payload,
+            timeout=self.config.timeout
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def _claude_chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Claude format chat completion request"""
+        url = f"{self.config.base_url}/v1/messages"
+
+        # The Anthropic API authenticates with the x-api-key header, not a Bearer token
+        headers = {
+            "x-api-key": self.config.api_key,
+            "Content-Type": "application/json",
+            "anthropic-version": "2023-06-01"
+        }
+
+        # Extract system message
+        system_message = None
+        user_messages = []
+
+        for msg in messages:
+            if msg["role"] == "system":
+                system_message = msg["content"]
+            else:
+                user_messages.append(msg)
+
+        payload = {
+            "model": self.config.model,
+            "messages": user_messages,
+            "temperature": temperature or self.config.temperature,
+            "max_tokens": max_tokens or self.config.max_tokens,
+            **kwargs
+        }
+
+        if system_message:
+            payload["system"] = system_message
+
+        response = requests.post(
+            url,
+            headers=headers,
+            json=payload,
+            timeout=self.config.timeout
+        )
+        response.raise_for_status()
+        return response.json()
+
+    def _custom_chat_completion(
+        self,
+        messages: List[Dict[str, str]],
+        temperature: Optional[float] = None,
+        max_tokens: Optional[int] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Custom format chat completion request (OpenAI compatible)"""
+        return self._openai_chat_completion(messages, temperature, max_tokens, **kwargs)
+
+    def _convert_to_gemini_format(self, messages: List[Dict[str, str]]) -> List[Dict[str, Any]]:
+        """Convert OpenAI format messages to Gemini format"""
+        gemini_contents = []
+
+        for message in messages:
+            role = message["role"]
+            content = message["content"]
+
+            if role == "system":
+                # Gemini merges system messages into user messages
+                gemini_contents.append({
+                    "role": "user",
+                    "parts": [{"text": f"System: {content}"}]
+                })
+            elif role == "user":
+                gemini_contents.append({
+                    "role": "user",
+                    "parts": [{"text": content}]
+                })
+            elif role == "assistant":
+                gemini_contents.append({
+                    "role": "model",
+                    "parts": [{"text": content}]
+                })
+
+        return gemini_contents
+
+    def extract_response_content(self, response: Dict[str, Any]) -> str:
+        """Extract text content from API response"""
+        if self.config.provider == APIProvider.OPENAI or self.config.provider == APIProvider.CUSTOM:
+            return response["choices"][0]["message"]["content"]
+        elif self.config.provider == APIProvider.GEMINI:
+            return response["candidates"][0]["content"]["parts"][0]["text"]
+        elif self.config.provider == APIProvider.CLAUDE:
+            return response["content"][0]["text"]
+        else:
+            raise ValueError(f"Unsupported provider: {self.config.provider}")
+
+
+def translate_text(
+    text: str,
+    target_language: str = "zh-CN",
+    source_language: str = "auto",
+    api_key: Optional[str] = None,
+    provider: str = "openai",
+    model: Optional[str] = None,
+    **kwargs
+) -> str:
+    """
+    Convenience function for translating text
+
+    Args:
+        text: Text to translate
+        target_language: Target language
+        source_language: Source language
+        api_key: API key, if None read from environment variables
+        provider: API provider
+        model: Model name
+        **kwargs: Other parameters
+
+    Returns:
+        Translated text
+    """
+    # Build configuration
+    if api_key:
+        config = APIConfig(
+            provider=APIProvider(provider.lower()),
+            api_key=api_key,
+            base_url=kwargs.get('base_url', ''),
+            model=model or '',
+            temperature=kwargs.get('temperature', 0.3),
+            max_tokens=kwargs.get('max_tokens', 2048)
+        )
+        if not config.base_url:
+            config.base_url = LLMAPIClient(config)._get_default_base_url(config.provider)
+        if not config.model:
+            config.model = LLMAPIClient(config)._get_default_model(config.provider)
+
+        client = LLMAPIClient(config)
+    else:
+        client = LLMAPIClient()
+
+    # base_url is consumed by the config above and is not a chat_completion
+    # parameter, so drop it before forwarding kwargs to the request
+    kwargs.pop('base_url', None)
+
+    # Build translation prompt
+    if source_language == "auto":
+        prompt = f"""Please translate the following text to {target_language}:
+
+{text}
+
+Please return only the translation result without additional explanations."""
+    else:
+        prompt = f"""Please translate the following {source_language} text to {target_language}:
+
+{text}
+
+Please return only the translation result without additional explanations."""
+
+    messages = [
+        {"role": "system", "content": "You are a professional translation assistant capable of accurately translating various languages."},
+        {"role": "user", "content": prompt}
+    ]
+
+    # Send request
+    response = client.chat_completion(messages, **kwargs)
+
+    # Extract translation result
+    return client.extract_response_content(response).strip()
+
+
+def batch_translate(
+    texts: List[str],
+    target_language: str = "zh-CN",
+    source_language: str = "auto",
+    **kwargs
+) -> List[str]:
+    """
+    Batch translate texts
+
+    Args:
+        texts: List of texts to translate
+        target_language: Target language
+        source_language: Source language
+        **kwargs: Other parameters
+
+    Returns:
+        List of translated texts
+    """
+    results = []
+
+    for text in texts:
+        try:
+            translated = translate_text(
+                text,
+                target_language=target_language,
+                source_language=source_language,
+                **kwargs
+            )
+            results.append(translated)
+        except Exception as e:
+            print(f"Translation failed for '{text}': {e}")
+            results.append(text)  # Keep original text when translation fails
+
+    return results
+
+
+# Environment variable configuration example:
+# export LLM_PROVIDER=openai  # or gemini, claude, custom
+# export LLM_API_KEY=your_api_key
+# export 
LLM_BASE_URL=https://api.openai.com/v1 +# export LLM_MODEL=gpt-3.5-turbo +# export LLM_TEMPERATURE=0.7 +# export LLM_MAX_TOKENS=2048 +# export LLM_TIMEOUT=30 + + +if __name__ == "__main__": + # Usage examples + + # 1. Using environment variable configuration + try: + result = translate_text("Hello, world!", "zh-CN") + print(f"Translation result: {result}") + except Exception as e: + print(f"Translation failed: {e}") + + # 2. Using function parameter configuration + try: + result = translate_text( + "Hello, world!", + "zh-CN", + api_key="your_api_key", + provider="openai", + model="gpt-3.5-turbo" + ) + print(f"Translation result: {result}") + except Exception as e: + print(f"Translation failed: {e}") + + # 3. Batch translation + texts = ["Hello", "World", "Python"] + try: + results = batch_translate(texts, "zh-CN") + for original, translated in zip(texts, results): + print(f"{original} -> {translated}") + except Exception as e: + print(f"Batch translation failed: {e}") \ No newline at end of file
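
For reference, a minimal end-to-end sketch of the generate-stage data flow (a hypothetical `demo.py` at the repository root; it assumes the example files above exist and `LLM_API_KEY` is set in the environment, and the `"llm"` author label is just a placeholder):

```python
import json
from pathlib import Path

from src.translate.api import batch_translate

# raw/*.json files are plain JSON arrays of source strings
raw_strings = json.loads(Path("raw/example.json").read_text(encoding="utf-8"))

# Translate, then assemble the data/*.json structure described in the README;
# per the I18n type, "author" is a free-form string
translations = batch_translate(raw_strings, target_language="zh-CN")
data = [
    {"raw": raw, "translation": {"zh-CN": {"text": text, "author": "llm"}}}
    for raw, text in zip(raw_strings, translations)
]

Path("data/example.generated.json").write_text(
    json.dumps(data, ensure_ascii=False, indent=4), encoding="utf-8"
)
```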