+```
+
+## 🤝 Contributing
+
+
+
+**Love this project? Here's how you can help:**
+
+[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
+[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
+[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
+
+
+
+### 👥 Contributors
+
+🤗 shhr.kre@gmail.com / @harheem
+🤗 jminj6@gmail.com / @Jwaminju
+
+## 💡 Use Cases
+
+> **🌟 Real-world scenarios where this agent shines:**
+
+- **📚 Documentation Teams**: Batch translate Transformers documentation updates
+- **🌍 Community Contributors**: Help make Transformers accessible in your language
+- **🏢 Organizations**: Streamline i18n workflows for the Transformers library
+- **👨‍💻 Developers**: Contribute Transformers translations without a manual GitHub workflow
+- **🎯 Issue #20179 Contributors**: Directly address the internationalization challenges raised by the community
+
+## 🛠️ Tech Stack
+
+
+
+
+
+
+
+
+
+
+
+## ❓ FAQ
+
+
+Q: How does this relate to Issue #20179?
+
+This agent directly addresses the pain points raised in Issue #20179 by automating the translation workflow, reducing manual overhead, and making it easier for contributors to submit high-quality translations.
+
+
+
+Q: How accurate are the translations?
+
+The agent uses Claude Sonnet 4, which provides high-quality translations with technical context awareness. It preserves code blocks, maintains formatting, and follows established translation patterns.
+
+
+
+Q: What permissions do I need for GitHub integration?
+
+Your GitHub token needs repository read/write permissions and the ability to create branches and pull requests on the target repository.
+
+
+
+Q: Can I customize the translation style?
+
+Yes! You can provide reference PR URLs to match existing translation patterns and maintain consistency with community standards.
+
+
+## 🐛 Troubleshooting
+
+### Common Issues
+
+
+API Key Issues
+
+- Ensure your Anthropic API key is valid and has sufficient credits
+- Check that your GitHub token has the necessary repository permissions
+
+
+
+
+Translation Quality
+
+- The system uses Claude Sonnet 4 for high-quality translations
+- Formatting and markdown structure are maintained
+- If you run into a formatting issue, please restart the translation
+
+
+
+
+GitHub PR Creation
+
+- Verify repository permissions and branch protection rules
+- Check that the reference PR URL is accessible and valid
+
+
+
+
+## 🙏 Acknowledgments
+
+Special thanks to the amazing communities that make this possible:
+
+- **🤗 Hugging Face** - For building the Transformers library and comprehensive documentation
+- **🎭 Anthropic** - For Claude's incredible language capabilities
+- **👥 Hugging Face KREW Community** - For championing Korean AI translation
+- **🎨 Gradio** - For making beautiful AI interfaces simple
+- **🌍 Community Contributors** - For raising awareness through [Issue #20179](https://github.com/huggingface/transformers/issues/20179)
+
+---
+
+
+
+**Made with ❤️ for global accessibility of Hugging Face Transformers documentation.**
+
+**🎯 Solving [Issue #20179](https://github.com/huggingface/transformers/issues/20179) one translation at a time.**
+
+[⭐ Star this repo](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [🐛 Report Bug](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [💡 Request Feature](https://github.com/Hugging-Face-KREW/i18n-agent.git)
+
+
diff --git a/agent/handler.py b/agent/handler.py
index ad40eb1..37664af 100644
--- a/agent/handler.py
+++ b/agent/handler.py
@@ -1,639 +1,639 @@
-"""Module for gradio chat-based translation agent interface."""
-
-import os
-import re
-from pathlib import Path
-
-import gradio as gr
-
-from agent.workflow import (
- report_translation_target_files,
- translate_docs_interactive,
- generate_github_pr,
-)
-from pr_generator.searcher import find_reference_pr_simple_stream
-from translator.content import get_full_prompt, get_content, preprocess_content
-from translator.project_config import get_available_projects, get_project_config
-
-
-# State management
-class ChatState:
- def __init__(self):
- self.step = "welcome" # welcome -> find_files -> translate -> create_github_pr
-
- # Transient state (reset on restart)
- self.selected_project = "transformers"
- self.target_language = "ko"
- self.k_files = 10
- self.files_to_translate = []
- self.additional_instruction = ""
- self.current_file_content = {"translated": ""}
- self.pr_result = None
-
- # Persistent settings (preserved across restarts)
- self.persistent_settings = {
- "anthropic_api_key": "",
- "aws_bearer_token_bedrock": "",
- "github_config": {
- "token": "",
- "owner": "",
- "repo_name": "",
- "reference_pr_url": "",
- }
- }
-
- def reset_transient_state(self):
- """Reset only the workflow state, keep persistent settings"""
- self.step = "welcome"
- self.selected_project = "transformers"
- self.target_language = "ko"
- self.k_files = 10
- self.files_to_translate = []
- self.additional_instruction = ""
- self.current_file_content = {"translated": ""}
- self.pr_result = None
-
- @property
- def github_config(self):
- return self.persistent_settings["github_config"]
-
-
-state = ChatState()
-
-
-def _extract_content_for_display(content: str) -> str:
- """Extract text from document for display."""
- # Remove Copyright header
- to_translate = re.sub(r"", "", content, count=1, flags=re.DOTALL)
- to_translate = to_translate.strip()
- ## remove code blocks from text
- to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
- ## remove markdown tables from text
- to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
- ## remove empty lines from text
- to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
-
- return to_translate
-
-
-def get_welcome_message():
- """Initial welcome message with project selection"""
- return """**👋 Welcome to 🌐 Hugging Face i18n Translation Agent!**
-
-I'll help you find files that need translation and translate them in a streamlined workflow.
-
-**🎯 First, select which project you want to translate:**
-
-Use the **`Quick Controls`** on the right to select a project, or **ask me `what`, `how`, or `help`** to get started.
-"""
-
-
-def process_file_search_handler(project: str, lang: str, k: int, history: list) -> tuple:
- """Process file search request and update Gradio UI components."""
- global state
- state.selected_project = project
- state.target_language = lang
- state.k_files = k
- state.step = "find_files"
-
- try:
- status_report, files_list = report_translation_target_files(project, lang, k)
- except Exception as e:
- if "rate limit" in str(e).lower():
- response = f"""❌ **GitHub API Rate Limit Exceeded**
-
-{str(e)}
-
-**💡 To fix this:**
-1. Set GitHub Token in Configuration panel above
-2. Click "💾 Save Configuration"
-3. Try "Find Files" again"""
- history.append(["File search request", response])
- return history, "", update_status(), gr.Tabs(selected=0), gr.update(choices=[]), gr.update(visible=False)
- else:
- raise # Re-raise non-rate-limit errors
- state.files_to_translate = (
- [file[0] for file in files_list]
- if files_list
- else []
- )
-
- response = f"""**✅ File search completed!**
-
-**Status Report:**
-{status_report}
-
-**📁 Found first {len(state.files_to_translate)} files to translate:**
-"""
-
- if state.files_to_translate:
- config = get_project_config(state.selected_project)
- for i, file in enumerate(state.files_to_translate, 1):
- file_link = f"{config.repo_url}/blob/main/{file}"
- response += f"\n{i}. [`{file}`]({file_link})"
-
- # if len(state.files_to_translate) > 5:
- # response += f"\n... and {len(state.files_to_translate) - 5} more files"
-
- response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
- else:
- response += "\nNo files found that need translation."
-
- # Add to history
- history.append(["Please find files that need translation", response])
- cleared_input = ""
-
- # 드롭다운 choices로 쓸 파일 리스트 반환 추가
- return (
- history,
- cleared_input,
- update_status(),
- gr.Tabs(), # Don't change tab
- update_dropdown_choices(state.files_to_translate),
- )
-
-
-def update_dropdown_choices(file_list):
- return gr.update(choices=file_list, value=None)
-
-
-def confirm_and_go_translate_handler(history):
- """Confirm selection and go to translate tab"""
- global state
-
- response = f"✅ **Selection confirmed!**\n\n🎯 **Project:** {state.selected_project}\n🌍 **Language:** {state.target_language}\n\n**➡️ Go to Tab 2 to start translation.**"
- history.append(["Confirm selection", response])
- return history, "", update_status(), gr.Tabs(selected=1)
-
-
-def confirm_translation_and_go_upload_handler(history):
- """Confirm translation and go to upload PR tab"""
- global state
-
- if not state.current_file_content.get("translated"):
- response = "❌ No translation available. Please complete translation first."
- history.append(["Upload PR request", response])
- return history, "", update_status(), gr.Tabs()
-
- response = f"✅ **Translation confirmed!**\n\n📄 **File:** `{state.files_to_translate[0] if state.files_to_translate else 'Unknown'}`\n\n**➡️ Go to Tab 3 to upload PR.**"
- history.append(["Upload PR request", response])
- return history, "", update_status(), gr.Tabs(selected=2)
-
-
-def start_translation_process(force_retranslate=False):
- """Start the translation process for the first file"""
- if not state.files_to_translate:
- return "❌ No files available for translation.", ""
-
- current_file = state.files_to_translate[0]
-
- # Call translation function (simplified for demo)
- try:
- status, translated = translate_docs_interactive(
- state.target_language, [[current_file]], state.additional_instruction, state.selected_project, force_retranslate
- )
-
- state.current_file_content = {"translated": translated}
- path = (
- Path(__file__).resolve().parent.parent
- / f"translation_result/{current_file}"
- )
- p = Path(path)
- p.parent.mkdir(parents=True, exist_ok=True)
- p.write_text(translated, encoding="utf-8")
-
- config = get_project_config(state.selected_project)
- original_file_link = f"{config.repo_url}/blob/main/{current_file}"
- print("Compeleted translation:\n")
- print(translated)
- print("----------------------------")
-
- # Different response format for existing vs new translation
- if isinstance(status, str) and "Existing translation loaded" in status:
- response = f"{status}\n**📄 Original Content Link:** {original_file_link}\n\n**🌐 Translated Content:**"
- else:
- response = (
- f"""🔄 Translation for: `{current_file}`\n"""
- f"**📄 Original Content Link:** {original_file_link}\n\n"
- f"{status}\n\n"
- "**🌐 Translated Content:**"
- )
- return response, translated
-
-
- except Exception as e:
- response = f"❌ Translation failed: {str(e)}"
- response += "\n**➡️ Please try from the beginning.**"
- return response, ""
-
-
-def handle_general_message(message):
- """Handle general messages"""
- message_lower = message.lower()
-
- if any(word in message_lower for word in ["help", "what", "how"]):
- return """**🤖 I'm your Hugging Face i18n Translation Agent!**
-
-I can help you:
-1. **🔍 Find files** that need translation
-2. **🌐 Translate documents** using AI
-3. **📋 Review translations** for quality
-4. **🚀 Create GitHub PR** for translation
-
-Currently available actions with quick controls:
-- "find files" - Search for files needing translation
-- "translate" - Start translation process
-- "review" - Review current translation
-- "github" - Create GitHub Pull Request
-- "restart" - Start over"""
-
- elif "restart" in message_lower:
- global state
- state = ChatState()
- return get_welcome_message()
-
- else:
- return """I understand you want to work on translations!
-
-**Two ways to get started:**
-
-1. **🔍 Find Files first** - Use Tab 1 to discover files that need translation
-2. **🚀 Direct Translation** - Go to Tab 2 and enter a file path directly (e.g., `docs/source/en/model_doc/bert.md`)
-
-Make sure to configure your API keys in the Configuration panel above.
-"""
-
-
-# Main handler
-def handle_user_message(message, history):
- """Handle user messages and provide appropriate responses"""
- global state
-
- if not message.strip():
- return history, ""
-
- elif state.step == "find_files" and any(
- word in message.lower()
- for word in ["yes", "proceed", "start", "translate", "translation"]
- ):
- # User wants to start translation
- if state.files_to_translate:
- state.step = "translate"
- response, translated = start_translation_process()
- history.append([message, response])
- history.append(["", translated])
- return history, ""
- else:
- response = (
- "❌ No files available for translation. Please search for files first."
- )
- # Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
- else:
- # General response
- response = handle_general_message(message)
-
- history.append([message, response])
- return history, ""
-
-
-def update_status():
- if state.step == "welcome":
- return f"""
-
-
🔄 Step: Welcome
-
🎯 Project: {state.selected_project}
-
📁 Files: 0
-
🌍 Language: {state.target_language}
-
- """
-
- step_map = {
- "welcome": "Welcome",
- "find_files": "Finding Files",
- "translate": "Translating",
- "review": "Reviewing",
- "create_github_pr": "Creating PR",
- }
-
- progress_map = {
- "welcome": "Ready to start",
- "find_files": "Files found",
- "translate": f"{len(state.files_to_translate)} remaining",
- "review": "Review complete",
- "create_github_pr": "PR generation in progress",
- }
-
- # Check GitHub configuration status
- github_status = "❌ Not configured"
- if all(
- [
- state.github_config["token"],
- state.github_config["owner"],
- state.github_config["repo_name"],
- ]
- ):
- github_status = (
- f"✅ {state.github_config['owner']}/{state.github_config['repo_name']}"
- )
-
- status_html = f"""
-
-
🔄 Step: {step_map.get(state.step, state.step)}
-
🎯 Project: {state.selected_project}
-
📁 Files: {len(state.files_to_translate)}
-
🌍 Language: {state.target_language}
-
⏳ Progress: {progress_map.get(state.step, 'In progress')}
-
🔧 GitHub: {github_status}
-
- """
-
- return status_html
-
-
-# Event handlers
-
-
-def sync_language_displays(lang):
- return lang
-
-
-def update_project_selection(project, history):
- """Update state when project is selected"""
- global state
- state.selected_project = project
- response = f"Selection confirmed: 🎯 Project → **{project}**"
- history.append(["Project selection", response])
- return history, "", update_status()
-
-
-def update_language_selection(lang, history):
- """Update state when language is selected"""
- global state
- state.target_language = lang
- response = f"Selection confirmed: 🌍 Language → **{lang}**"
- history.append(["Language selection", response])
- return history, "", update_status(), lang
-
-
-def update_persistent_config(api_provider, anthropic_key, aws_bearer_token_bedrock, github_token, github_owner, github_repo, reference_pr_url, history):
- """Update persistent configuration settings."""
- global state
-
- # Update API keys based on provider selection
- if api_provider == "Anthropic":
- state.persistent_settings["anthropic_api_key"] = anthropic_key
- os.environ["ANTHROPIC_API_KEY"] = anthropic_key
- # Clear AWS Bedrock token if Anthropic is selected
- state.persistent_settings["aws_bearer_token_bedrock"] = ""
- os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
- elif api_provider == "AWS Bedrock":
- state.persistent_settings["aws_bearer_token_bedrock"] = aws_bearer_token_bedrock
- os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
- # Clear Anthropic key if AWS Bedrock is selected
- state.persistent_settings["anthropic_api_key"] = ""
- os.environ.pop("ANTHROPIC_API_KEY", None)
- else:
- # If no provider is selected or unknown, clear both
- state.persistent_settings["anthropic_api_key"] = ""
- os.environ.pop("ANTHROPIC_API_KEY", None)
- state.persistent_settings["aws_bearer_token_bedrock"] = ""
- os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
-
- if github_token:
- os.environ["GITHUB_TOKEN"] = github_token
-
- # Get default reference PR URL from project config if not provided
- if not reference_pr_url and state.selected_project:
- try:
- config = get_project_config(state.selected_project)
- reference_pr_url = config.reference_pr_url
- except:
- pass
-
- # Save GitHub configuration to persistent settings
- state.persistent_settings["github_config"].update({
- "token": github_token or "",
- "owner": github_owner or "",
- "repo_name": github_repo or "",
- "reference_pr_url": reference_pr_url or "",
- })
-
- # Build response message based on what was configured
- response = "✅ Configuration saved!"
- if github_owner and github_repo:
- response += f" GitHub: {github_owner}/{github_repo}"
-
- if api_provider == "Anthropic" and anthropic_key:
- response += " Anthropic API key updated."
- elif api_provider == "AWS Bedrock" and aws_bearer_token_bedrock:
- response += " AWS Bedrock Bearer Token updated."
-
- history.append(["Configuration update", response])
- return history, "", update_status()
-
-
-def update_github_config(token, owner, repo, reference_pr_url):
- """Legacy function for backward compatibility."""
- return update_persistent_config("", token, owner, repo, reference_pr_url)
-
-
-def update_prompt_preview(language, file_path, additional_instruction):
- """Update prompt preview based on current settings"""
- if not file_path.strip():
- return "Select a file to see the prompt preview..."
-
- try:
- # Get language name
- if language == "ko":
- translation_lang = "Korean"
- else:
- translation_lang = language
-
- # Get sample content (first 500 characters)
- content = get_content(file_path, state.selected_project)
- to_translate = preprocess_content(content)
-
- # Truncate for preview
- sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")
-
- # Generate prompt
- prompt = get_full_prompt(translation_lang, sample_content, additional_instruction)
-
- return prompt
- except Exception as e:
- error_str = str(e)
- if "Failed to retrieve content from the URL" in error_str:
- return f"❌ **File not found:** `{file_path}`\n\n💡 **Please check:**\n1. Is this file in the **{state.selected_project}** project?\n2. Use \"🔍 Find Files to Translate\" to see available files\n3. Verify the file path is correct"
- return f"Error generating prompt preview: {error_str}"
-
-
-def send_message(message, history):
- new_history, cleared_input = handle_user_message(message, history)
- return new_history, cleared_input, update_status()
-
-
-# Button handlers with tab switching
-def start_translate_handler(history, file_to_translate, additional_instruction="", force_retranslate=False):
- # Use persistent anthropic key
- anthropic_key = state.persistent_settings["anthropic_api_key"]
- aws_bearer_token_bedrock = state.persistent_settings["aws_bearer_token_bedrock"]
-
- if not anthropic_key and not aws_bearer_token_bedrock:
- response = "❌ Please set either Anthropic API key or AWS Bearer Token for Bedrock in Configuration panel first."
- history.append(["Translation request", response])
- return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
-
- # Set the active API key to environment variable for translator.content.py
- if anthropic_key:
- os.environ["ANTHROPIC_API_KEY"] = anthropic_key
- os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None) # Ensure only one is active
- elif aws_bearer_token_bedrock:
- os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
- os.environ.pop("ANTHROPIC_API_KEY", None) # Ensure only one is active
-
- # Check if file path is provided
- if not file_to_translate or not file_to_translate.strip():
- response = "❌ Please select a file from the dropdown or enter a file path to translate."
- history.append(["Translation request", response])
- return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
-
- state.additional_instruction = additional_instruction
- state.files_to_translate = [file_to_translate]
- state.step = "translate"
-
- # Start translation directly
- if force_retranslate:
- history.append(["Translation request", "🔄 **Force retranslation started...**"])
- response, translated = start_translation_process(force_retranslate)
- history.append(["", response])
- if translated:
- history.append(["", translated])
-
- # Update button text and show confirm button after translation
- start_btn_text = "🔄 Retranslation" if state.current_file_content["translated"] else "🚀 Start Translation"
- confirm_btn_visible = bool(state.current_file_content["translated"])
-
- return history, "", update_status(), gr.Tabs(), gr.update(value=start_btn_text), gr.update(visible=confirm_btn_visible)
-
-
-def approve_handler(history, owner, repo, reference_pr_url):
- """Handles the request to generate a GitHub PR."""
- global state
- state.step = "create_github_pr"
-
- # Check all required GitHub configuration at once
- github_config = state.persistent_settings["github_config"]
- missing_config = []
-
- if not github_config.get("token"):
- missing_config.append("GitHub Token")
- if not owner:
- missing_config.append("GitHub Owner")
- if not repo:
- missing_config.append("Repository Name")
-
- if missing_config:
- config = get_project_config(state.selected_project)
- repo_name = config.repo_url.split('/')[-1] # Extract repo name from URL
- response = f"❌ Please set the following in Configuration panel first: {', '.join(missing_config)}\n\n💡 **Note:** GitHub Owner/Repository should be your fork of [`{repo_name}`]({config.repo_url}) (e.g., Owner: `your-username`, Repository: `{repo_name}`)"
- history.append(["GitHub PR creation request", response])
- return history, "", update_status()
-
- # Update reference PR URL (can be set per PR)
- if reference_pr_url:
- state.persistent_settings["github_config"]["reference_pr_url"] = reference_pr_url
-
- # Use persistent settings
- github_config = state.persistent_settings["github_config"]
-
- # Initialize response variable
- response = ""
-
- # If reference PR is not provided, use the agent to find one
- if not github_config.get("reference_pr_url"):
- response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
- try:
- # This part is simplified to avoid streaming logic in a non-generator function
- stream_gen = find_reference_pr_simple_stream(
- target_language=state.target_language,
- context="documentation translation",
- )
- # We will just get the final result from the generator
- final_result = None
- try:
- while True:
- # We are not interested in the streamed messages here, just the final result.
- next(stream_gen)
- except StopIteration as e:
- final_result = e.value
-
- if final_result and final_result.get("status") == "success":
- result_text = final_result.get("result", "")
- match = re.search(r"https://github.com/[^\s]+", result_text)
- if match:
- found_url = match.group(0)
- state.github_config["reference_pr_url"] = found_url
- response += f"\n✅ **Agent found a reference PR:** {found_url}"
- else:
- raise ValueError(
- "Could not extract a valid PR URL from agent's response."
- )
- else:
- error_message = final_result.get("message") or final_result.get(
- "result", "Unknown error"
- )
- raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")
- except Exception as e:
- response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
- history.append(["Agent searching for PR", response])
- return history, "", update_status()
-
- # Proceed with PR generation
- if state.files_to_translate and state.current_file_content.get("translated"):
- current_file = state.files_to_translate[0]
- translated_content = state.current_file_content["translated"]
- response += "\n\n🚀 **Generating GitHub PR...**"
-
- # Extract title from file for toctree mapping
- file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
- print(file_name)
-
- pr_response = generate_github_pr(
- target_language=state.target_language,
- filepath=current_file,
- translated_content=translated_content,
- github_config=state.github_config,
- en_title=file_name,
- project=state.selected_project,
- )
- response += f"\n{pr_response}"
- else:
- response = "❌ No translated file available. Please complete the translation process first."
-
- history.append(["GitHub PR creation request", response])
- return history, "", update_status()
-
-
-def restart_handler(history):
- """Resets the workflow state but preserves persistent settings."""
- global state
- # Backup persistent settings
- backup_settings = state.persistent_settings.copy()
-
- # Reset state
- state = ChatState()
-
- # Restore persistent settings
- state.persistent_settings = backup_settings
-
- # Restore environment variables
- if backup_settings["anthropic_api_key"]:
- os.environ["ANTHROPIC_API_KEY"] = backup_settings["anthropic_api_key"]
- if backup_settings["aws_bearer_token_bedrock"]:
- os.environ["AWS_BEARER_TOKEN_BEDROCK"] = backup_settings["aws_bearer_token_bedrock"]
- if backup_settings["github_config"]["token"]:
- os.environ["GITHUB_TOKEN"] = backup_settings["github_config"]["token"]
-
- welcome_msg = get_welcome_message()
- new_hist = [[None, welcome_msg]]
- return new_hist, "", update_status(), gr.Tabs(selected=0)
+"""Module for gradio chat-based translation agent interface."""
+
+import os
+import re
+from pathlib import Path
+
+import gradio as gr
+
+from agent.workflow import (
+ report_translation_target_files,
+ translate_docs_interactive,
+ generate_github_pr,
+)
+from pr_generator.searcher import find_reference_pr_simple_stream
+from translator.content import get_full_prompt, get_content, preprocess_content
+from translator.project_config import get_available_projects, get_project_config
+
+
+# State management
+class ChatState:
+ def __init__(self):
+ self.step = "welcome" # welcome -> find_files -> translate -> create_github_pr
+
+ # Transient state (reset on restart)
+ self.selected_project = "transformers"
+ self.target_language = "ko"
+ self.k_files = 10
+ self.files_to_translate = []
+ self.additional_instruction = ""
+ self.current_file_content = {"translated": ""}
+ self.pr_result = None
+
+ # Persistent settings (preserved across restarts)
+ self.persistent_settings = {
+ "anthropic_api_key": "",
+ "aws_bearer_token_bedrock": "",
+ "github_config": {
+ "token": "",
+ "owner": "",
+ "repo_name": "",
+ "reference_pr_url": "",
+ }
+ }
+
+ def reset_transient_state(self):
+ """Reset only the workflow state, keep persistent settings"""
+ self.step = "welcome"
+ self.selected_project = "transformers"
+ self.target_language = "ko"
+ self.k_files = 10
+ self.files_to_translate = []
+ self.additional_instruction = ""
+ self.current_file_content = {"translated": ""}
+ self.pr_result = None
+
+ @property
+ def github_config(self):
+ return self.persistent_settings["github_config"]
+
+
+state = ChatState()
+
+
+def _extract_content_for_display(content: str) -> str:
+ """Extract text from document for display."""
+ # Remove Copyright header
+ to_translate = re.sub(r"", "", content, count=1, flags=re.DOTALL)
+ to_translate = to_translate.strip()
+ ## remove code blocks from text
+ to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
+ ## remove markdown tables from text
+ to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
+ ## remove empty lines from text
+ to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
+
+ return to_translate
+
+
+def get_welcome_message():
+ """Initial welcome message with project selection"""
+ return """**👋 Welcome to 🌐 Hugging Face i18n Translation Agent!**
+
+I'll help you find files that need translation and translate them in a streamlined workflow.
+
+**🎯 First, select which project you want to translate:**
+
+Use the **`Quick Controls`** on the right to select a project, or **ask me `what`, `how`, or `help`** to get started.
+"""
+
+
+def process_file_search_handler(project: str, lang: str, k: int, history: list) -> tuple:
+ """Process file search request and update Gradio UI components."""
+ global state
+ state.selected_project = project
+ state.target_language = lang
+ state.k_files = k
+ state.step = "find_files"
+
+ try:
+ status_report, files_list = report_translation_target_files(project, lang, k)
+ except Exception as e:
+ if "rate limit" in str(e).lower():
+ response = f"""❌ **GitHub API Rate Limit Exceeded**
+
+{str(e)}
+
+**💡 To fix this:**
+1. Set GitHub Token in Configuration panel above
+2. Click "💾 Save Configuration"
+3. Try "Find Files" again"""
+ history.append(["File search request", response])
+ return history, "", update_status(), gr.Tabs(selected=0), gr.update(choices=[]), gr.update(visible=False)
+ else:
+ raise # Re-raise non-rate-limit errors
+ state.files_to_translate = (
+ [file[0] for file in files_list]
+ if files_list
+ else []
+ )
+
+ response = f"""**✅ File search completed!**
+
+**Status Report:**
+{status_report}
+
+**📁 Found first {len(state.files_to_translate)} files to translate:**
+"""
+
+ if state.files_to_translate:
+ config = get_project_config(state.selected_project)
+ for i, file in enumerate(state.files_to_translate, 1):
+ file_link = f"{config.repo_url}/blob/main/{file}"
+ response += f"\n{i}. [`{file}`]({file_link})"
+
+ # if len(state.files_to_translate) > 5:
+ # response += f"\n... and {len(state.files_to_translate) - 5} more files"
+
+ response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
+ else:
+ response += "\nNo files found that need translation."
+
+ # Add to history
+ history.append(["Please find files that need translation", response])
+ cleared_input = ""
+
+ # Also return the file list to use as the dropdown choices
+ return (
+ history,
+ cleared_input,
+ update_status(),
+ gr.Tabs(), # Don't change tab
+ update_dropdown_choices(state.files_to_translate),
+ )
+
+
+def update_dropdown_choices(file_list):
+ return gr.update(choices=file_list, value=None)
+
+
+def confirm_and_go_translate_handler(history):
+ """Confirm selection and go to translate tab"""
+ global state
+
+ response = f"✅ **Selection confirmed!**\n\n🎯 **Project:** {state.selected_project}\n🌍 **Language:** {state.target_language}\n\n**➡️ Go to Tab 2 to start translation.**"
+ history.append(["Confirm selection", response])
+ return history, "", update_status(), gr.Tabs(selected=1)
+
+
+def confirm_translation_and_go_upload_handler(history):
+ """Confirm translation and go to upload PR tab"""
+ global state
+
+ if not state.current_file_content.get("translated"):
+ response = "❌ No translation available. Please complete translation first."
+ history.append(["Upload PR request", response])
+ return history, "", update_status(), gr.Tabs()
+
+ response = f"✅ **Translation confirmed!**\n\n📄 **File:** `{state.files_to_translate[0] if state.files_to_translate else 'Unknown'}`\n\n**➡️ Go to Tab 3 to upload PR.**"
+ history.append(["Upload PR request", response])
+ return history, "", update_status(), gr.Tabs(selected=2)
+
+
+def start_translation_process(force_retranslate=False):
+    """Translate the first queued file and save the result to disk.
+
+    The translation is stored in ``state.current_file_content`` and written to
+    ``translation_result/<file>`` two directories above this module.
+
+    Args:
+        force_retranslate: Forwarded to ``translate_docs_interactive``.
+
+    Returns:
+        A ``(response, translated)`` pair. ``translated`` is an empty string
+        when there is nothing to translate or when any step raised.
+    """
+    if not state.files_to_translate:
+        return "❌ No files available for translation.", ""
+
+    current_file = state.files_to_translate[0]
+
+    try:
+        status, translated = translate_docs_interactive(
+            state.target_language,
+            [[current_file]],
+            state.additional_instruction,
+            state.selected_project,
+            force_retranslate,
+        )
+        state.current_file_content = {"translated": translated}
+
+        # Keep the single f-string segment: joining ``current_file`` as its own
+        # path component would let an absolute path escape the result folder.
+        output_path = (
+            Path(__file__).resolve().parent.parent
+            / f"translation_result/{current_file}"
+        )
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(translated, encoding="utf-8")
+
+        config = get_project_config(state.selected_project)
+        original_file_link = f"{config.repo_url}/blob/main/{current_file}"
+        print("Completed translation:\n")
+        print(translated)
+        print("----------------------------")
+
+        # An already-existing translation carries its own headline in ``status``.
+        if isinstance(status, str) and "Existing translation loaded" in status:
+            response = f"{status}\n**📄 Original Content Link:** {original_file_link}\n\n**🌐 Translated Content:**"
+        else:
+            response = (
+                f"""🔄 Translation for: `{current_file}`\n"""
+                f"**📄 Original Content Link:** {original_file_link}\n\n"
+                f"{status}\n\n"
+                "**🌐 Translated Content:**"
+            )
+        return response, translated
+
+    except Exception as e:
+        # Top-level UI boundary: report the failure in the chat instead of crashing.
+        response = f"❌ Translation failed: {str(e)}"
+        response += "\n**➡️ Please try from the beginning.**"
+        return response, ""
+
+
+def handle_general_message(message):
+    """Answer a free-form chat message that no workflow step claimed.
+
+    Help-like messages (containing "help", "what" or "how") get the capability
+    overview; "restart" replaces the global state and returns the welcome
+    message; anything else gets the getting-started hint.
+    """
+    global state
+    lowered = message.lower()
+
+    asks_for_help = False
+    for keyword in ("help", "what", "how"):
+        if keyword in lowered:
+            asks_for_help = True
+            break
+
+    if asks_for_help:
+        return """**🤖 I'm your Hugging Face i18n Translation Agent!**
+
+I can help you:
+1. **🔍 Find files** that need translation
+2. **🌐 Translate documents** using AI
+3. **📋 Review translations** for quality
+4. **🚀 Create GitHub PR** for translation
+
+Currently available actions with quick controls:
+- "find files" - Search for files needing translation
+- "translate" - Start translation process
+- "review" - Review current translation
+- "github" - Create GitHub Pull Request
+- "restart" - Start over"""
+
+    if "restart" in lowered:
+        state = ChatState()
+        return get_welcome_message()
+
+    return """I understand you want to work on translations!
+
+**Two ways to get started:**
+
+1. **🔍 Find Files first** - Use Tab 1 to discover files that need translation
+2. **🚀 Direct Translation** - Go to Tab 2 and enter a file path directly (e.g., `docs/source/en/model_doc/bert.md`)
+
+Make sure to configure your API keys in the Configuration panel above.
+"""
+
+
+# Main handler
+def handle_user_message(message, history):
+    """Route a chat message and append the reply to ``history``.
+
+    Blank messages are ignored. While the workflow is in the ``find_files``
+    step, a message containing a start keyword kicks off translation of the
+    queued file; every other message is answered by ``handle_general_message``.
+
+    Returns:
+        ``(history, "")``.
+    """
+    if not message.strip():
+        return history, ""
+
+    lowered = message.lower()
+    start_keywords = ["yes", "proceed", "start", "translate", "translation"]
+    wants_translation = state.step == "find_files" and any(
+        keyword in lowered for keyword in start_keywords
+    )
+
+    if wants_translation and state.files_to_translate:
+        state.step = "translate"
+        reply, translated = start_translation_process()
+        history.append([message, reply])
+        history.append(["", translated])
+        return history, ""
+
+    if wants_translation:
+        reply = (
+            "❌ No files available for translation. Please search for files first."
+        )
+    else:
+        # GitHub PR creation is not handled here; approve_handler is its entry point.
+        reply = handle_general_message(message)
+
+    history.append([message, reply])
+    return history, ""
+
+
+def update_status():
+ """Build the status text for the current workflow state.
+
+ Reads ``state.step``, ``state.selected_project``, ``state.target_language``,
+ ``state.files_to_translate`` and ``state.github_config`` and returns a
+ formatted string.
+ """
+ # NOTE(review): the template lines below that lack a leading "+" look like
+ # their markup was lost when this patch was extracted; confirm against the
+ # real file before relying on the exact text.
+ # The welcome step returns early with a fixed file count of 0.
+ if state.step == "welcome":
+ return f"""
+
+
🔄 Step: Welcome
+
🎯 Project: {state.selected_project}
+
📁 Files: 0
+
🌍 Language: {state.target_language}
+
+ """
+
+ # Display label per step; the "welcome" entry is never reached here because
+ # of the early return above.
+ step_map = {
+ "welcome": "Welcome",
+ "find_files": "Finding Files",
+ "translate": "Translating",
+ "review": "Reviewing",
+ "create_github_pr": "Creating PR",
+ }
+
+ # Progress text per step; unknown steps fall back to 'In progress' below.
+ progress_map = {
+ "welcome": "Ready to start",
+ "find_files": "Files found",
+ "translate": f"{len(state.files_to_translate)} remaining",
+ "review": "Review complete",
+ "create_github_pr": "PR generation in progress",
+ }
+
+ # Check GitHub configuration status
+ # (configured only when token, owner and repo_name are all truthy)
+ github_status = "❌ Not configured"
+ if all(
+ [
+ state.github_config["token"],
+ state.github_config["owner"],
+ state.github_config["repo_name"],
+ ]
+ ):
+ github_status = (
+ f"✅ {state.github_config['owner']}/{state.github_config['repo_name']}"
+ )
+
+ status_html = f"""
+
+
🔄 Step: {step_map.get(state.step, state.step)}
+
🎯 Project: {state.selected_project}
+
📁 Files: {len(state.files_to_translate)}
+
🌍 Language: {state.target_language}
+
⏳ Progress: {progress_map.get(state.step, 'In progress')}
+
🔧 GitHub: {github_status}
+
+ """
+
+ return status_html
+
+
+# Event handlers
+
+
+def sync_language_displays(lang):
+    """Identity pass-through: return the given language value unchanged."""
+    return lang
+
+
+def update_project_selection(project, history):
+    """Record the chosen project in the global state and confirm it in the chat.
+
+    Returns:
+        ``(history, "", status)`` where ``status`` comes from ``update_status()``.
+    """
+    state.selected_project = project
+    history.append(
+        ["Project selection", f"Selection confirmed: 🎯 Project → **{project}**"]
+    )
+    return history, "", update_status()
+
+
+def update_language_selection(lang, history):
+    """Record the chosen target language in the global state and confirm it.
+
+    Returns:
+        ``(history, "", status, lang)``; the language is echoed back as the
+        last element.
+    """
+    state.target_language = lang
+    history.append(
+        ["Language selection", f"Selection confirmed: 🌍 Language → **{lang}**"]
+    )
+    return history, "", update_status(), lang
+
+
+def update_persistent_config(api_provider, anthropic_key, aws_bearer_token_bedrock, github_token, github_owner, github_repo, reference_pr_url, history):
+    """Store API credentials and GitHub settings in the persistent state.
+
+    Only the credential of the selected provider is kept (in
+    ``state.persistent_settings`` and in the process environment); the other
+    provider's credential is cleared. An unknown or empty provider clears both.
+
+    Args:
+        api_provider: ``"Anthropic"``, ``"AWS Bedrock"`` or anything else.
+        anthropic_key: Anthropic API key; ``None`` is treated as empty.
+        aws_bearer_token_bedrock: Bedrock bearer token; ``None`` is treated as empty.
+        github_token: Exported as ``GITHUB_TOKEN`` when truthy.
+        github_owner: Owner stored in the GitHub config.
+        github_repo: Repository name stored in the GitHub config.
+        reference_pr_url: Reference PR; falls back to the selected project's
+            configured default when empty.
+        history: Chat history, mutated in place.
+
+    Returns:
+        ``(history, "", status)`` where ``status`` comes from ``update_status()``.
+    """
+    # os.environ only accepts str, so normalise missing values first.
+    anthropic_key = anthropic_key or ""
+    aws_bearer_token_bedrock = aws_bearer_token_bedrock or ""
+
+    provider_slots = {
+        "Anthropic": ("anthropic_api_key", "ANTHROPIC_API_KEY", anthropic_key),
+        "AWS Bedrock": ("aws_bearer_token_bedrock", "AWS_BEARER_TOKEN_BEDROCK", aws_bearer_token_bedrock),
+    }
+    for provider, (setting_name, env_name, secret) in provider_slots.items():
+        if provider == api_provider:
+            state.persistent_settings[setting_name] = secret
+            os.environ[env_name] = secret
+        else:
+            # Ensure only the selected provider's credential stays active.
+            state.persistent_settings[setting_name] = ""
+            os.environ.pop(env_name, None)
+
+    if github_token:
+        os.environ["GITHUB_TOKEN"] = github_token
+
+    # Get default reference PR URL from project config if not provided
+    if not reference_pr_url and state.selected_project:
+        try:
+            reference_pr_url = get_project_config(state.selected_project).reference_pr_url
+        except Exception as e:
+            # Best effort: a missing default must not block saving the settings.
+            print(f"Could not load default reference PR URL: {e}")
+
+    # Save GitHub configuration to persistent settings
+    state.persistent_settings["github_config"].update({
+        "token": github_token or "",
+        "owner": github_owner or "",
+        "repo_name": github_repo or "",
+        "reference_pr_url": reference_pr_url or "",
+    })
+
+    # Build response message based on what was configured
+    response = "✅ Configuration saved!"
+    if github_owner and github_repo:
+        response += f" GitHub: {github_owner}/{github_repo}"
+
+    if api_provider == "Anthropic" and anthropic_key:
+        response += " Anthropic API key updated."
+    elif api_provider == "AWS Bedrock" and aws_bearer_token_bedrock:
+        response += " AWS Bedrock Bearer Token updated."
+
+    history.append(["Configuration update", response])
+    return history, "", update_status()
+
+
+def update_github_config(token, owner, repo, reference_pr_url):
+    """Legacy entry point: update only the GitHub settings.
+
+    ``update_persistent_config`` takes eight positional arguments, so this
+    wrapper supplies the ones the legacy signature does not have. The stored
+    API credentials are passed back unchanged so that a GitHub-only update
+    does not wipe them, and a fresh list stands in for the chat history.
+
+    Returns:
+        The ``(history, "", status)`` tuple of ``update_persistent_config``.
+    """
+    settings = state.persistent_settings
+    anthropic_key = settings["anthropic_api_key"]
+    aws_token = settings["aws_bearer_token_bedrock"]
+    if anthropic_key:
+        api_provider = "Anthropic"
+    elif aws_token:
+        api_provider = "AWS Bedrock"
+    else:
+        api_provider = ""
+    return update_persistent_config(
+        api_provider, anthropic_key, aws_token, token, owner, repo, reference_pr_url, []
+    )
+
+
+def update_prompt_preview(language, file_path, additional_instruction):
+    """Render the translation prompt for a file, using a truncated sample.
+
+    The file content is fetched for the selected project, preprocessed and cut
+    to its first 500 characters before being placed in the prompt.
+
+    Returns:
+        The prompt text, a placeholder when ``file_path`` is blank, or an
+        error message when fetching or building the prompt raised.
+    """
+    if not file_path.strip():
+        return "Select a file to see the prompt preview..."
+
+    try:
+        translation_lang = "Korean" if language == "ko" else language
+
+        to_translate = preprocess_content(get_content(file_path, state.selected_project))
+
+        # Truncate for preview
+        sample_content = to_translate[:500]
+        if len(to_translate) > 500:
+            sample_content += "..."
+
+        return get_full_prompt(translation_lang, sample_content, additional_instruction)
+    except Exception as e:
+        error_str = str(e)
+        if "Failed to retrieve content from the URL" not in error_str:
+            return f"Error generating prompt preview: {error_str}"
+        return f"❌ **File not found:** `{file_path}`\n\n💡 **Please check:**\n1. Is this file in the **{state.selected_project}** project?\n2. Use \"🔍 Find Files to Translate\" to see available files\n3. Verify the file path is correct"
+
+
+def send_message(message, history):
+    """Process a chat message and return the history, cleared input and status."""
+    updated_history, input_value = handle_user_message(message, history)
+    status = update_status()
+    return updated_history, input_value, status
+
+
+# Button handlers with tab switching
+def start_translate_handler(history, file_to_translate, additional_instruction="", force_retranslate=False):
+    """Validate the configuration, translate the given file and update the UI.
+
+    Args:
+        history: Chat history, mutated in place.
+        file_to_translate: Path of the file to translate.
+        additional_instruction: Extra instruction stored on the state for the run.
+        force_retranslate: Forwarded to ``start_translation_process``.
+
+    Returns:
+        A 6-tuple: history, an empty string, the status text, a ``gr.Tabs``
+        update, an update for the start button and an update for the confirm
+        button.
+    """
+    def _reject(reason):
+        # Report a precondition failure without touching the buttons.
+        history.append(["Translation request", reason])
+        return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
+
+    # Use persistent credentials
+    anthropic_key = state.persistent_settings["anthropic_api_key"]
+    aws_bearer_token_bedrock = state.persistent_settings["aws_bearer_token_bedrock"]
+
+    if not anthropic_key and not aws_bearer_token_bedrock:
+        return _reject("❌ Please set either Anthropic API key or AWS Bearer Token for Bedrock in Configuration panel first.")
+
+    # Export exactly one credential so only one provider is active.
+    if anthropic_key:
+        os.environ["ANTHROPIC_API_KEY"] = anthropic_key
+        os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
+    elif aws_bearer_token_bedrock:
+        os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
+        os.environ.pop("ANTHROPIC_API_KEY", None)
+
+    if not file_to_translate or not file_to_translate.strip():
+        return _reject("❌ Please select a file from the dropdown or enter a file path to translate.")
+
+    state.additional_instruction = additional_instruction
+    state.files_to_translate = [file_to_translate]
+    state.step = "translate"
+
+    # Start translation directly
+    if force_retranslate:
+        history.append(["Translation request", "🔄 **Force retranslation started...**"])
+    response, translated = start_translation_process(force_retranslate)
+    history.append(["", response])
+    if translated:
+        history.append(["", translated])
+
+    # ``.get`` matches the other handlers and avoids a KeyError when no
+    # "translated" entry has been stored.
+    has_translation = bool(state.current_file_content.get("translated"))
+    start_btn_text = "🔄 Retranslation" if has_translation else "🚀 Start Translation"
+
+    return history, "", update_status(), gr.Tabs(), gr.update(value=start_btn_text), gr.update(visible=has_translation)
+
+
+def approve_handler(history, owner, repo, reference_pr_url):
+    """Handle the request to generate a GitHub PR for the current translation.
+
+    Steps: validate the GitHub settings, resolve a reference PR URL (asking
+    the agent to search for one when none is configured), then generate the
+    PR for the first queued file.
+
+    Args:
+        history: Chat history, mutated in place.
+        owner: GitHub owner; must be non-empty.
+        repo: Repository name; must be non-empty.
+        reference_pr_url: Optional reference PR; overrides the stored one when given.
+
+    Returns:
+        ``(history, "", status)`` where ``status`` comes from ``update_status()``.
+    """
+    state.step = "create_github_pr"
+
+    # Check all required GitHub configuration at once
+    github_config = state.persistent_settings["github_config"]
+    missing_config = [
+        label
+        for label, value in (
+            ("GitHub Token", github_config.get("token")),
+            ("GitHub Owner", owner),
+            ("Repository Name", repo),
+        )
+        if not value
+    ]
+
+    if missing_config:
+        config = get_project_config(state.selected_project)
+        repo_name = config.repo_url.split('/')[-1]  # Extract repo name from URL
+        response = f"❌ Please set the following in Configuration panel first: {', '.join(missing_config)}\n\n💡 **Note:** GitHub Owner/Repository should be your fork of [`{repo_name}`]({config.repo_url}) (e.g., Owner: `your-username`, Repository: `{repo_name}`)"
+        history.append(["GitHub PR creation request", response])
+        return history, "", update_status()
+
+    # Update reference PR URL (can be set per PR)
+    if reference_pr_url:
+        github_config["reference_pr_url"] = reference_pr_url
+
+    response = ""
+
+    # If reference PR is not provided, use the agent to find one
+    if not github_config.get("reference_pr_url"):
+        response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
+        try:
+            stream_gen = find_reference_pr_simple_stream(
+                target_language=state.target_language,
+                context="documentation translation",
+            )
+            # Drain the generator: the streamed messages are ignored, only the
+            # value carried by StopIteration matters.
+            final_result = None
+            try:
+                while True:
+                    next(stream_gen)
+            except StopIteration as e:
+                final_result = e.value
+
+            # A generator that ends without returning a value leaves None
+            # here; treat it as an empty result so the failure reason below
+            # stays readable instead of surfacing an AttributeError.
+            final_result = final_result or {}
+
+            if final_result.get("status") != "success":
+                error_message = final_result.get("message") or final_result.get(
+                    "result", "Unknown error"
+                )
+                raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")
+
+            match = re.search(r"https://github.com/[^\s]+", final_result.get("result", ""))
+            if not match:
+                raise ValueError(
+                    "Could not extract a valid PR URL from agent's response."
+                )
+            found_url = match.group(0)
+            state.github_config["reference_pr_url"] = found_url
+            response += f"\n✅ **Agent found a reference PR:** {found_url}"
+        except Exception as e:
+            response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
+            history.append(["Agent searching for PR", response])
+            return history, "", update_status()
+
+    # Proceed with PR generation
+    if state.files_to_translate and state.current_file_content.get("translated"):
+        current_file = state.files_to_translate[0]
+        translated_content = state.current_file_content["translated"]
+        response += "\n\n🚀 **Generating GitHub PR...**"
+
+        # Derive a title from the file name for the toctree mapping
+        file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
+        print(file_name)
+
+        pr_response = generate_github_pr(
+            target_language=state.target_language,
+            filepath=current_file,
+            translated_content=translated_content,
+            github_config=state.github_config,
+            en_title=file_name,
+            project=state.selected_project,
+        )
+        response += f"\n{pr_response}"
+    else:
+        response = "❌ No translated file available. Please complete the translation process first."
+
+    history.append(["GitHub PR creation request", response])
+    return history, "", update_status()
+
+
+def restart_handler(history):
+    """Reset the workflow to a fresh ``ChatState`` while keeping the settings.
+
+    The persistent settings are carried over to the new state and their
+    non-empty credentials are re-exported to the environment.
+
+    Returns:
+        A 4-tuple: a new history holding only the welcome message, an empty
+        string, the status text and a ``gr.Tabs`` update selecting tab 0.
+    """
+    global state
+    preserved = state.persistent_settings.copy()
+
+    state = ChatState()
+    state.persistent_settings = preserved
+
+    env_values = (
+        ("ANTHROPIC_API_KEY", preserved["anthropic_api_key"]),
+        ("AWS_BEARER_TOKEN_BEDROCK", preserved["aws_bearer_token_bedrock"]),
+        ("GITHUB_TOKEN", preserved["github_config"]["token"]),
+    )
+    for env_name, value in env_values:
+        if value:
+            os.environ[env_name] = value
+
+    fresh_history = [[None, get_welcome_message()]]
+    return fresh_history, "", update_status(), gr.Tabs(selected=0)
diff --git a/agent/toctree_handler.py b/agent/toctree_handler.py
index 59715d3..e15a814 100644
--- a/agent/toctree_handler.py
+++ b/agent/toctree_handler.py
@@ -1,419 +1,419 @@
-import yaml
-import requests
-from typing import Dict, List, Any
-import os
-
-class TocTreeHandler:
- def __init__(self, project: str = "transformers"):
- from translator.project_config import get_project_config
- self.project = project
- self.project_config = get_project_config(project)
-
- # Extract repository path from config
- repo_path = self.project_config.repo_url.replace("https://github.com/", "")
-
- # Build project-specific URLs
- self.en_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/en/_toctree.yml"
- self.ko_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/ko/_toctree.yml"
- self.local_docs_path = "docs/source/ko"
-
- def fetch_toctree(self, url: str) -> Dict[str, Any]:
- """Fetch and parse YAML from URL"""
- response = requests.get(url)
- response.raise_for_status()
- return yaml.safe_load(response.text)
-
- def get_en_toctree(self) -> Dict[str, Any]:
- """Get English toctree structure"""
- return self.fetch_toctree(self.en_toctree_url)
-
- def get_ko_toctree(self) -> Dict[str, Any]:
- """Get Korean toctree structure"""
- return self.fetch_toctree(self.ko_toctree_url)
-
- def extract_title_mappings(self, en_data: List[Dict], ko_data: List[Dict]) -> Dict[str, str]:
- """Extract title mappings between English and Korean"""
- mappings = {}
-
- def process_section(en_section: Dict, ko_section: Dict):
- if 'local' in en_section and 'local' in ko_section:
- if en_section['local'] == ko_section['local']:
- en_title = en_section.get('title', '')
- ko_title = ko_section.get('title', '')
- if en_title and ko_title:
- mappings[en_title] = ko_title
-
- if 'sections' in en_section and 'sections' in ko_section:
- en_sections = en_section['sections']
- ko_sections = ko_section['sections']
-
- for i, en_sub in enumerate(en_sections):
- if i < len(ko_sections):
- process_section(en_sub, ko_sections[i])
-
- for i, en_item in enumerate(en_data):
- if i < len(ko_data):
- process_section(en_item, ko_data[i])
-
- return mappings
-
- def translate_title(self, en_title: str) -> str:
- """Translate English title to Korean using LLM"""
- try:
- from translator.content import llm_translate
-
- prompt = f"""Translate the following English documentation title to Korean. Return only the translated title, nothing else.
-
-English title: {en_title}
-
-Korean title:"""
-
- callback_result, translated_title = llm_translate(prompt)
- return translated_title.strip()
- except Exception as e:
- print(f"Error translating title '{en_title}': {e}")
- return en_title
-
- def create_local_toctree(self, en_title: str, local_file_path: str) -> Dict[str, str]:
- """Create local toctree entry with Korean title and local path"""
- try:
- # First try to get Korean title from existing mappings
- en_data = self.get_en_toctree()
- ko_data = self.get_ko_toctree()
-
- title_mappings = self.extract_title_mappings(en_data, ko_data)
- ko_title = title_mappings.get(en_title)
-
- # If no existing mapping, translate the title
- if not ko_title:
- ko_title = self.translate_title(en_title)
-
- return {
- 'local': local_file_path,
- 'title': ko_title
- }
- except Exception as e:
- print(f"Error creating local toctree: {e}")
- return {
- 'local': local_file_path,
- 'title': en_title
- }
-
- def find_and_update_translation_entry(self, ko_toctree_data, target_local: str, english_title: str, korean_title: str):
- """Find entry with '(번역중) 영어제목' and update it"""
- target_title_pattern = f"(번역중) {english_title}"
-
- def process_item(item):
- if isinstance(item, dict):
- # Check if title matches the pattern
- if item.get('title') == target_title_pattern:
- # Update local path and title
- item['local'] = target_local
- item['title'] = korean_title
- return True
-
- # Process sections recursively
- if 'sections' in item:
- for section in item['sections']:
- if process_item(section):
- return True
- return False
-
- # Process the toctree data
- if isinstance(ko_toctree_data, list):
- for item in ko_toctree_data:
- if process_item(item):
- return True
- return False
-
- def create_updated_toctree_with_replacement(self, ko_toctree: list, target_local: str) -> list:
- """Update Korean toctree by finding and updating translation entry"""
- try:
- # Step 1: Get English toctree and find the English title for target_local
- en_toctree = self.get_en_toctree()
- english_title = self.find_title_for_local(en_toctree, target_local)
-
- if not english_title:
- print(f"⚠️ Toctree entry not found: '{target_local}' not in English toctree")
- print(f"🔍 Attempting to find appropriate section for new entry...")
- # Try to add new entry in appropriate location
- return self.add_new_toctree_entry(ko_toctree, target_local)
-
- print(f"Found English title: {english_title} for local: {target_local}")
-
- # Step 2: Translate the English title to Korean
- korean_title = self.translate_title(english_title)
- print(f"Translated Korean title: {korean_title}")
-
- # Step 3: Make a deep copy to avoid modifying original
- import copy
- updated_toctree = copy.deepcopy(ko_toctree)
-
- # Step 4: Find and update the "(번역중) 영어제목" entry
- updated = self.find_and_update_translation_entry(
- updated_toctree, target_local, english_title, korean_title
- )
-
- if updated:
- print(f"✅ Successfully updated translation entry: local={target_local}, title={korean_title}")
- return updated_toctree
- else:
- print(f"⚠️ Toctree update skipped: '(번역중) {english_title}' entry not found")
- print(f"📋 This may be a new file not yet added to Korean toctree")
- return ko_toctree
-
- except Exception as e:
- print(f"Error creating updated toctree: {e}")
- return ko_toctree
-
- def find_title_for_local(self, toctree_data, target_local: str):
- """Find title for given local path in toctree"""
- def search_item(item):
- if isinstance(item, dict):
- if item.get('local') == target_local:
- return item.get('title', '')
-
- if 'sections' in item:
- for section in item['sections']:
- result = search_item(section)
- if result:
- return result
- return None
-
- if isinstance(toctree_data, list):
- for item in toctree_data:
- result = search_item(item)
- if result:
- return result
- return None
-
- def process_pr_commit(self, filepath: str):
- """Process PR commit by updating Korean toctree with translated entry"""
- # Get filepath without prefix
- filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
-
- # Get Korean toctree
- ko_toctree = self.get_ko_toctree()
-
- # Use diff-merge algorithm to add new entry
- updated_ko_toctree = self.add_new_toctree_entry(ko_toctree, filepath_without_prefix)
-
- if not updated_ko_toctree:
- print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
- return
-
- print(f"Successfully updated Korean toctree")
-
- # Store the updated toctree for commit
- self.updated_ko_toctree = updated_ko_toctree
-
- def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
- """Commit and push toctree updates as a separate commit"""
- try:
- # Use the updated toctree created by LLM
- if not hasattr(self, 'updated_ko_toctree') or not self.updated_ko_toctree:
- print("No updated Korean toctree available")
- return {"status": "error", "message": "No updated toctree to commit"}
-
- ko_data = self.updated_ko_toctree
-
- # Convert to YAML string
- toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
-
- # Create toctree commit message
- commit_message = "docs: update Korean documentation table of contents"
-
- # Commit toctree file
- file_result = pr_agent.create_or_update_file(
- owner=owner,
- repo_name=repo_name,
- path="docs/source/ko/_toctree.yml",
- message=commit_message,
- content=toctree_content,
- branch_name=branch_name
- )
-
- if file_result.startswith("SUCCESS"):
- return {
- "status": "success",
- "message": f"Toctree committed successfully: {file_result}",
- "commit_message": commit_message
- }
- else:
- return {
- "status": "error",
- "message": f"Toctree commit failed: {file_result}"
- }
-
- except Exception as e:
- return {
- "status": "error",
- "message": f"Error committing toctree: {str(e)}"
- }
-
- def update_toctree_after_translation(
- self,
- translation_result: dict,
- filepath: str,
- pr_agent,
- github_config: dict,
- project: str = "transformers"
- ) -> dict:
- """Update toctree after successful translation PR.
-
- Args:
- translation_result: Result from translation PR workflow
- filepath: Original file path
- pr_agent: GitHub PR agent instance
- github_config: GitHub configuration dictionary
-
- Returns:
- Dictionary with toctree update result
- """
- if translation_result["status"] == "error":
- return None
-
- try:
- # Process toctree update with LLM
- self.process_pr_commit(filepath)
- # Commit toctree as separate commit
- if self.updated_ko_toctree:
- return self.commit_and_push_toctree(
- pr_agent=pr_agent,
- owner=github_config["owner"],
- repo_name=github_config["repo_name"],
- branch_name=translation_result["branch"]
- )
-
- except Exception as e:
- return {
- "status": "error",
- "message": f"Error updating toctree: {str(e)}"
- }
-
- def add_new_toctree_entry(self, ko_toctree: list, target_local: str) -> list:
- """Add new toctree entry using diff-merge algorithm"""
- try:
- import copy
- updated_toctree = copy.deepcopy(ko_toctree)
-
- # Generate new entry
- filename = target_local.split('/')[-1].replace('_', ' ').title()
- korean_title = self.translate_title(filename)
- new_entry = {
- 'local': target_local,
- 'title': korean_title
- }
-
- # Get English toctree for structure reference
- en_toctree = self.get_en_toctree()
-
- # Use diff-merge algorithm
- if self.merge_toctree_sections(en_toctree, updated_toctree, target_local, new_entry):
- return updated_toctree
- else:
- # Fallback: add to root level
- updated_toctree.append(new_entry)
- print(f"✅ Added new entry at root level: {target_local} -> {korean_title}")
- return updated_toctree
-
- except Exception as e:
- print(f"❌ Error adding new toctree entry: {e}")
- return ko_toctree
-
- def merge_toctree_sections(self, en_toctree: list, ko_toctree: list, target_local: str, new_entry: dict) -> bool:
- """Merge English toctree structure into Korean toctree for target_local"""
- for en_section in en_toctree:
- en_title = en_section.get('title')
-
- # Check if this English section contains our target
- if self.contains_target(en_section, target_local):
- # Find matching Korean section
- ko_section = self.find_matching_section(ko_toctree, en_title)
-
- if ko_section:
- # Section exists - merge subsections
- return self.merge_subsections(en_section, ko_section, target_local, new_entry)
- else:
- # Section doesn't exist - create new section
- new_ko_section = self.create_section_with_order(en_section, target_local, new_entry)
- ko_toctree.append(new_ko_section)
- print(f"✅ Created new section '{new_ko_section.get('title')}' with ordered structure")
- return True
- return False
-
- def contains_target(self, section: dict, target_local: str) -> bool:
- """Check if section contains target_local recursively"""
- if 'sections' in section:
- for subsection in section['sections']:
- if subsection.get('local') == target_local:
- return True
- if self.contains_target(subsection, target_local):
- return True
- return False
-
- def find_matching_section(self, ko_toctree: list, en_title: str) -> dict:
- """Find Korean section that matches English title"""
- # Try exact match first
- for item in ko_toctree:
- if item.get('title') == en_title:
- return item
-
- # Try translated title match
- try:
- translated_title = self.translate_title(en_title)
- for item in ko_toctree:
- if item.get('title') == translated_title:
- return item
- except:
- pass
-
- return None
-
- def merge_subsections(self, en_section: dict, ko_section: dict, target_local: str, new_entry: dict) -> bool:
- """Merge subsections while maintaining order"""
- if 'sections' not in en_section:
- return False
-
- # Find target index in English sections
- target_index = -1
- for i, en_subsection in enumerate(en_section['sections']):
- if en_subsection.get('local') == target_local:
- target_index = i
- break
-
- if target_index == -1:
- return False
-
- # Ensure Korean section has sections array
- if 'sections' not in ko_section:
- ko_section['sections'] = []
-
- # Insert at correct position
- self.insert_at_correct_position(ko_section, target_index, new_entry)
- return True
-
- def insert_at_correct_position(self, ko_section: dict, target_index: int, new_entry: dict):
- """Insert entry at correct position, expanding array if needed"""
- sections = ko_section['sections']
-
- # Expand sections array if needed
- while len(sections) <= target_index:
- sections.append(None) # Placeholder
-
- # Insert new entry
- sections[target_index] = new_entry
-
- # Clean up None placeholders at the end
- while sections and sections[-1] is None:
- sections.pop()
-
- def create_section_with_order(self, en_section: dict, target_local: str, new_entry: dict) -> dict:
- """Create new Korean section with only the translated entry"""
- new_ko_section = {
- 'title': self.translate_title(en_section.get('title')),
- 'isExpanded': en_section.get('isExpanded', False),
- 'sections': [new_entry] # Only add the translated entry
- }
-
- return new_ko_section
+import yaml
+import requests
+from typing import Dict, List, Any
+import os
+
+class TocTreeHandler:
+    def __init__(self, project: str = "transformers"):
+        """Load the project config and derive the toctree URLs from its repo URL.
+
+        Args:
+            project: Project key passed to ``get_project_config``.
+        """
+        from translator.project_config import get_project_config
+
+        config = get_project_config(project)
+        self.project = project
+        self.project_config = config
+
+        # "owner/name" part of the GitHub repository URL
+        repo_path = config.repo_url.replace("https://github.com/", "")
+
+        # Raw-content locations of the English and Korean tables of contents
+        self.en_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/en/_toctree.yml"
+        self.ko_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/ko/_toctree.yml"
+        self.local_docs_path = "docs/source/ko"
+
+    def fetch_toctree(self, url: str, timeout: float = 30.0) -> Dict[str, Any]:
+        """Download a toctree YAML document and parse it.
+
+        Args:
+            url: Location of the ``_toctree.yml`` file.
+            timeout: Seconds to wait for the server. Without a timeout,
+                ``requests.get`` can block indefinitely on a stalled connection.
+
+        Returns:
+            The parsed YAML. Other methods of this class iterate the result
+            as a list of section dictionaries.
+
+        Raises:
+            requests.HTTPError: If the server answers with an error status.
+            requests.Timeout: If the server does not respond within ``timeout``.
+        """
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        return yaml.safe_load(response.text)
+
+    def get_en_toctree(self) -> Dict[str, Any]:
+        """Fetch and parse the English toctree from ``self.en_toctree_url``."""
+        english_url = self.en_toctree_url
+        return self.fetch_toctree(english_url)
+
+    def get_ko_toctree(self) -> Dict[str, Any]:
+        """Fetch and parse the Korean toctree from ``self.ko_toctree_url``."""
+        korean_url = self.ko_toctree_url
+        return self.fetch_toctree(korean_url)
+
+    def extract_title_mappings(self, en_data: List[Dict], ko_data: List[Dict]) -> Dict[str, str]:
+        """Map English titles to Korean titles by walking both trees in lockstep.
+
+        Entries are paired by position. A pair contributes a mapping only when
+        both entries have the same ``local`` value and both titles are
+        non-empty. Nested ``sections`` are paired the same way.
+        """
+        title_map: Dict[str, str] = {}
+
+        def walk(en_node: Dict, ko_node: Dict) -> None:
+            same_page = (
+                'local' in en_node
+                and 'local' in ko_node
+                and en_node['local'] == ko_node['local']
+            )
+            if same_page:
+                en_title = en_node.get('title', '')
+                ko_title = ko_node.get('title', '')
+                if en_title and ko_title:
+                    title_map[en_title] = ko_title
+
+            if 'sections' in en_node and 'sections' in ko_node:
+                # zip stops at the shorter list, so unmatched tails are skipped
+                for en_child, ko_child in zip(en_node['sections'], ko_node['sections']):
+                    walk(en_child, ko_child)
+
+        for en_top, ko_top in zip(en_data, ko_data):
+            walk(en_top, ko_top)
+
+        return title_map
+
+    def translate_title(self, en_title: str) -> str:
+        """Translate an English documentation title to Korean with the LLM.
+
+        Returns the stripped second element of the ``llm_translate`` result.
+        On any exception the error is printed and ``en_title`` is returned
+        unchanged.
+        """
+        try:
+            from translator.content import llm_translate
+
+            prompt = f"""Translate the following English documentation title to Korean. Return only the translated title, nothing else.
+
+English title: {en_title}
+
+Korean title:"""
+
+            llm_output = llm_translate(prompt)
+            # llm_translate returns a pair; only its second element is used
+            _, korean_title = llm_output
+            return korean_title.strip()
+        except Exception as e:
+            print(f"Error translating title '{en_title}': {e}")
+            return en_title
+
+    def create_local_toctree(self, en_title: str, local_file_path: str) -> Dict[str, str]:
+        """Build a toctree entry with a Korean title for ``local_file_path``.
+
+        The Korean title is taken from the existing English/Korean toctree
+        pairing when available, otherwise from ``translate_title``. If any
+        step raises, the entry falls back to the English title.
+        """
+        try:
+            known_titles = self.extract_title_mappings(
+                self.get_en_toctree(), self.get_ko_toctree()
+            )
+            title = known_titles.get(en_title)
+
+            # No existing mapping: ask for a translation instead
+            if not title:
+                title = self.translate_title(en_title)
+        except Exception as e:
+            print(f"Error creating local toctree: {e}")
+            title = en_title
+
+        return {
+            'local': local_file_path,
+            'title': title
+        }
+
+    def find_and_update_translation_entry(self, ko_toctree_data, target_local: str, english_title: str, korean_title: str):
+        """Replace the first "(번역중) <english title>" placeholder entry.
+
+        "(번역중)" means "translation in progress". The tree is searched
+        depth-first; the first dictionary whose title equals the placeholder
+        gets ``target_local`` and ``korean_title`` written into it (in place).
+
+        Returns:
+            True when an entry was updated, False otherwise (including when
+            ``ko_toctree_data`` is not a list).
+        """
+        placeholder_title = f"(번역중) {english_title}"
+
+        def update_node(node):
+            if not isinstance(node, dict):
+                return False
+            if node.get('title') == placeholder_title:
+                node['local'] = target_local
+                node['title'] = korean_title
+                return True
+            if 'sections' not in node:
+                return False
+            # any() stops at the first child that reports an update
+            return any(update_node(child) for child in node['sections'])
+
+        if not isinstance(ko_toctree_data, list):
+            return False
+        return any(update_node(entry) for entry in ko_toctree_data)
+
+ def create_updated_toctree_with_replacement(self, ko_toctree: list, target_local: str) -> list:
+ """Return a Korean toctree in which the placeholder for `target_local` is replaced.
+
+ The English title for `target_local` is looked up in the English
+ toctree, translated, and used to rewrite the matching
+ '(번역중) <English title>' entry in a deep copy of `ko_toctree`.
+
+ Args:
+ ko_toctree: Current Korean toctree; not modified by this method.
+ target_local: 'local' path of the translated page.
+
+ Returns:
+ The updated copy when a placeholder was rewritten; the result of
+ `add_new_toctree_entry` when `target_local` has no English title;
+ otherwise (no placeholder found, or any exception) the original
+ `ko_toctree` object.
+ """
+ try:
+ # Step 1: Get English toctree and find the English title for target_local
+ en_toctree = self.get_en_toctree()
+ english_title = self.find_title_for_local(en_toctree, target_local)
+
+ if not english_title:
+ print(f"⚠️ Toctree entry not found: '{target_local}' not in English toctree")
+ print(f"🔍 Attempting to find appropriate section for new entry...")
+ # Try to add new entry in appropriate location
+ return self.add_new_toctree_entry(ko_toctree, target_local)
+
+ print(f"Found English title: {english_title} for local: {target_local}")
+
+ # Step 2: Translate the English title to Korean
+ korean_title = self.translate_title(english_title)
+ print(f"Translated Korean title: {korean_title}")
+
+ # Step 3: Make a deep copy to avoid modifying original
+ import copy
+ updated_toctree = copy.deepcopy(ko_toctree)
+
+ # Step 4: Find and update the "(번역중) <English title>" placeholder entry
+ updated = self.find_and_update_translation_entry(
+ updated_toctree, target_local, english_title, korean_title
+ )
+
+ if updated:
+ print(f"✅ Successfully updated translation entry: local={target_local}, title={korean_title}")
+ return updated_toctree
+ else:
+ # Nothing was changed, so hand back the caller's original object
+ print(f"⚠️ Toctree update skipped: '(번역중) {english_title}' entry not found")
+ print(f"📋 This may be a new file not yet added to Korean toctree")
+ return ko_toctree
+
+ except Exception as e:
+ print(f"Error creating updated toctree: {e}")
+ return ko_toctree
+
+ def find_title_for_local(self, toctree_data, target_local: str):
+ """Return the title of the first toctree entry whose 'local' equals `target_local`.
+
+ Entries are searched depth-first through nested 'sections'. Results are
+ truth-tested, so an entry with a missing or empty title does not end the
+ search.
+
+ Args:
+ toctree_data: Toctree to search; anything other than a list yields None.
+ target_local: 'local' path to look for.
+
+ Returns:
+ The first non-empty title found while scanning top-level items, or
+ None when there is none.
+ """
+ def search_item(item):
+ if isinstance(item, dict):
+ if item.get('local') == target_local:
+ return item.get('title', '')
+
+ if 'sections' in item:
+ for section in item['sections']:
+ result = search_item(section)
+ if result:
+ return result
+ return None
+
+ if isinstance(toctree_data, list):
+ for item in toctree_data:
+ result = search_item(item)
+ if result:
+ return result
+ return None
+
+ def process_pr_commit(self, filepath: str):
+ """Compute the updated Korean toctree for `filepath` and store it on the instance.
+
+ Args:
+ filepath: Source document path, e.g. one under "docs/source/en/"
+ ending in ".md".
+
+ Returns:
+ None. On success the result is stored in `self.updated_ko_toctree`;
+ when `add_new_toctree_entry` returns a falsy value the attribute is
+ left untouched.
+ """
+ # Get filepath without prefix
+ # NOTE: str.replace removes every occurrence of both substrings, not only a leading prefix / trailing suffix.
+ filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
+
+ # Get Korean toctree
+ ko_toctree = self.get_ko_toctree()
+
+ # Use diff-merge algorithm to add new entry
+ updated_ko_toctree = self.add_new_toctree_entry(ko_toctree, filepath_without_prefix)
+
+ if not updated_ko_toctree:
+ print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
+ return
+
+ print(f"Successfully updated Korean toctree")
+
+ # Store the updated toctree for commit
+ self.updated_ko_toctree = updated_ko_toctree
+
+ def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
+ """Commit the stored Korean toctree to "docs/source/ko/_toctree.yml" as its own commit.
+
+ Args:
+ pr_agent: Object providing `create_or_update_file(...)`; its return
+ value is treated as a string that starts with "SUCCESS" on success.
+ owner: Repository owner passed through to `pr_agent`.
+ repo_name: Repository name passed through to `pr_agent`.
+ branch_name: Branch that receives the commit.
+
+ Returns:
+ A dict with 'status' ("success" or "error") and 'message'; on
+ success it also carries 'commit_message'. Exceptions are converted
+ into an "error" dict rather than propagated.
+ """
+ try:
+ # Use the updated toctree stored on the instance by process_pr_commit
+ if not hasattr(self, 'updated_ko_toctree') or not self.updated_ko_toctree:
+ print("No updated Korean toctree available")
+ return {"status": "error", "message": "No updated toctree to commit"}
+
+ ko_data = self.updated_ko_toctree
+
+ # Convert to YAML string
+ # allow_unicode keeps Korean titles readable; sort_keys=False keeps the existing key order
+ toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
+
+ # Create toctree commit message
+ commit_message = "docs: update Korean documentation table of contents"
+
+ # Commit toctree file
+ file_result = pr_agent.create_or_update_file(
+ owner=owner,
+ repo_name=repo_name,
+ path="docs/source/ko/_toctree.yml",
+ message=commit_message,
+ content=toctree_content,
+ branch_name=branch_name
+ )
+
+ if file_result.startswith("SUCCESS"):
+ return {
+ "status": "success",
+ "message": f"Toctree committed successfully: {file_result}",
+ "commit_message": commit_message
+ }
+ else:
+ return {
+ "status": "error",
+ "message": f"Toctree commit failed: {file_result}"
+ }
+
+ except Exception as e:
+ return {
+ "status": "error",
+ "message": f"Error committing toctree: {str(e)}"
+ }
+
+ def update_toctree_after_translation(
+ self,
+ translation_result: dict,
+ filepath: str,
+ pr_agent,
+ github_config: dict,
+ project: str = "transformers"
+ ) -> dict:
+ """Update toctree after successful translation PR.
+
+ Args:
+ translation_result: Result from translation PR workflow; its
+ "status" and "branch" keys are read here
+ filepath: Original file path
+ pr_agent: GitHub PR agent instance
+ github_config: GitHub configuration dictionary; "owner" and
+ "repo_name" are read here
+ project: Project name; not used in this method body
+
+ Returns:
+ Dictionary with toctree update result. None is returned when the
+ translation result has status "error", and also when no updated
+ toctree is available after processing.
+ """
+ # Nothing to update when the translation PR itself failed
+ if translation_result["status"] == "error":
+ return None
+
+ try:
+ # Compute the updated toctree; on success it is stored on self.updated_ko_toctree
+ self.process_pr_commit(filepath)
+ # Commit toctree as separate commit
+ # NOTE(review): if process_pr_commit returned early the attribute may be
+ # unset; the resulting AttributeError would be reported by the except below.
+ if self.updated_ko_toctree:
+ return self.commit_and_push_toctree(
+ pr_agent=pr_agent,
+ owner=github_config["owner"],
+ repo_name=github_config["repo_name"],
+ branch_name=translation_result["branch"]
+ )
+
+ except Exception as e:
+ return {
+ "status": "error",
+ "message": f"Error updating toctree: {str(e)}"
+ }
+
+ def add_new_toctree_entry(self, ko_toctree: list, target_local: str) -> list:
+ """Add new toctree entry using diff-merge algorithm
+
+ A deep copy of `ko_toctree` receives a new `{'local', 'title'}` entry.
+ The entry is placed by `merge_toctree_sections` using the English toctree
+ as the structural reference; when that reports no placement, the entry is
+ appended at the root level.
+
+ Args:
+ ko_toctree: Current Korean toctree; not modified by this method.
+ target_local: 'local' path of the new entry.
+
+ Returns:
+ The updated copy, or the original `ko_toctree` if anything raises.
+ """
+ try:
+ import copy
+ updated_toctree = copy.deepcopy(ko_toctree)
+
+ # Generate new entry
+ # The title is derived from the last path component (underscores to
+ # spaces, title-cased) and then translated.
+ filename = target_local.split('/')[-1].replace('_', ' ').title()
+ korean_title = self.translate_title(filename)
+ new_entry = {
+ 'local': target_local,
+ 'title': korean_title
+ }
+
+ # Get English toctree for structure reference
+ en_toctree = self.get_en_toctree()
+
+ # Use diff-merge algorithm
+ if self.merge_toctree_sections(en_toctree, updated_toctree, target_local, new_entry):
+ return updated_toctree
+ else:
+ # Fallback: add to root level
+ updated_toctree.append(new_entry)
+ print(f"✅ Added new entry at root level: {target_local} -> {korean_title}")
+ return updated_toctree
+
+ except Exception as e:
+ print(f"❌ Error adding new toctree entry: {e}")
+ return ko_toctree
+
+ def merge_toctree_sections(self, en_toctree: list, ko_toctree: list, target_local: str, new_entry: dict) -> bool:
+ """Merge English toctree structure into Korean toctree for target_local
+
+ Only the first top-level English section that contains `target_local`
+ is considered.
+
+ Args:
+ en_toctree: English toctree used as the structural reference.
+ ko_toctree: Korean toctree; mutated in place.
+ target_local: 'local' path of the page being added.
+ new_entry: Entry to place in the Korean toctree.
+
+ Returns:
+ The result of `merge_subsections` when a matching Korean section
+ exists, True when a new section was created and appended, and False
+ when no English section contains `target_local`.
+ """
+ for en_section in en_toctree:
+ en_title = en_section.get('title')
+
+ # Check if this English section contains our target
+ if self.contains_target(en_section, target_local):
+ # Find matching Korean section
+ ko_section = self.find_matching_section(ko_toctree, en_title)
+
+ if ko_section:
+ # Section exists - merge subsections
+ return self.merge_subsections(en_section, ko_section, target_local, new_entry)
+ else:
+ # Section doesn't exist - create new section
+ new_ko_section = self.create_section_with_order(en_section, target_local, new_entry)
+ ko_toctree.append(new_ko_section)
+ print(f"✅ Created new section '{new_ko_section.get('title')}' with ordered structure")
+ return True
+ return False
+
+ def contains_target(self, section: dict, target_local: str) -> bool:
+ """Check if section contains target_local recursively
+
+ Only descendants under 'sections' are examined; the 'local' of `section`
+ itself is not compared.
+
+ Returns:
+ True if any descendant entry has 'local' equal to `target_local`.
+ """
+ if 'sections' in section:
+ for subsection in section['sections']:
+ if subsection.get('local') == target_local:
+ return True
+ if self.contains_target(subsection, target_local):
+ return True
+ return False
+
+ def find_matching_section(self, ko_toctree: list, en_title: str) -> dict:
+     """Find the top-level Korean toctree item that matches an English section title.
+
+     The untranslated title is compared first; if that fails, the title is
+     translated with `translate_title` and compared again. Only top-level
+     items of `ko_toctree` are inspected, and non-dict items are skipped.
+
+     Args:
+         ko_toctree: Korean toctree (list of section dicts).
+         en_title: Title of the English section to match.
+
+     Returns:
+         The matching item (the same object held by `ko_toctree`), or None
+         when nothing matches, when `en_title` is empty, or when the
+         translation attempt fails.
+     """
+     candidates = [item for item in ko_toctree if isinstance(item, dict)]
+
+     # Try exact match first
+     for item in candidates:
+         if item.get('title') == en_title:
+             return item
+
+     # An empty/None title cannot be translated meaningfully; skip the LLM call
+     if not en_title:
+         return None
+
+     # Try translated title match
+     try:
+         translated_title = self.translate_title(en_title)
+     except Exception as e:
+         # Report instead of silently swallowing (was a bare `except: pass`)
+         print(f"Error matching section '{en_title}': {e}")
+         return None
+
+     for item in candidates:
+         if item.get('title') == translated_title:
+             return item
+
+     return None
+
+ def merge_subsections(self, en_section: dict, ko_section: dict, target_local: str, new_entry: dict) -> bool:
+ """Merge subsections while maintaining order
+
+ The position of `target_local` among the direct children of
+ `en_section` decides where `new_entry` goes in `ko_section`. Deeper
+ nesting levels are not searched here.
+
+ Args:
+ en_section: English section used as the ordering reference.
+ ko_section: Korean section; mutated in place (a 'sections' list is
+ created when missing).
+ target_local: 'local' path to locate in the English section.
+ new_entry: Entry to place in the Korean section.
+
+ Returns:
+ True if the entry was placed; False if `en_section` has no
+ 'sections' or `target_local` is not one of its direct children.
+ """
+ if 'sections' not in en_section:
+ return False
+
+ # Find target index in English sections
+ target_index = -1
+ for i, en_subsection in enumerate(en_section['sections']):
+ if en_subsection.get('local') == target_local:
+ target_index = i
+ break
+
+ if target_index == -1:
+ return False
+
+ # Ensure Korean section has sections array
+ if 'sections' not in ko_section:
+ ko_section['sections'] = []
+
+ # Insert at correct position
+ self.insert_at_correct_position(ko_section, target_index, new_entry)
+ return True
+
+ def insert_at_correct_position(self, ko_section: dict, target_index: int, new_entry: dict):
+     """Insert `new_entry` into `ko_section['sections']` at or near `target_index`.
+
+     Existing entries are never overwritten or padded with None: the previous
+     implementation assigned to `sections[target_index]`, which replaced an
+     unrelated entry when the slot was occupied and left None holes (dumped
+     as YAML nulls) when the Korean list was shorter than the index.
+
+     Args:
+         ko_section: Korean section dict; its 'sections' list is mutated in place.
+         target_index: Position of the page among the English section's children.
+         new_entry: Entry dict to add.
+
+     Returns:
+         None.
+     """
+     sections = ko_section['sections']
+
+     # Re-running for the same page replaces its entry instead of duplicating it
+     new_local = new_entry.get('local')
+     for i, existing in enumerate(sections):
+         if isinstance(existing, dict) and existing.get('local') == new_local:
+             sections[i] = new_entry
+             return
+
+     # Clamp the index: a Korean list shorter than the English one appends at the end
+     position = min(max(target_index, 0), len(sections))
+     sections.insert(position, new_entry)
+
+ def create_section_with_order(self, en_section: dict, target_local: str, new_entry: dict) -> dict:
+ """Create new Korean section with only the translated entry
+
+ Args:
+ en_section: English section whose title is translated and whose
+ 'isExpanded' flag (default False) is copied.
+ target_local: Not used in this method body.
+ new_entry: The single entry placed in the new section's 'sections'.
+
+ Returns:
+ A new section dict with 'title', 'isExpanded' and 'sections' keys.
+ """
+ new_ko_section = {
+ 'title': self.translate_title(en_section.get('title')),
+ 'isExpanded': en_section.get('isExpanded', False),
+ 'sections': [new_entry] # Only add the translated entry
+ }
+
+ return new_ko_section
diff --git a/agent/workflow.py b/agent/workflow.py
index 31f271c..fe85267 100644
--- a/agent/workflow.py
+++ b/agent/workflow.py
@@ -1,338 +1,338 @@
-"""Module for gradio interfaces."""
-
-import os
-from pathlib import Path
-import gradio as gr
-
-from translator.content import (
- fill_scaffold,
- get_content,
- get_full_prompt,
- llm_translate,
- preprocess_content,
-)
-from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
-# GitHub PR Agent import
-try:
- from pr_generator.agent import GitHubPRAgent
-
- GITHUB_PR_AVAILABLE = True
-except ImportError as e:
- print(f"⚠️ GitHub PR Agent is not available: {e}")
- GITHUB_PR_AVAILABLE = False
-
-import json
-from logger.github_logger import GitHubLogger
-
-
-def report_translation_target_files(
- project: str, translate_lang: str, top_k: int = 1
-) -> tuple[str, list[list[str]]]:
- """Return the top-k files that need translation, excluding files already in progress.
-
- Args:
- project: Project to translate (e.g., "transformers", "smolagents")
- translate_lang: Target language to translate
- top_k: Number of top-first files to return for translation. (Default 1)
- """
- # Get repo files once to avoid duplicate API calls
- all_repo_files = get_github_repo_files(project)
-
- # Get all available files for translation using the file list
- all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2, all_repo_files) # Get more to account for filtering
-
- # Get files in progress using the same file list
- docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang, all_repo_files)
-
- # Filter out files that are already in progress
- available_files = [f for f in all_filepath_list if f not in docs_in_progress]
-
- # Take only the requested number
- filepath_list = available_files[:top_k]
-
- # Build combined status report
- status_report = all_status_report
-
- if docs_in_progress:
- status_report += f"\n\n🤖 Found {len(docs_in_progress)} files in progress for translation:"
- for i, file in enumerate(docs_in_progress):
- status_report += f"\n{i+1}. [`{file}`]({pr_info_list[i]})"
- status_report += f"\n\n📋 Showing {len(filepath_list)} available files (excluding in-progress):"
-
- return status_report, [[file] for file in filepath_list]
-
-
-def translate_docs(lang: str, file_path: str, additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False) -> tuple[str, str]:
- """Translate documentation."""
- # Check if translation already exists (unless force retranslate is enabled)
- translation_file_path = (
- Path(__file__).resolve().parent.parent
- / f"translation_result/{file_path}"
- )
-
- if not force_retranslate and translation_file_path.exists():
- print(f"📄 Found existing translation: {translation_file_path}")
- with open(translation_file_path, "r", encoding="utf-8") as f:
- existing_content = f.read()
- if existing_content.strip():
- existing_msg = f"♻️ **Existing translation loaded** (no tokens used)\n📁 **File:** `{file_path}`\n📅 **Loaded from:** `{translation_file_path}`\n💡 **To retranslate:** Check 'Force Retranslate' option."
- return existing_msg, existing_content
-
- # step 1. Get content from file path
- content = get_content(file_path, project)
- to_translate = preprocess_content(content)
-
- # step 2. Prepare prompt with docs content
- if lang == "ko":
- translation_lang = "Korean"
- to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
-
- print("to_translate_with_prompt:\n", to_translate_with_prompt)
-
- # step 3. Translate with LLM
- # TODO: MCP clilent 넘길 부분
- callback_result, translated_content = llm_translate(to_translate_with_prompt)
- print("translated_content:\n")
- print(translated_content)
- if translated_content.startswith("```md\n") and translated_content.endswith("```"):
- print("Satisfied translated_content.startswith ``` md")
- translated_content = translated_content[5:-3].strip()
- # step 4. Add scaffold to translation result
- translated_doc = fill_scaffold(content, to_translate, translated_content)
- print("translated_doc:\n")
- print(translated_doc)
- return callback_result, translated_doc
-
-
-def translate_docs_interactive(
- translate_lang: str, selected_files: list[list[str]], additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False
-) -> tuple[str, str]:
- """Interactive translation function that processes files one by one.
-
- Args:
- translate_lang: Target language to translate
- selected_files: List of file paths to translate
- """
- # Extract file paths from the dataframe format
- file_paths = [row[0] for row in selected_files if row and len(row) > 0]
-
- # Start with the first file
- current_file = file_paths[0]
-
- callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction, project, force_retranslate)
-
- # Check if existing translation was loaded
- if isinstance(callback_result, str) and "Existing translation loaded" in callback_result:
- status = callback_result # Use the existing translation message
- else:
- if force_retranslate:
- status = f"🔄 **Force Retranslation completed**: `{current_file}` → `{translate_lang}`\n\n"
- else:
- status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
- status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
-
- print(callback_result)
- print(status)
-
- return status, translated_content
-
-
-def generate_github_pr(
- target_language: str,
- filepath: str,
- translated_content: str = None,
- github_config: dict = None,
- en_title: str = None,
- project: str = "transformers",
-) -> str:
- """Generate a GitHub PR for translated documentation.
-
- Args:
- target_language: Target language for translation (e.g., "ko")
- filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
- translated_content: Translated content (if None, read from file)
- github_config: GitHub configuration dictionary
- en_title: English title for toctree mapping
-
- Returns:
- PR creation result message
- """
- if not GITHUB_PR_AVAILABLE:
- return "❌ GitHub PR Agent is not available. Please install required libraries."
-
- if not github_config:
- return "❌ GitHub configuration not provided. Please set up GitHub token, owner, and repository in Configuration panel."
-
- # Validate required configuration
- required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
- missing_fields = [
- field for field in required_fields if not github_config.get(field)
- ]
-
- if missing_fields:
- return f"❌ Missing required GitHub configuration: {', '.join(missing_fields)}\n\n💡 Go to Configuration panel and set:\n" + "\n".join([f" • {field}" for field in missing_fields])
-
- # Set token in environment for the agent.
- os.environ["GITHUB_TOKEN"] = github_config["token"]
-
- try:
- # Read translated content from file if not provided
- if translated_content is None:
- translation_file_path = (
- Path(__file__).resolve().parent.parent
- / f"translation_result/{filepath}"
- )
- if not translation_file_path.exists():
- return f"❌ Translation file not found: {translation_file_path}\n\n💡 Please complete translation first in Tab 2 for file: {filepath}"
-
- with open(translation_file_path, "r", encoding="utf-8") as f:
- translated_content = f.read()
-
- if not translated_content or not translated_content.strip():
- return f"❌ Translated content is empty for file: {filepath}\n\n💡 Please complete translation first in Tab 2."
-
- # Execute GitHub PR Agent
- # Get base repository from project config
- from translator.project_config import get_project_config
- project_config = get_project_config(project)
- base_repo_path = project_config.repo_url.replace("https://github.com/", "")
- base_owner, base_repo = base_repo_path.split("/")
-
- print(f"🚀 Starting GitHub PR creation...")
- print(f" 📁 File: {filepath}")
- print(f" 🌍 Language: {target_language}")
- print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
- print(f" 🏠 User Fork: {github_config['owner']}/{github_config['repo_name']}")
- print(f" 🎯 Base Repository: {base_owner}/{base_repo}")
-
- agent = GitHubPRAgent(
- user_owner=github_config["owner"],
- user_repo=github_config["repo_name"],
- base_owner=base_owner,
- base_repo=base_repo,
- )
- result = agent.run_translation_pr_workflow(
- reference_pr_url=github_config["reference_pr_url"],
- target_language=target_language,
- filepath=filepath,
- translated_doc=translated_content,
- base_branch=github_config.get("base_branch", "main"),
- )
- # TEST CODE
- # result = {
- # 'status': 'partial_success',
- # 'branch': 'ko-attention_interface',
- # 'file_path': 'docs/source/ko/attention_interface.md',
- # 'message': 'File was saved and commit was successful.\nPR creation failed: ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1', 'error_details': 'ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1'
- # }
- # Process toctree update after successful translation PR
- toctree_result = None
- if en_title:
- from agent.toctree_handler import TocTreeHandler
- toctree_handler = TocTreeHandler(project)
- toctree_result = toctree_handler.update_toctree_after_translation(
- result, filepath, agent, github_config, project
- )
-
- # Process result
- # Generate toctree status message (shared for both success and partial_success)
- toctree_status = ""
- if toctree_result:
- if toctree_result["status"] == "success":
- toctree_status = f"\n📋 **Toctree Updated:** ✅ {toctree_result['message']}"
- else:
- toctree_status = f"\n📋 **Toctree Update Failed:** ❌ {toctree_result['message']}"
-
- # Append full result JSON to dedicated GitHub logging repository (always)
- try:
- log_data = result.copy()
- if toctree_result:
- log_data["toctree_result"] = toctree_result
- log_entry = json.dumps(log_data, ensure_ascii=False) + "\n"
- log_res = GitHubLogger().append_jsonl(log_entry)
- print(f"📝 Log append result: {log_res}")
- except Exception as e:
- print(f"❌ Failed to append PR log via GitHub API: {e}")
-
- if result["status"] == "success":
- return f"""✅ **GitHub PR Creation Successful!**
-
-🔗 **PR URL:** {result.get('pr_url', 'NO_PR_URL')}
-🌿 **Branch:** {result["branch"]}
-📁 **File:** {result["file_path"]}{toctree_status}
-
-{result["message"]}"""
-
- elif result["status"] == "partial_success":
- error_details = result.get("error_details", "Unknown error")
-
- # Check if it's "existing PR" case (not really an error)
- if "Existing PR found" in error_details:
- existing_pr_url = error_details.split(": ")[-1] if ": " in error_details else "Unknown"
- return f"""🔄 **Translation Updated Successfully**
-
-🎯 **Selected Project:** {project}
-🌿 **Branch:** {result["branch"]}
-📁 **File:** {result["file_path"]}{toctree_status}
-
-🔗 **Existing PR Updated:** {existing_pr_url}
-
-✅ Your translation has been added to the existing PR. The file and toctree have been successfully updated!"""
- else:
- # Actual error case
- return f"""⚠️ **Partial Success**
-
-🎯 **Selected Project:** {project}
-🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
-🎯 **Target Base:** {base_owner}/{base_repo}
-🌿 **Branch:** {result["branch"]}
-📁 **File:** {result["file_path"]}{toctree_status}
-
-{result["message"]}
-
-**Error Details:**
-{error_details}
-
-💡 **Project-Repository Mismatch Check:**
-- Selected project '{project}' should match repository '{github_config.get('repo_name', 'REPO')}'
-- For smolagents: use Jwaminju/smolagents fork
-- For transformers: use Jwaminju/transformers fork"""
-
- else:
- error_details = result.get("error_details", "No additional details")
- return f"""❌ **GitHub PR Creation Failed**
-
-🎯 **Selected Project:** {project}
-🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
-🎯 **Target Base:** {base_owner}/{base_repo}
-
-**Error Message:**
-{result["message"]}
-
-**Error Details:**
-{error_details}
-
-💡 **Project-Repository Mismatch:**
-Selected project '{project}' but configured repository '{github_config.get('repo_name', 'REPO')}'
-• For smolagents project: use 'smolagents' repository
-• For transformers project: use 'transformers' repository"""
-
- except Exception as e:
- error_msg = f"""❌ **Unexpected Error During PR Creation**
-
-**Error:** {str(e)}
-
-**Configuration:**
-• Project: {project}
-• File: {filepath}
-• Target: {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')} → {base_owner if 'base_owner' in locals() else 'BASE'}/{base_repo if 'base_repo' in locals() else 'REPO'}"""
- print(error_msg)
- return error_msg
-
-
-# Backward compatibility function (replaces old mock function)
-def mock_generate_PR():
- """Backward compatibility function - returns warning message only"""
- return (
- "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
- )
+"""Module for gradio interfaces."""
+
+import os
+from pathlib import Path
+import gradio as gr
+
+from translator.content import (
+ fill_scaffold,
+ get_content,
+ get_full_prompt,
+ llm_translate,
+ preprocess_content,
+)
+from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
+# GitHub PR Agent import
+try:
+ from pr_generator.agent import GitHubPRAgent
+
+ GITHUB_PR_AVAILABLE = True
+except ImportError as e:
+ print(f"⚠️ GitHub PR Agent is not available: {e}")
+ GITHUB_PR_AVAILABLE = False
+
+import json
+from logger.github_logger import GitHubLogger
+
+
+def report_translation_target_files(
+ project: str, translate_lang: str, top_k: int = 1
+) -> tuple[str, list[list[str]]]:
+ """Return the top-k files that need translation, excluding files already in progress.
+
+ Args:
+ project: Project to translate (e.g., "transformers", "smolagents")
+ translate_lang: Target language to translate
+ top_k: Number of top-first files to return for translation. (Default 1)
+
+ Returns:
+ A pair of (status report text, list of single-element rows, one row
+ per selected file path).
+ """
+ # Get repo files once to avoid duplicate API calls
+ all_repo_files = get_github_repo_files(project)
+
+ # Get all available files for translation using the file list
+ all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2, all_repo_files) # Get more to account for filtering
+
+ # Get files in progress using the same file list
+ docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang, all_repo_files)
+
+ # Filter out files that are already in progress
+ available_files = [f for f in all_filepath_list if f not in docs_in_progress]
+
+ # Take only the requested number
+ filepath_list = available_files[:top_k]
+
+ # Build combined status report
+ status_report = all_status_report
+
+ if docs_in_progress:
+ status_report += f"\n\n🤖 Found {len(docs_in_progress)} files in progress for translation:"
+ # pr_info_list is indexed in parallel with docs_in_progress
+ for i, file in enumerate(docs_in_progress):
+ status_report += f"\n{i+1}. [`{file}`]({pr_info_list[i]})"
+ status_report += f"\n\n📋 Showing {len(filepath_list)} available files (excluding in-progress):"
+
+ return status_report, [[file] for file in filepath_list]
+
+
+def translate_docs(lang: str, file_path: str, additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False) -> tuple[str, str]:
+    """Translate one documentation file, reusing a saved translation when present.
+
+    Args:
+        lang: Target language code. Only "ko" is supported for a fresh translation.
+        file_path: Path of the document; also used to locate a saved result
+            under "translation_result/" two directories above this module.
+        additional_instruction: Extra instruction text passed to the prompt builder.
+        project: Project name passed to `get_content`.
+        force_retranslate: When True, ignore any saved translation.
+
+    Returns:
+        A pair (message, document). The message is a notice that an existing
+        translation was loaded, or the first element returned by `llm_translate`.
+
+    Raises:
+        ValueError: If a fresh translation is needed and `lang` is not
+            supported. Previously this surfaced as an UnboundLocalError
+            after the source content had already been fetched.
+    """
+    # Check if translation already exists (unless force retranslate is enabled)
+    translation_file_path = (
+        Path(__file__).resolve().parent.parent
+        / f"translation_result/{file_path}"
+    )
+
+    if not force_retranslate and translation_file_path.exists():
+        print(f"📄 Found existing translation: {translation_file_path}")
+        with open(translation_file_path, "r", encoding="utf-8") as f:
+            existing_content = f.read()
+        # An empty saved file does not count; fall through and translate
+        if existing_content.strip():
+            existing_msg = f"♻️ **Existing translation loaded** (no tokens used)\n📁 **File:** `{file_path}`\n📅 **Loaded from:** `{translation_file_path}`\n💡 **To retranslate:** Check 'Force Retranslate' option."
+            return existing_msg, existing_content
+
+    # Resolve the language name up front so an unsupported code fails with a
+    # clear error before any content is fetched or tokens are spent
+    language_names = {"ko": "Korean"}
+    if lang not in language_names:
+        raise ValueError(
+            f"Unsupported translation language '{lang}'; supported: {', '.join(language_names)}"
+        )
+    translation_lang = language_names[lang]
+
+    # step 1. Get content from file path
+    content = get_content(file_path, project)
+    to_translate = preprocess_content(content)
+
+    # step 2. Prepare prompt with docs content
+    to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
+
+    print("to_translate_with_prompt:\n", to_translate_with_prompt)
+
+    # step 3. Translate with LLM
+    # TODO: this is where the MCP client should be handed over
+    callback_result, translated_content = llm_translate(to_translate_with_prompt)
+    print("translated_content:\n")
+    print(translated_content)
+    # Unwrap a response that came back inside a ```md ... ``` fence; the
+    # newline left behind by the slice is removed by strip()
+    if translated_content.startswith("```md\n") and translated_content.endswith("```"):
+        print("Satisfied translated_content.startswith ``` md")
+        translated_content = translated_content[5:-3].strip()
+    # step 4. Add scaffold to translation result
+    translated_doc = fill_scaffold(content, to_translate, translated_content)
+    print("translated_doc:\n")
+    print(translated_doc)
+    return callback_result, translated_doc
+
+
+def translate_docs_interactive(
+ translate_lang: str, selected_files: list[list[str]], additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False
+) -> tuple[str, str]:
+ """Interactive translation function that processes files one by one.
+
+ Only the first selected file is translated per call.
+
+ Args:
+ translate_lang: Target language to translate
+ selected_files: Rows whose first element is a file path to translate
+ additional_instruction: Extra instruction text passed to `translate_docs`
+ project: Project name passed to `translate_docs`
+ force_retranslate: Passed to `translate_docs`; also selects the status wording
+
+ Returns:
+ A pair of (status message, translated content).
+
+ Raises:
+ IndexError: If `selected_files` yields no file path.
+ """
+ # Extract file paths from the dataframe format
+ file_paths = [row[0] for row in selected_files if row and len(row) > 0]
+
+ # Start with the first file
+ current_file = file_paths[0]
+
+ callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction, project, force_retranslate)
+
+ # Check if existing translation was loaded
+ if isinstance(callback_result, str) and "Existing translation loaded" in callback_result:
+ status = callback_result # Use the existing translation message
+ else:
+ if force_retranslate:
+ status = f"🔄 **Force Retranslation completed**: `{current_file}` → `{translate_lang}`\n\n"
+ else:
+ status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
+ status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
+
+ print(callback_result)
+ print(status)
+
+ return status, translated_content
+
+
+def generate_github_pr(
+ target_language: str,
+ filepath: str,
+ translated_content: str = None,
+ github_config: dict = None,
+ en_title: str = None,
+ project: str = "transformers",
+) -> str:
+ """Generate a GitHub PR for translated documentation.
+
+ Besides creating the PR, this function sets the GITHUB_TOKEN environment
+ variable from `github_config`, optionally updates the toctree, and
+ appends the result as a JSON line through `GitHubLogger`.
+
+ Args:
+ target_language: Target language for translation (e.g., "ko")
+ filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
+ translated_content: Translated content (if None, read from file)
+ github_config: GitHub configuration dictionary; "token", "owner",
+ "repo_name" and "reference_pr_url" are required, "base_branch"
+ is optional and defaults to "main"
+ en_title: English title for toctree mapping; only its truthiness is
+ used here, to decide whether the toctree update runs
+ project: Project name used to look up the base repository
+
+ Returns:
+ PR creation result message. Validation failures and unexpected
+ exceptions are also reported as a message rather than raised.
+ """
+ if not GITHUB_PR_AVAILABLE:
+ return "❌ GitHub PR Agent is not available. Please install required libraries."
+
+ if not github_config:
+ return "❌ GitHub configuration not provided. Please set up GitHub token, owner, and repository in Configuration panel."
+
+ # Validate required configuration
+ required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
+ missing_fields = [
+ field for field in required_fields if not github_config.get(field)
+ ]
+
+ if missing_fields:
+ return f"❌ Missing required GitHub configuration: {', '.join(missing_fields)}\n\n💡 Go to Configuration panel and set:\n" + "\n".join([f" • {field}" for field in missing_fields])
+
+ # Set token in environment for the agent.
+ # NOTE: this is process-wide and stays set after the function returns.
+ os.environ["GITHUB_TOKEN"] = github_config["token"]
+
+ try:
+ # Read translated content from file if not provided
+ if translated_content is None:
+ translation_file_path = (
+ Path(__file__).resolve().parent.parent
+ / f"translation_result/{filepath}"
+ )
+ if not translation_file_path.exists():
+ return f"❌ Translation file not found: {translation_file_path}\n\n💡 Please complete translation first in Tab 2 for file: {filepath}"
+
+ with open(translation_file_path, "r", encoding="utf-8") as f:
+ translated_content = f.read()
+
+ if not translated_content or not translated_content.strip():
+ return f"❌ Translated content is empty for file: {filepath}\n\n💡 Please complete translation first in Tab 2."
+
+ # Execute GitHub PR Agent
+ # Get base repository from project config
+ from translator.project_config import get_project_config
+ project_config = get_project_config(project)
+ # Assumes repo_url has the exact form "https://github.com/<owner>/<repo>";
+ # any extra path segment makes the two-name unpacking below raise.
+ base_repo_path = project_config.repo_url.replace("https://github.com/", "")
+ base_owner, base_repo = base_repo_path.split("/")
+
+ print(f"🚀 Starting GitHub PR creation...")
+ print(f" 📁 File: {filepath}")
+ print(f" 🌍 Language: {target_language}")
+ print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
+ print(f" 🏠 User Fork: {github_config['owner']}/{github_config['repo_name']}")
+ print(f" 🎯 Base Repository: {base_owner}/{base_repo}")
+
+ agent = GitHubPRAgent(
+ user_owner=github_config["owner"],
+ user_repo=github_config["repo_name"],
+ base_owner=base_owner,
+ base_repo=base_repo,
+ )
+ result = agent.run_translation_pr_workflow(
+ reference_pr_url=github_config["reference_pr_url"],
+ target_language=target_language,
+ filepath=filepath,
+ translated_doc=translated_content,
+ base_branch=github_config.get("base_branch", "main"),
+ )
+ # TEST CODE
+ # result = {
+ # 'status': 'partial_success',
+ # 'branch': 'ko-attention_interface',
+ # 'file_path': 'docs/source/ko/attention_interface.md',
+ # 'message': 'File was saved and commit was successful.\nPR creation failed: ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1', 'error_details': 'ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1'
+ # }
+ # Process toctree update after successful translation PR
+ # The handler itself returns None when result["status"] is "error".
+ toctree_result = None
+ if en_title:
+ from agent.toctree_handler import TocTreeHandler
+ toctree_handler = TocTreeHandler(project)
+ toctree_result = toctree_handler.update_toctree_after_translation(
+ result, filepath, agent, github_config, project
+ )
+
+ # Process result
+ # Generate toctree status message (shared for both success and partial_success)
+ toctree_status = ""
+ if toctree_result:
+ if toctree_result["status"] == "success":
+ toctree_status = f"\n📋 **Toctree Updated:** ✅ {toctree_result['message']}"
+ else:
+ toctree_status = f"\n📋 **Toctree Update Failed:** ❌ {toctree_result['message']}"
+
+ # Append full result JSON to dedicated GitHub logging repository (always)
+ # Logging is best-effort: a failure here is printed and does not change the returned message.
+ try:
+ log_data = result.copy()
+ if toctree_result:
+ log_data["toctree_result"] = toctree_result
+ log_entry = json.dumps(log_data, ensure_ascii=False) + "\n"
+ log_res = GitHubLogger().append_jsonl(log_entry)
+ print(f"📝 Log append result: {log_res}")
+ except Exception as e:
+ print(f"❌ Failed to append PR log via GitHub API: {e}")
+
+ if result["status"] == "success":
+ return f"""✅ **GitHub PR Creation Successful!**
+
+🔗 **PR URL:** {result.get('pr_url', 'NO_PR_URL')}
+🌿 **Branch:** {result["branch"]}
+📁 **File:** {result["file_path"]}{toctree_status}
+
+{result["message"]}"""
+
+ elif result["status"] == "partial_success":
+ error_details = result.get("error_details", "Unknown error")
+
+ # Check if it's "existing PR" case (not really an error)
+ if "Existing PR found" in error_details:
+ # The PR URL is taken as the text after the last ": " in the details
+ existing_pr_url = error_details.split(": ")[-1] if ": " in error_details else "Unknown"
+ return f"""🔄 **Translation Updated Successfully**
+
+🎯 **Selected Project:** {project}
+🌿 **Branch:** {result["branch"]}
+📁 **File:** {result["file_path"]}{toctree_status}
+
+🔗 **Existing PR Updated:** {existing_pr_url}
+
+✅ Your translation has been added to the existing PR. The file and toctree have been successfully updated!"""
+ else:
+ # Actual error case
+ return f"""⚠️ **Partial Success**
+
+🎯 **Selected Project:** {project}
+🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
+🎯 **Target Base:** {base_owner}/{base_repo}
+🌿 **Branch:** {result["branch"]}
+📁 **File:** {result["file_path"]}{toctree_status}
+
+{result["message"]}
+
+**Error Details:**
+{error_details}
+
+💡 **Project-Repository Mismatch Check:**
+- Selected project '{project}' should match repository '{github_config.get('repo_name', 'REPO')}'
+- For smolagents: use Jwaminju/smolagents fork
+- For transformers: use Jwaminju/transformers fork"""
+
+ else:
+ # Any status other than "success" / "partial_success" is reported as a failure
+ error_details = result.get("error_details", "No additional details")
+ return f"""❌ **GitHub PR Creation Failed**
+
+🎯 **Selected Project:** {project}
+🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
+🎯 **Target Base:** {base_owner}/{base_repo}
+
+**Error Message:**
+{result["message"]}
+
+**Error Details:**
+{error_details}
+
+💡 **Project-Repository Mismatch:**
+Selected project '{project}' but configured repository '{github_config.get('repo_name', 'REPO')}'
+• For smolagents project: use 'smolagents' repository
+• For transformers project: use 'transformers' repository"""
+
+ except Exception as e:
+ # base_owner/base_repo may not be bound yet if the failure happened early,
+ # hence the locals() checks inside the message.
+ error_msg = f"""❌ **Unexpected Error During PR Creation**
+
+**Error:** {str(e)}
+
+**Configuration:**
+• Project: {project}
+• File: {filepath}
+• Target: {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')} → {base_owner if 'base_owner' in locals() else 'BASE'}/{base_repo if 'base_repo' in locals() else 'REPO'}"""
+ print(error_msg)
+ return error_msg
+
+
+# Backward compatibility function (replaces old mock function)
+def mock_generate_PR():
+ """Backward compatibility function - returns warning message only
+
+ Returns:
+ A fixed deprecation notice pointing to `generate_github_pr()`; no PR
+ work is performed.
+ """
+ return (
+ "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
+ )
diff --git a/app.py b/app.py
index 65402c8..51bca82 100644
--- a/app.py
+++ b/app.py
@@ -1,379 +1,379 @@
-"""Module for gradio chat-based translation agent interface."""
-
-import base64
-import os
-
-import gradio as gr
-from dotenv import load_dotenv
-
-from agent.handler import (
- approve_handler,
- confirm_and_go_translate_handler,
- confirm_translation_and_go_upload_handler,
- get_welcome_message,
- process_file_search_handler,
- restart_handler,
- send_message,
- start_translate_handler,
- sync_language_displays,
- update_language_selection,
- update_project_selection,
- update_prompt_preview,
- update_status,
- update_github_config,
- update_persistent_config,
-)
-from translator.model import Languages
-from translator.project_config import get_available_projects
-
-load_dotenv()
-
-
-css = """
-.gradio-container {
- background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
-}
-.chat-container {
- background: rgba(255, 255, 180, 0.25);
- border-radius: 18px;
- box-shadow: 0 4px 24px rgba(0,0,0,0.08);
- padding: 1.0em;
- backdrop-filter: blur(8px);
- border: 1px solid rgba(255,255,180,0.25);
- width: 100%;
- height: 100%;
-}
-.control-panel {
- background: rgba(255, 255, 180, 0.25);
- border-radius: 18px;
- box-shadow: 0 4px 24px rgba(0,0,0,0.08);
- padding: 1.0em;
- backdrop-filter: blur(8px);
- border: 1px solid rgba(255,255,180,0.25);
- width: 100%;
- overflow: visible !important;
-
-}
-.status-card {
- width: 100%
-}
-.action-button {
- background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
- color: white !important;
- border: none !important;
- font-weight: 600 !important;
- box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
- transition: all 0.3s ease-in-out !important;
-}
-.action-button:hover {
- background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
- box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
- transform: translateY(-2px) !important;
-}
-
-.simple-tabs .tab-nav button {
- background: transparent !important;
- color: #4A5568 !important;
- box-shadow: none !important;
- transform: none !important;
- border: none !important;
- border-bottom: 2px solid #E2E8F0 !important;
- border-radius: 0 !important;
- font-weight: 600 !important;
-}
-
-.simple-tabs .tab-nav button.selected {
- color: #f97316 !important;
- border-bottom: 2px solid #f97316 !important;
-}
-
-.simple-tabs .tab-nav button:hover {
- background: #f3f4f6 !important;
- color: #f97316 !important;
- box-shadow: none !important;
- transform: none !important;
-}
-"""
-
-
-# Create the main interface
-with gr.Blocks(
- css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
-) as demo:
- # Title
- with open("images/hfkr_logo.png", "rb") as img_file:
- base64_img = base64.b64encode(img_file.read()).decode()
- gr.Markdown(
- f'
'
- )
- gr.Markdown(
- ' 🌐 Hugging Face Transformers Docs i18n made easy
'
- )
-
- # Content
- with gr.Row():
- # Chat interface
- with gr.Column(scale=3, elem_classes=["chat-container"]):
- gr.Markdown("### 🌐 Hugging Face i18n Agent")
-
- chatbot = gr.Chatbot(
- value=[[None, get_welcome_message()]], scale=1, height=525,
- show_copy_button=True
- )
-
- # Chat input directly under main chat
- gr.Markdown("### 💬 Chat with agent")
- with gr.Row():
- msg_input = gr.Textbox(
- placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
- container=False,
- scale=4,
- )
- send_btn = gr.Button("Send", scale=1, elem_classes="action-button")
-
- # Controller interface
- with gr.Column(scale=2):
- # Configuration Panel
- with gr.Column(elem_classes=["control-panel"]):
- gr.Markdown("### ⚙️ Configuration")
-
- with gr.Accordion("🔧 API & GitHub Settings", open=True):
- api_provider_radio = gr.Radio(
- ["Anthropic", "AWS Bedrock"],
- label="Select API Provider",
- value="Anthropic", # Default selection
- interactive=True,
- )
- config_anthropic_key = gr.Textbox(
- label="🔑 Anthropic API Key",
- type="password",
- placeholder="sk-ant-...",
- visible=True, # Initially visible as Anthropic is default
- )
- config_aws_bearer_token_bedrock = gr.Textbox(
- label="🔑 AWS Bearer Token for Bedrock",
- type="password",
- placeholder="AWS_BEARER_TOKEN_BEDROCK",
- visible=False, # Initially hidden
- )
- config_github_token = gr.Textbox(
- label="🔑 GitHub Token (Required for PR, Optional for file search)",
- type="password",
- placeholder="ghp_...",
- )
-
- with gr.Row():
- config_github_owner = gr.Textbox(
- label="👤 GitHub Owner",
- placeholder="your-username",
- scale=1,
- )
- config_github_repo = gr.Textbox(
- label="📁 Repository Name",
- placeholder="your-repository",
- scale=1,
- )
-
- save_config_btn = gr.Button(
- "💾 Save Configuration", elem_classes="action-button"
- )
-
- # Quick Controller
- with gr.Column(elem_classes=["control-panel"]):
- gr.Markdown("### 🛠️ Quick Controls")
- status_display = gr.HTML(update_status())
-
- with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
- with gr.TabItem("1. Find Files", id=0):
- with gr.Group():
- project_dropdown = gr.Radio(
- choices=get_available_projects(),
- label="🎯 Select Project",
- value="transformers",
- )
- lang_dropdown = gr.Radio(
- choices=[language.value for language in Languages],
- label="🌍 Translate To",
- value="ko",
- )
- k_input = gr.Number(
- label="📊 First k missing translated docs",
- value=10,
- minimum=1,
- )
- find_btn = gr.Button(
- "🔍 Find Files to Translate",
- elem_classes="action-button",
- )
-
- confirm_go_btn = gr.Button(
- "✅ Confirm Selection & Go to Translate",
- elem_classes="action-button",
- )
-
- with gr.TabItem("2. Translate", id=1):
- with gr.Group():
- files_to_translate = gr.Radio(
- choices=[],
- label="📄 Select a file to translate",
- interactive=True,
- value=None,
- )
- file_to_translate_input = gr.Textbox(
- label="🌍 Select in the dropdown or write the file path to translate",
- value="",
- )
-
- translate_lang_display = gr.Dropdown(
- choices=[language.value for language in Languages],
- label="🌍 Translation Language",
- value="ko",
- interactive=False,
- )
- additional_instruction = gr.Textbox(
- label="📝 Additional instructions (Optional - e.g., custom glossary)",
- placeholder="Example: Translate 'model' as '모델' consistently",
- lines=2,
- )
-
- force_retranslate = gr.Checkbox(
- label="🔄 Force Retranslate (ignore existing translations)",
- value=False,
- )
-
- with gr.Accordion("🔍 Preview Translation Prompt", open=False):
- prompt_preview = gr.Textbox(
- lines=8,
- interactive=False,
- placeholder="Select a file and language to see the prompt preview...",
- show_copy_button=True,
- )
-
- start_translate_btn = gr.Button(
- "🚀 Start Translation", elem_classes="action-button"
- )
-
- confirm_upload_btn = gr.Button(
- "✅ Confirm Translation & Upload PR",
- elem_classes="action-button",
- visible=False,
- )
-
- with gr.TabItem("3. Upload PR", id=2):
- with gr.Group():
- reference_pr_url = gr.Textbox(
- label="🔗 Reference PR URL (Optional)",
- placeholder="Auto-filled based on project selection",
- )
- approve_btn = gr.Button(
- "✅ Generate GitHub PR", elem_classes="action-button"
- )
- restart_btn = gr.Button(
- "🔄 Restart Translation", elem_classes="action-button"
- )
-
- # Event Handlers
-
- find_btn.click(
- fn=process_file_search_handler,
- inputs=[project_dropdown, lang_dropdown, k_input, chatbot],
- outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
- )
-
- confirm_go_btn.click(
- fn=confirm_and_go_translate_handler,
- inputs=[chatbot],
- outputs=[chatbot, msg_input, status_display, control_tabs],
- )
-
- # Auto-save selections to state and update prompt preview
- project_dropdown.change(
- fn=update_project_selection,
- inputs=[project_dropdown, chatbot],
- outputs=[chatbot, msg_input, status_display],
- )
-
- # Update prompt preview when project changes
- project_dropdown.change(
- fn=update_prompt_preview,
- inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
- outputs=[prompt_preview],
- )
-
- lang_dropdown.change(
- fn=update_language_selection,
- inputs=[lang_dropdown, chatbot],
- outputs=[chatbot, msg_input, status_display, translate_lang_display],
- )
-
- #
- files_to_translate.change(
- fn=lambda x: x,
- inputs=[files_to_translate],
- outputs=[file_to_translate_input],
- )
-
- # Button event handlers
- start_translate_btn.click(
- fn=start_translate_handler,
- inputs=[chatbot, file_to_translate_input, additional_instruction, force_retranslate],
- outputs=[chatbot, msg_input, status_display, control_tabs, start_translate_btn, confirm_upload_btn],
- )
-
- confirm_upload_btn.click(
- fn=confirm_translation_and_go_upload_handler,
- inputs=[chatbot],
- outputs=[chatbot, msg_input, status_display, control_tabs],
- )
-
- # Configuration Save
- save_config_btn.click(
- fn=update_persistent_config,
- inputs=[api_provider_radio, config_anthropic_key, config_aws_bearer_token_bedrock, config_github_token, config_github_owner, config_github_repo, reference_pr_url, chatbot],
- outputs=[chatbot, msg_input, status_display],
- )
-
- # API Provider selection handler
- api_provider_radio.change(
- fn=lambda provider: (
- gr.update(visible=True) if provider == "Anthropic" else gr.update(visible=False),
- gr.update(visible=True) if provider == "AWS Bedrock" else gr.update(visible=False),
- ),
- inputs=[api_provider_radio],
- outputs=[config_anthropic_key, config_aws_bearer_token_bedrock],
- )
-
- approve_btn.click(
- fn=approve_handler,
- inputs=[chatbot, config_github_owner, config_github_repo, reference_pr_url],
- outputs=[chatbot, msg_input, status_display],
- )
-
- restart_btn.click(
- fn=restart_handler,
- inputs=[chatbot],
- outputs=[chatbot, msg_input, status_display, control_tabs],
- )
-
- send_btn.click(
- fn=send_message,
- inputs=[msg_input, chatbot],
- outputs=[chatbot, msg_input, status_display],
- )
-
- msg_input.submit(
- fn=send_message,
- inputs=[msg_input, chatbot],
- outputs=[chatbot, msg_input, status_display],
- )
-
- # Update prompt preview when inputs change
- for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
- input_component.change(
- fn=update_prompt_preview,
- inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
- outputs=[prompt_preview],
- )
-
-root_path = os.environ.get("GRADIO_ROOT_PATH")
-demo.launch(root_path=root_path)
+"""Module for gradio chat-based translation agent interface."""
+
+import base64
+import os
+
+import gradio as gr
+from dotenv import load_dotenv
+
+from agent.handler import (
+ approve_handler,
+ confirm_and_go_translate_handler,
+ confirm_translation_and_go_upload_handler,
+ get_welcome_message,
+ process_file_search_handler,
+ restart_handler,
+ send_message,
+ start_translate_handler,
+ sync_language_displays,
+ update_language_selection,
+ update_project_selection,
+ update_prompt_preview,
+ update_status,
+ update_github_config,
+ update_persistent_config,
+)
+from translator.model import Languages
+from translator.project_config import get_available_projects
+
+load_dotenv()
+
+
+css = """
+.gradio-container {
+ background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
+}
+.chat-container {
+ background: rgba(255, 255, 180, 0.25);
+ border-radius: 18px;
+ box-shadow: 0 4px 24px rgba(0,0,0,0.08);
+ padding: 1.0em;
+ backdrop-filter: blur(8px);
+ border: 1px solid rgba(255,255,180,0.25);
+ width: 100%;
+ height: 100%;
+}
+.control-panel {
+ background: rgba(255, 255, 180, 0.25);
+ border-radius: 18px;
+ box-shadow: 0 4px 24px rgba(0,0,0,0.08);
+ padding: 1.0em;
+ backdrop-filter: blur(8px);
+ border: 1px solid rgba(255,255,180,0.25);
+ width: 100%;
+ overflow: visible !important;
+
+}
+.status-card {
+ width: 100%
+}
+.action-button {
+ background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
+ color: white !important;
+ border: none !important;
+ font-weight: 600 !important;
+ box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
+ transition: all 0.3s ease-in-out !important;
+}
+.action-button:hover {
+ background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
+ box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
+ transform: translateY(-2px) !important;
+}
+
+.simple-tabs .tab-nav button {
+ background: transparent !important;
+ color: #4A5568 !important;
+ box-shadow: none !important;
+ transform: none !important;
+ border: none !important;
+ border-bottom: 2px solid #E2E8F0 !important;
+ border-radius: 0 !important;
+ font-weight: 600 !important;
+}
+
+.simple-tabs .tab-nav button.selected {
+ color: #f97316 !important;
+ border-bottom: 2px solid #f97316 !important;
+}
+
+.simple-tabs .tab-nav button:hover {
+ background: #f3f4f6 !important;
+ color: #f97316 !important;
+ box-shadow: none !important;
+ transform: none !important;
+}
+"""
+
+
+# Create the main interface
+with gr.Blocks(
+ css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
+) as demo:
+ # Title
+ with open("images/hfkr_logo.png", "rb") as img_file:
+ base64_img = base64.b64encode(img_file.read()).decode()
+ gr.Markdown(
+ f'
'
+ )
+ gr.Markdown(
+ ' 🌐 Hugging Face Transformers Docs i18n made easy
'
+ )
+
+ # Content
+ with gr.Row():
+ # Chat interface
+ with gr.Column(scale=3, elem_classes=["chat-container"]):
+ gr.Markdown("### 🌐 Hugging Face i18n Agent")
+
+ chatbot = gr.Chatbot(
+ value=[[None, get_welcome_message()]], scale=1, height=525,
+ show_copy_button=True
+ )
+
+ # Chat input directly under main chat
+ gr.Markdown("### 💬 Chat with agent")
+ with gr.Row():
+ msg_input = gr.Textbox(
+ placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
+ container=False,
+ scale=4,
+ )
+ send_btn = gr.Button("Send", scale=1, elem_classes="action-button")
+
+ # Controller interface
+ with gr.Column(scale=2):
+ # Configuration Panel
+ with gr.Column(elem_classes=["control-panel"]):
+ gr.Markdown("### ⚙️ Configuration")
+
+ with gr.Accordion("🔧 API & GitHub Settings", open=True):
+ api_provider_radio = gr.Radio(
+ ["Anthropic", "AWS Bedrock"],
+ label="Select API Provider",
+ value="Anthropic", # Default selection
+ interactive=True,
+ )
+ config_anthropic_key = gr.Textbox(
+ label="🔑 Anthropic API Key",
+ type="password",
+ placeholder="sk-ant-...",
+ visible=True, # Initially visible as Anthropic is default
+ )
+ config_aws_bearer_token_bedrock = gr.Textbox(
+ label="🔑 AWS Bearer Token for Bedrock",
+ type="password",
+ placeholder="AWS_BEARER_TOKEN_BEDROCK",
+ visible=False, # Initially hidden
+ )
+ config_github_token = gr.Textbox(
+ label="🔑 GitHub Token (Required for PR, Optional for file search)",
+ type="password",
+ placeholder="ghp_...",
+ )
+
+ with gr.Row():
+ config_github_owner = gr.Textbox(
+ label="👤 GitHub Owner",
+ placeholder="your-username",
+ scale=1,
+ )
+ config_github_repo = gr.Textbox(
+ label="📁 Repository Name",
+ placeholder="your-repository",
+ scale=1,
+ )
+
+ save_config_btn = gr.Button(
+ "💾 Save Configuration", elem_classes="action-button"
+ )
+
+ # Quick Controller
+ with gr.Column(elem_classes=["control-panel"]):
+ gr.Markdown("### 🛠️ Quick Controls")
+ status_display = gr.HTML(update_status())
+
+ with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
+ with gr.TabItem("1. Find Files", id=0):
+ with gr.Group():
+ project_dropdown = gr.Radio(
+ choices=get_available_projects(),
+ label="🎯 Select Project",
+ value="transformers",
+ )
+ lang_dropdown = gr.Radio(
+ choices=[language.value for language in Languages],
+ label="🌍 Translate To",
+ value="ko",
+ )
+ k_input = gr.Number(
+ label="📊 First k missing translated docs",
+ value=10,
+ minimum=1,
+ )
+ find_btn = gr.Button(
+ "🔍 Find Files to Translate",
+ elem_classes="action-button",
+ )
+
+ confirm_go_btn = gr.Button(
+ "✅ Confirm Selection & Go to Translate",
+ elem_classes="action-button",
+ )
+
+ with gr.TabItem("2. Translate", id=1):
+ with gr.Group():
+ files_to_translate = gr.Radio(
+ choices=[],
+ label="📄 Select a file to translate",
+ interactive=True,
+ value=None,
+ )
+ file_to_translate_input = gr.Textbox(
+ label="🌍 Select in the dropdown or write the file path to translate",
+ value="",
+ )
+
+ translate_lang_display = gr.Dropdown(
+ choices=[language.value for language in Languages],
+ label="🌍 Translation Language",
+ value="ko",
+ interactive=False,
+ )
+ additional_instruction = gr.Textbox(
+ label="📝 Additional instructions (Optional - e.g., custom glossary)",
+ placeholder="Example: Translate 'model' as '모델' consistently",
+ lines=2,
+ )
+
+ force_retranslate = gr.Checkbox(
+ label="🔄 Force Retranslate (ignore existing translations)",
+ value=False,
+ )
+
+ with gr.Accordion("🔍 Preview Translation Prompt", open=False):
+ prompt_preview = gr.Textbox(
+ lines=8,
+ interactive=False,
+ placeholder="Select a file and language to see the prompt preview...",
+ show_copy_button=True,
+ )
+
+ start_translate_btn = gr.Button(
+ "🚀 Start Translation", elem_classes="action-button"
+ )
+
+ confirm_upload_btn = gr.Button(
+ "✅ Confirm Translation & Upload PR",
+ elem_classes="action-button",
+ visible=False,
+ )
+
+ with gr.TabItem("3. Upload PR", id=2):
+ with gr.Group():
+ reference_pr_url = gr.Textbox(
+ label="🔗 Reference PR URL (Optional)",
+ placeholder="Auto-filled based on project selection",
+ )
+ approve_btn = gr.Button(
+ "✅ Generate GitHub PR", elem_classes="action-button"
+ )
+ restart_btn = gr.Button(
+ "🔄 Restart Translation", elem_classes="action-button"
+ )
+
+ # Event Handlers
+
+ find_btn.click(
+ fn=process_file_search_handler,
+ inputs=[project_dropdown, lang_dropdown, k_input, chatbot],
+ outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
+ )
+
+ confirm_go_btn.click(
+ fn=confirm_and_go_translate_handler,
+ inputs=[chatbot],
+ outputs=[chatbot, msg_input, status_display, control_tabs],
+ )
+
+ # Auto-save selections to state and update prompt preview
+ project_dropdown.change(
+ fn=update_project_selection,
+ inputs=[project_dropdown, chatbot],
+ outputs=[chatbot, msg_input, status_display],
+ )
+
+ # Update prompt preview when project changes
+ project_dropdown.change(
+ fn=update_prompt_preview,
+ inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
+ outputs=[prompt_preview],
+ )
+
+ lang_dropdown.change(
+ fn=update_language_selection,
+ inputs=[lang_dropdown, chatbot],
+ outputs=[chatbot, msg_input, status_display, translate_lang_display],
+ )
+
+ #
+ files_to_translate.change(
+ fn=lambda x: x,
+ inputs=[files_to_translate],
+ outputs=[file_to_translate_input],
+ )
+
+ # Button event handlers
+ start_translate_btn.click(
+ fn=start_translate_handler,
+ inputs=[chatbot, file_to_translate_input, additional_instruction, force_retranslate],
+ outputs=[chatbot, msg_input, status_display, control_tabs, start_translate_btn, confirm_upload_btn],
+ )
+
+ confirm_upload_btn.click(
+ fn=confirm_translation_and_go_upload_handler,
+ inputs=[chatbot],
+ outputs=[chatbot, msg_input, status_display, control_tabs],
+ )
+
+ # Configuration Save
+ save_config_btn.click(
+ fn=update_persistent_config,
+ inputs=[api_provider_radio, config_anthropic_key, config_aws_bearer_token_bedrock, config_github_token, config_github_owner, config_github_repo, reference_pr_url, chatbot],
+ outputs=[chatbot, msg_input, status_display],
+ )
+
+ # API Provider selection handler
+ api_provider_radio.change(
+ fn=lambda provider: (
+ gr.update(visible=True) if provider == "Anthropic" else gr.update(visible=False),
+ gr.update(visible=True) if provider == "AWS Bedrock" else gr.update(visible=False),
+ ),
+ inputs=[api_provider_radio],
+ outputs=[config_anthropic_key, config_aws_bearer_token_bedrock],
+ )
+
+ approve_btn.click(
+ fn=approve_handler,
+ inputs=[chatbot, config_github_owner, config_github_repo, reference_pr_url],
+ outputs=[chatbot, msg_input, status_display],
+ )
+
+ restart_btn.click(
+ fn=restart_handler,
+ inputs=[chatbot],
+ outputs=[chatbot, msg_input, status_display, control_tabs],
+ )
+
+ send_btn.click(
+ fn=send_message,
+ inputs=[msg_input, chatbot],
+ outputs=[chatbot, msg_input, status_display],
+ )
+
+ msg_input.submit(
+ fn=send_message,
+ inputs=[msg_input, chatbot],
+ outputs=[chatbot, msg_input, status_display],
+ )
+
+ # Update prompt preview when inputs change
+ for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
+ input_component.change(
+ fn=update_prompt_preview,
+ inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
+ outputs=[prompt_preview],
+ )
+
+root_path = os.environ.get("GRADIO_ROOT_PATH")
+demo.launch(root_path=root_path)
diff --git a/config.py b/config.py
index f439a30..eb6a00f 100644
--- a/config.py
+++ b/config.py
@@ -1,10 +1,10 @@
-# config.py
-
-# 기본 모델 목록
-default_models = [
- "Helsinki-NLP/opus-mt-ko-en",
- "Helsinki-NLP/opus-mt-tc-big-en-ko",
- "davidkim205/iris-7b",
- "maywell/Synatra-7B-v0.3-Translation",
- "CUSTOM_MODEL_INPUT" # Placeholder for custom model input
+# config.py
+
+# 기본 모델 목록
+default_models = [
+ "Helsinki-NLP/opus-mt-ko-en",
+ "Helsinki-NLP/opus-mt-tc-big-en-ko",
+ "davidkim205/iris-7b",
+ "maywell/Synatra-7B-v0.3-Translation",
+ "CUSTOM_MODEL_INPUT" # Placeholder for custom model input
]
\ No newline at end of file
diff --git a/example.env b/example.env
index dc8789c..60e6a2d 100644
--- a/example.env
+++ b/example.env
@@ -1,18 +1,18 @@
-ANTHROPIC_API_KEY=
-
-# GitHub PR Agent Configuration
-GITHUB_TOKEN=
-GITHUB_OWNER=
-GITHUB_REPO=
-REFERENCE_PR_URL=
-
-# Secrets for deployment to HF space
-HF_TOKEN=
-HF_USERNAME=
-HF_SPACE_NAME=
-
-# Secrets for logging to Github
-LOG_REPO=
-LOG_GITHUB_TOKEN=
-LOG_BRANCH=
-LOG_FILE_PATH=
+ANTHROPIC_API_KEY=
+
+# GitHub PR Agent Configuration
+GITHUB_TOKEN=
+GITHUB_OWNER=
+GITHUB_REPO=
+REFERENCE_PR_URL=
+
+# Secrets for deployment to HF space
+HF_TOKEN=
+HF_USERNAME=
+HF_SPACE_NAME=
+
+# Secrets for logging to Github
+LOG_REPO=
+LOG_GITHUB_TOKEN=
+LOG_BRANCH=
+LOG_FILE_PATH=
diff --git a/external/mcp-servers/hf-translation-docs-explorer/.gitattributes b/external/mcp-servers/hf-translation-docs-explorer/.gitattributes
new file mode 100644
index 0000000..a6344aa
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/external/mcp-servers/hf-translation-docs-explorer/README.md b/external/mcp-servers/hf-translation-docs-explorer/README.md
new file mode 100644
index 0000000..e650ca1
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/README.md
@@ -0,0 +1,13 @@
+---
+title: Translation File Explorer
+emoji: ⚡
+colorFrom: red
+colorTo: pink
+sdk: gradio
+sdk_version: 5.49.1
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/external/mcp-servers/hf-translation-docs-explorer/adapters.py b/external/mcp-servers/hf-translation-docs-explorer/adapters.py
new file mode 100644
index 0000000..e20f937
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/adapters.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+
+from typing import Dict, List
+
+import requests
+
+from setting import SETTINGS
+
+
+def _build_auth_headers() -> Dict[str, str]:
+ """
+ Build the Authorization header used for GitHub API calls.
+ - Precedence: SETTINGS.github_token → (fallback) environment variable GITHUB_TOKEN; returns {} if neither is set
+ """
+ token = SETTINGS.github_token
+ if not token:
+ # Look up the environment variable directly
+ import os
+ token = os.environ.get("GITHUB_TOKEN", "")
+
+ if not token:
+ return {}
+ return {"Authorization": f"token {token}"}
+
+
+def fetch_document_paths(api_url: str) -> List[str]:
+ """
+ Extract only the blob paths from a GitHub git/trees API response.
+
+ Parameters
+ ----------
+ api_url : str
+ e.g. https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1
+ """
+ response = requests.get(
+ api_url,
+ headers=_build_auth_headers(),
+ timeout=SETTINGS.request_timeout_seconds,
+ )
+
+ if response.status_code == 403 and "rate limit" in response.text.lower():  # surface this case with a clearer message
+ raise RuntimeError(
+ "GitHub API rate limit exceeded. Provide a GITHUB_TOKEN to continue."
+ )
+
+ response.raise_for_status()  # any other HTTP error status raises here
+ tree = response.json().get("tree", [])  # a missing "tree" key is treated as an empty listing
+ return [item["path"] for item in tree if item.get("type") == "blob"]
diff --git a/external/mcp-servers/hf-translation-docs-explorer/app.py b/external/mcp-servers/hf-translation-docs-explorer/app.py
new file mode 100644
index 0000000..2838495
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/app.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+import os
+import gradio as gr
+
+from services import get_available_projects, LANGUAGE_CHOICES
+from tools import list_projects, search_files, list_missing_files
+from setting import SETTINGS
+
+
+def ensure_mcp_support() -> None:
+ """Verify that `gradio[mcp]` is installed, then default GRADIO_MCP_SERVER and GRADIO_SHOW_API to 'true'."""
+ try:
+ import gradio.mcp # noqa: F401
+ except ImportError as exc:
+ raise RuntimeError("Install gradio[mcp] before launching this module.") from exc
+
+ os.environ.setdefault("GRADIO_MCP_SERVER", "true")  # setdefault keeps any value already set in the environment
+ os.environ.setdefault("GRADIO_SHOW_API", "true")
+
+
+def build_ui() -> gr.Blocks:
+ """Build and return a lightweight Gradio Blocks UI with one tab per tool: catalog, file search, missing docs."""
+ projects = get_available_projects()
+ languages = LANGUAGE_CHOICES[:]  # shallow copy of the shared choices list
+
+ with gr.Blocks(title=SETTINGS.ui_title) as demo:
+ gr.Markdown("# Translation MCP Server\nTry the MCP tools exposed below.")
+
+ # --- 1) Project catalog ---
+ with gr.Tab("Project catalog"):
+ catalog_output = gr.JSON(label="catalog")
+ gr.Button("Fetch").click(
+ fn=list_projects,
+ inputs=[],
+ outputs=catalog_output,
+ api_name="translation_project_catalog",
+ )
+
+ # --- 2) File search ---
+ with gr.Tab("File search"):
+ project_input = gr.Dropdown(
+ choices=projects,
+ label="Project",
+ value=projects[0] if projects else "",
+ )
+ lang_input = gr.Dropdown(
+ choices=languages,
+ label="Language",
+ value=SETTINGS.default_language,
+ )
+ limit_input = gr.Number(
+ label="Limit",
+ value=SETTINGS.default_limit,
+ minimum=1,
+ )
+ include_report = gr.Checkbox(
+ label="Include status report",
+ value=True,
+ )
+
+ search_output = gr.JSON(label="search result")
+ gr.Button("Search").click(
+ fn=search_files,
+ inputs=[project_input, lang_input, limit_input, include_report],
+ outputs=search_output,
+ api_name="translation_file_search",
+ )
+
+ # --- 3) Missing docs only ---
+ with gr.Tab("Missing docs"):
+ missing_project = gr.Dropdown(
+ choices=projects,
+ label="Project",
+ value=projects[0] if projects else "",
+ )
+ missing_lang = gr.Dropdown(
+ choices=languages,
+ label="Language",
+ value=SETTINGS.default_language,
+ )
+ missing_limit = gr.Number(
+ label="Limit",
+ value=max(SETTINGS.default_limit, 20),
+ minimum=1,
+ )
+
+ missing_output = gr.JSON(label="missing files")
+ gr.Button("List missing").click(
+ fn=list_missing_files,
+ inputs=[missing_project, missing_lang, missing_limit],
+ outputs=missing_output,
+ api_name="translation_missing_list",
+ )
+
+ return demo
+
+
+ensure_mcp_support()  # fail fast at import time if gradio[mcp] is missing
+
+ui = build_ui()
+
+ui.launch(
+ server_name="0.0.0.0",  # listen on all network interfaces
+ server_port=int(os.environ.get("PORT", "7860")),  # PORT env var overrides the default 7860
+ share=False,
+ show_api=True,
+ mcp_server=True
+)
diff --git a/external/mcp-servers/hf-translation-docs-explorer/configs/defaults.yaml b/external/mcp-servers/hf-translation-docs-explorer/configs/defaults.yaml
new file mode 100644
index 0000000..f3726fb
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/configs/defaults.yaml
@@ -0,0 +1,10 @@
+github:
+ token: "" # Default: empty; using the GITHUB_TOKEN environment variable is recommended
+ request_timeout_seconds: 30
+
+translation:
+ default_language: "ko" # Default target language
+ default_limit: 5 # Default number of search / missing-file results
+
+ui:
+ title: "Translation Docs Search MCP Server"
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs-explorer/pyproject.toml b/external/mcp-servers/hf-translation-docs-explorer/pyproject.toml
new file mode 100644
index 0000000..46dbed5
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/pyproject.toml
@@ -0,0 +1,13 @@
+[project]
+name = "translation-file-explorer-mcp"
+version = "0.1.0"
+requires-python = ">=3.10"
+dependencies = [
+ "gradio[mcp]>=5.33.0",
+ "pydantic>=2.7.0",
+ "requests>=2.31.0",
+ "pyyaml>=6.0.1",
+]
+
+[tool.ruff]
+line-length = 100
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs-explorer/requirements.txt b/external/mcp-servers/hf-translation-docs-explorer/requirements.txt
new file mode 100644
index 0000000..bc57906
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/requirements.txt
@@ -0,0 +1,11 @@
+gradio[mcp]==5.33.0
+requests
+pydantic
+langchain-anthropic
+python-dotenv
+langchain
+PyGithub
+langchain-core
+langchain-community
+boto3
+PyYAML
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs-explorer/services.py b/external/mcp-servers/hf-translation-docs-explorer/services.py
new file mode 100644
index 0000000..3c91181
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/services.py
@@ -0,0 +1,236 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Tuple
+
+from adapters import fetch_document_paths
+from setting import SETTINGS
+
+
+# Gradio / UI 에 노출할 언어 선택지
+LANGUAGE_CHOICES: List[str] = [
+ "ko",
+ "ja",
+ "zh",
+ "fr",
+ "de",
+]
+
+
+@dataclass(frozen=True)
+class Project:
+    """Immutable record with the metadata needed to look up a project's docs."""
+    slug: str
+    name: str
+    repo_url: str
+    docs_path: str
+    tree_api_url: str
+
+    @property
+    def repo_path(self) -> str:
+        """Return ``repo_url`` with the ``https://github.com/`` text removed."""
+        github_prefix = "https://github.com/"
+        return self.repo_url.replace(github_prefix, "")
+
+
+# 지원 프로젝트 정의
+PROJECTS: Dict[str, Project] = {
+ "transformers": Project(
+ slug="transformers",
+ name="Transformers",
+ repo_url="https://github.com/huggingface/transformers",
+ docs_path="docs/source",
+ tree_api_url=(
+ "https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1"
+ ),
+ ),
+ "smolagents": Project(
+ slug="smolagents",
+ name="SmolAgents",
+ repo_url="https://github.com/huggingface/smolagents",
+ docs_path="docs/source",
+ tree_api_url=(
+ "https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1"
+ ),
+ ),
+}
+
+
+def get_available_projects() -> List[str]:
+    """List every supported project slug in ascending alphabetical order."""
+    return sorted(PROJECTS)
+
+
+def _iter_english_docs(all_docs: Iterable[str], docs_root: str) -> Iterable[Path]:
+    """Yield, in input order, the Markdown paths located below ``<docs_root>/en``.
+
+    Entries that do not end in ``.md`` or that sit outside the English
+    directory are skipped.
+    """
+    english_root = Path(docs_root) / "en"
+
+    for raw_path in all_docs:
+        if not raw_path.endswith(".md"):
+            continue
+
+        candidate = Path(raw_path)
+        # Keep only files that live under en/.
+        if candidate.is_relative_to(english_root):
+            yield candidate
+
+
+def _compute_missing_translations(
+    project_key: str,
+    language: str,
+    limit: int,
+) -> Tuple[str, List[str], Project]:
+    """Compare a project's English docs with one target language.
+
+    Returns ``(report, missing, project)``: a Markdown summary table, at most
+    ``limit`` (minimum one) English paths (``.../en/...``) that have no
+    translated counterpart, and the ``Project`` metadata. The table counts
+    every missing file, not only the ones returned. An unknown
+    ``project_key`` raises ``KeyError``.
+    """
+    project = PROJECTS[project_key]
+    docs_root = Path(project.docs_path)
+
+    all_paths = fetch_document_paths(project.tree_api_url)
+    english_docs = list(_iter_english_docs(all_paths, project.docs_path))
+    english_total = len(english_docs)
+    docs_set = set(all_paths)  # constant-time membership checks
+
+    # Scan every English file: stopping at ``limit`` would cap the statistics
+    # in the report at ``limit`` as well.
+    all_missing = [
+        str(doc)
+        for doc in english_docs
+        if str(docs_root / language / doc.relative_to(docs_root / "en")) not in docs_set
+    ]
+    missing_count = len(all_missing)
+    percentage = (missing_count / english_total * 100) if english_total else 0.0
+
+    report = (
+        "| Item | Count | Percentage |\n"
+        "|------|-------|------------|\n"
+        f"| English docs | {english_total} | - |\n"
+        f"| Missing translations | {missing_count} | {percentage:.2f}% |"
+    )
+
+    return report, all_missing[: max(1, limit)], project
+
+
+def build_project_catalog(default: str | None) -> Dict[str, Any]:
+    """Build the ``translation.project_list`` payload; an unknown ``default`` becomes None."""
+    slugs = get_available_projects()
+    chosen = default if default in slugs else None
+    entries = [
+        {
+            "slug": slug,
+            "display_name": PROJECTS[slug].name,
+            "repo_url": PROJECTS[slug].repo_url,
+            "docs_path": PROJECTS[slug].docs_path,
+        }
+        for slug in slugs
+    ]
+    return {
+        "type": "translation.project_list",
+        "projects": entries,
+        "default_project": chosen,
+        "total_projects": len(slugs),
+    }
+
+
+def build_search_response(
+    project: str,
+    lang: str,
+    limit: int,
+    include_status_report: bool,
+) -> Dict[str, Any]:
+    """Build the ``translation.search.response`` payload used by MCP / Gradio.
+
+    Lists up to ``limit`` (at least one) English files lacking a ``lang``
+    translation; the Markdown status report is included only when requested.
+    """
+    project = project.strip()
+    lang = lang.strip()
+    limit = max(1, int(limit))
+
+    # The helper looks the project up itself (KeyError for an unknown slug).
+    status_report, candidate_paths, project_config = _compute_missing_translations(
+        project_key=project,
+        language=lang,
+        limit=limit,
+    )
+
+    repo_url = project_config.repo_url.rstrip("/")
+
+    return {
+        "type": "translation.search.response",
+        "request": {
+            "type": "translation.search.request",
+            "project": project,
+            "target_language": lang,
+            "limit": limit,
+            "include_status_report": include_status_report,
+        },
+        "files": [
+            {
+                "rank": index,
+                "path": path,
+                "repo_url": f"{repo_url}/blob/main/{path}",
+                "metadata": {
+                    "project": project,
+                    "target_language": lang,
+                    "docs_path": project_config.docs_path,
+                },
+            }
+            for index, path in enumerate(candidate_paths, start=1)
+        ],
+        "total_candidates": len(candidate_paths),
+        "status_report": status_report if include_status_report else None,
+    }
+
+
+def build_missing_list_response(
+    project: str,
+    lang: str,
+    limit: int,
+) -> Dict[str, Any]:
+    """Build the ``translation.missing_list`` payload with up to ``limit`` files."""
+    project_key = project.strip()
+    language = lang.strip()
+    limit_int = max(1, int(limit))
+
+    status_report, missing_paths, project_config = _compute_missing_translations(
+        project_key=project_key,
+        language=language,
+        limit=limit_int,
+    )
+    blob_base = project_config.repo_url.rstrip("/") + "/blob/main"
+
+    entries = []
+    for rank, doc_path in enumerate(missing_paths, start=1):
+        entries.append(
+            {
+                "rank": rank,
+                "path": doc_path,
+                "repo_url": f"{blob_base}/{doc_path}",
+                "metadata": {
+                    "project": project_key,
+                    "target_language": language,
+                    "docs_path": project_config.docs_path,
+                },
+            }
+        )
+
+    return {
+        "type": "translation.missing_list",
+        "project": project_key,
+        "target_language": language,
+        "limit": limit_int,
+        "count": len(entries),
+        "files": entries,
+        "status_report": status_report,  # can be dropped if callers do not need it
+    }
diff --git a/external/mcp-servers/hf-translation-docs-explorer/setting.py b/external/mcp-servers/hf-translation-docs-explorer/setting.py
new file mode 100644
index 0000000..8aac827
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/setting.py
@@ -0,0 +1,62 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict
+
+import os
+
+try:
+ import yaml # type: ignore
+except Exception:
+ yaml = None
+
+
+@dataclass
+class AppSettings:
+    github_token: str = ""  # GitHub token; empty string when none is configured
+    request_timeout_seconds: int = 30  # presumably the HTTP timeout for GitHub calls — confirm in adapters
+    default_language: str = "ko"  # default target language
+    default_limit: int = 5  # default number of search / missing files
+    ui_title: str = "Translation MCP Server"  # fallback when the YAML has no ui.title
+
+
+def _load_yaml(path: Path) -> Dict[str, Any]:
+    """Parse ``path`` as YAML and return its mapping, or ``{}`` when there is none."""
+    # A missing file or an unavailable PyYAML both mean "no configuration".
+    if not path.is_file() or yaml is None:
+        return {}
+    with path.open("r", encoding="utf-8") as handle:
+        parsed = yaml.safe_load(handle)
+    return parsed if isinstance(parsed, dict) else {}
+
+
+def load_settings(config_path: str = "configs/defaults.yaml") -> AppSettings:
+    """Build ``AppSettings`` from a YAML file, letting environment variables win.
+
+    The default path names the shipped ``configs/defaults.yaml``. The UI title
+    is read from YAML only; it has no environment override.
+    """
+    cfg = _load_yaml(Path(config_path))
+
+    def section(name: str) -> Dict[str, Any]:
+        # Treat a missing or non-mapping section as empty.
+        value = cfg.get(name)
+        return value if isinstance(value, dict) else {}
+
+    github_cfg, trans_cfg, ui_cfg = section("github"), section("translation"), section("ui")
+
+    # Precedence: environment variable > YAML value > hard-coded default.
+    return AppSettings(
+        github_token=os.getenv("GITHUB_TOKEN", github_cfg.get("token", "")),
+        request_timeout_seconds=int(
+            os.getenv("REQUEST_TIMEOUT_SECONDS", github_cfg.get("request_timeout_seconds", 30))
+        ),
+        default_language=os.getenv("DEFAULT_LANGUAGE", trans_cfg.get("default_language", "ko")),
+        default_limit=int(os.getenv("DEFAULT_LIMIT", trans_cfg.get("default_limit", 5))),
+        ui_title=ui_cfg.get("title", "Translation MCP Server"),
+    )
+
+
+# 전역 설정 인스턴스
+SETTINGS: AppSettings = load_settings()
diff --git a/external/mcp-servers/hf-translation-docs-explorer/tools.py b/external/mcp-servers/hf-translation-docs-explorer/tools.py
new file mode 100644
index 0000000..f53595d
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs-explorer/tools.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from services import (
+ build_project_catalog,
+ build_search_response,
+ build_missing_list_response,
+)
+
+
+def list_projects() -> Dict[str, Any]:
+    """
+    Handler for the 'translation_project_catalog' endpoint used by Gradio + MCP.
+    Takes no input and returns the full project catalog.
+    """
+    return build_project_catalog(default=None)
+
+
+def search_files(
+    project: str,
+    lang: str,
+    limit: float | int,
+    include_status_report: bool,
+) -> Dict[str, Any]:
+    """
+    Handler for the 'translation_file_search' endpoint used by Gradio + MCP.
+    """
+    return build_search_response(
+        project=project,
+        lang=lang,
+        limit=int(limit or 1),  # a falsy limit (None, 0) falls back to 1
+        include_status_report=bool(include_status_report),
+    )
+
+
+def list_missing_files(
+    project: str,
+    lang: str,
+    limit: float | int,
+) -> Dict[str, Any]:
+    """
+    Handler for the 'translation_missing_list' endpoint used by Gradio + MCP.
+    Returns only the list of missing files.
+    """
+    return build_missing_list_response(
+        project=project,
+        lang=lang,
+        limit=int(limit or 1),  # a falsy limit (None, 0) falls back to 1
+    )
diff --git a/external/mcp-servers/hf-translation-docs/MCP_TOOLS_SPEC.md b/external/mcp-servers/hf-translation-docs/MCP_TOOLS_SPEC.md
new file mode 100644
index 0000000..89210bd
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/MCP_TOOLS_SPEC.md
@@ -0,0 +1,302 @@
+# HuggingFace Translation Documentation MCP Server Tools Specification
+
+## Overview
+MCP Server for HuggingFace documentation translation with streamable HTTP implementation.
+Client performs translation while server handles file operations and workflow management.
+
+## Tools Definition
+
+### 1. search_translation_files
+**Description:** Search for files that need translation in a HuggingFace project
+
+**Input:**
+```json
+{
+ "project": "transformers", // Project name (transformers, diffusers, etc.)
+ "target_language": "ko", // Target language code (ko, zh, ja, etc.)
+ "max_files": 10 // Maximum number of files to return
+}
+```
+
+**Output:**
+```json
+{
+ "status": "success",
+ "data": {
+ "project": "transformers",
+ "target_language": "ko",
+ "total_found": 156,
+ "files": [
+ {
+ "path": "docs/source/en/model_doc/bert.md",
+ "size": 15420,
+ "last_modified": "2024-01-15T10:30:00Z",
+ "priority": "high",
+ "translation_status": "missing"
+ }
+ ],
+ "statistics": {
+ "missing": 145,
+ "outdated": 11,
+ "up_to_date": 0
+ }
+ }
+}
+```
+
+### 2. get_file_content
+**Description:** Retrieve original file content for translation
+
+**Input:**
+```json
+{
+ "project": "transformers",
+ "file_path": "docs/source/en/model_doc/bert.md",
+ "include_metadata": true
+}
+```
+
+**Output:**
+```json
+{
+ "status": "success",
+ "data": {
+ "file_path": "docs/source/en/model_doc/bert.md",
+ "content": "# BERT\n\n\n\nThis is the BERT model...",
+ "metadata": {
+ "encoding": "utf-8",
+ "size": 15420,
+ "last_modified": "2024-01-15T10:30:00Z",
+ "content_hash": "sha256:abc123..."
+ },
+ "processed_content": {
+ "to_translate": "BERT\n\nThis is the BERT model...",
+ "code_blocks_removed": 3,
+ "tables_removed": 1
+ }
+ }
+}
+```
+
+### 3. generate_translation_prompt
+**Description:** Generate optimized translation prompt for the content
+
+**Input:**
+```json
+{
+ "target_language": "ko",
+ "content": "# BERT\n\nThis is the BERT model...",
+ "additional_instruction": "Use technical terms consistently",
+ "project": "transformers",
+ "file_path": "docs/source/en/model_doc/bert.md"
+}
+```
+
+**Output:**
+```json
+{
+ "status": "success",
+ "data": {
+ "prompt": "You are a professional technical translator...",
+ "context": {
+ "target_language_name": "Korean",
+ "content_type": "technical_documentation",
+ "domain": "machine_learning",
+ "file_type": "model_documentation"
+ },
+ "guidelines": [
+ "Preserve markdown formatting",
+ "Keep technical terms in English where appropriate",
+ "Maintain code block integrity"
+ ]
+ }
+}
+```
+
+### 4. validate_translation
+**Description:** Validate translated content for quality and formatting
+
+**Input:**
+```json
+{
+ "original_content": "# BERT\n\nThis is the BERT model...",
+ "translated_content": "# BERT\n\n이것은 BERT 모델입니다...",
+ "target_language": "ko",
+ "file_path": "docs/source/en/model_doc/bert.md"
+}
+```
+
+**Output:**
+```json
+{
+ "status": "success",
+ "data": {
+ "is_valid": true,
+ "quality_score": 0.95,
+ "issues": [],
+ "suggestions": [
+ {
+ "type": "terminology",
+ "message": "Consider using '모델' consistently for 'model'",
+ "line": 3
+ }
+ ],
+ "formatting": {
+ "markdown_valid": true,
+ "links_preserved": true,
+ "code_blocks_intact": true
+ }
+ }
+}
+```
+
+### 5. save_translation_result
+**Description:** Save translation result to file system
+
+**Input:**
+```json
+{
+ "project": "transformers",
+ "original_file_path": "docs/source/en/model_doc/bert.md",
+ "translated_content": "# BERT\n\n이것은 BERT 모델입니다...",
+ "target_language": "ko",
+ "metadata": {
+ "translator": "claude-3.5-sonnet",
+ "translation_date": "2024-01-20T14:30:00Z",
+ "additional_instruction": "Use technical terms consistently"
+ }
+}
+```
+
+**Output:**
+```json
+{
+ "status": "success",
+ "data": {
+ "saved_path": "/path/to/translation_result/docs/source/ko/model_doc/bert.md",
+ "backup_path": "/path/to/backup/bert_20240120_143000.md",
+ "file_size": 16840,
+ "checksum": "sha256:def456...",
+ "created_directories": ["docs/source/ko/model_doc"]
+ }
+}
+```
+
+### 6. create_github_pr
+**Description:** Create GitHub Pull Request for translation
+
+**Input:**
+```json
+{
+ "github_config": {
+ "token": "ghp_...",
+ "owner": "user-fork",
+ "repo_name": "transformers",
+ "reference_pr_url": "https://github.com/huggingface/transformers/pull/12345"
+ },
+ "translation_data": {
+ "file_path": "docs/source/en/model_doc/bert.md",
+ "target_language": "ko",
+ "translated_content": "# BERT\n\n이것은 BERT 모델입니다...",
+ "en_title": "BERT"
+ },
+ "project": "transformers"
+}
+```
+
+**Output:**
+```json
+{
+ "status": "success",
+ "data": {
+ "pr_url": "https://github.com/user-fork/transformers/pull/123",
+ "pr_number": 123,
+ "branch_name": "add-korean-bert-docs",
+ "commit_hash": "abc1234567890",
+ "files_changed": [
+ "docs/source/ko/model_doc/bert.md",
+ "docs/source/ko/_toctree.yml"
+ ],
+ "pr_details": {
+ "title": "Add Korean translation for BERT documentation",
+ "body": "This PR adds Korean translation for BERT model documentation...",
+ "reviewers": []
+ }
+ }
+}
+```
+
+### 7. get_project_config
+**Description:** Get project-specific configuration and settings
+
+**Input:**
+```json
+{
+ "project": "transformers"
+}
+```
+
+**Output:**
+```json
+{
+ "status": "success",
+ "data": {
+ "project": "transformers",
+ "repo_url": "https://github.com/huggingface/transformers",
+ "docs_path": "docs/source",
+ "supported_languages": ["ko", "zh", "ja", "es", "fr"],
+ "reference_pr_url": "https://github.com/huggingface/transformers/pull/12345",
+ "translation_guidelines": {
+ "preserve_code_blocks": true,
+ "keep_english_terms": ["API", "token", "embedding"],
+ "style_guide_url": "https://..."
+ }
+ }
+}
+```
+
+## Streaming Implementation
+
+All tools support streaming responses for better UX:
+
+```http
+GET /tools/{tool_name}
+Content-Type: application/json
+Accept: text/event-stream
+
+Response:
+data: {"type": "progress", "message": "Searching files...", "progress": 0.3}
+
+data: {"type": "partial", "data": {"files": [...partial_results...]}}
+
+data: {"type": "complete", "data": {...final_result...}}
+```
+
+## Error Handling
+
+Standard error response format:
+```json
+{
+ "status": "error",
+ "error": {
+ "code": "FILE_NOT_FOUND",
+ "message": "The specified file could not be found",
+ "details": {
+ "file_path": "docs/source/en/model_doc/bert.md",
+ "project": "transformers"
+ }
+ }
+}
+```
+
+## Workflow Integration
+
+1. Client calls `search_translation_files` → gets file list
+2. Client calls `get_file_content` → gets original content
+3. Client calls `generate_translation_prompt` → gets optimized prompt
+4. **Client performs translation using LLM** ← Key difference
+5. Client calls `validate_translation` → checks quality
+6. Client calls `save_translation_result` → saves result
+7. Client calls `create_github_pr` → creates PR
+
+This architecture separates concerns: MCP server handles file operations, client handles translation.
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/adapters.py b/external/mcp-servers/hf-translation-docs/adapters.py
new file mode 100644
index 0000000..9b5012d
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/adapters.py
@@ -0,0 +1,69 @@
+"""External API adapters for file content retrieval."""
+
+import re
+import string
+import requests
+from project_config import get_project_config
+from prompt_glossary import PROMPT_WITH_GLOSSARY
+
+
+def get_content(filepath: str, project: str = "transformers") -> str:
+    """Download a file's text from the ``main`` branch via raw.githubusercontent.com.
+
+    Raises ``ValueError`` when ``filepath`` is empty or the response is not HTTP 200.
+    """
+    if not filepath:
+        raise ValueError("No files selected for translation.")
+
+    config = get_project_config(project)
+    repo_path = config.repo_url.replace("https://github.com/", "")  # e.g. "huggingface/transformers"
+    url = f"https://raw.githubusercontent.com/{repo_path}/main/{filepath}"
+    # Bound the wait: without a timeout ``requests`` can block indefinitely.
+    response = requests.get(url, timeout=30)
+    if response.status_code != 200:
+        raise ValueError(f"Failed to retrieve content from the URL: {url}")
+    return response.text
+
+
+def preprocess_content(content: str) -> str:
+    """Drop the text before the first ``#`` (license header) and squeeze blank lines."""
+    # ``find`` returns -1 when there is no ``#``; slicing from -1 would keep only the
+    # last character, so clamp to 0 and leave such content whole.
+    start = max(content.find("#"), 0)
+    # Collapse any run of two or more newlines into exactly one blank line.
+    return re.sub(r"\n\n+", "\n\n", content[start:])
+
+
+def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
+    """Compose the translation prompt: instructions, fenced content, glossary, extras."""
+    # Adjacent string literals concatenate verbatim, so every fragment must end
+    # with its own separator (a space or a sentence break).
+    base_prompt = string.Template(
+        "What do these sentences about Hugging Face Transformers "
+        "(a machine learning library) mean in $language? "
+        "Please do not translate the word after a 🤗 emoji "
+        "as it is a product name. Output the complete markdown file, with prose "
+        "translated and all other content intact. "
+        "No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
+    ).safe_substitute(language=language)
+    base_prompt += "\n\n```md"
+
+    full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
+    if additional_instruction.strip():
+        full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"
+    return full_prompt
+
+
+def get_language_name(language_code: str) -> str:
+    """Map a language code to its English name; unknown codes are title-cased."""
+    fallback = language_code.title()
+    return {
+        "ko": "Korean",
+        "zh": "Chinese",
+        "ja": "Japanese",
+        "es": "Spanish",
+        "fr": "French",
+        "de": "German",
+        "it": "Italian",
+        "pt": "Portuguese"
+    }.get(language_code, fallback)
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/app.py b/external/mcp-servers/hf-translation-docs/app.py
new file mode 100644
index 0000000..893af25
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/app.py
@@ -0,0 +1,207 @@
+"""HuggingFace Translation Documentation MCP Server."""
+
+from __future__ import annotations
+
+import os
+import gradio as gr
+
+from services import get_supported_projects
+from tools import get_project_config, search_translation_files, get_file_content, generate_translation_prompt, validate_translation, save_translation_result
+from setting import SETTINGS, LANGUAGE_CHOICES
+
+
+def ensure_mcp_support() -> None:
+    """Fail fast without ``gradio[mcp]``, then default the Gradio MCP env flags on."""
+    try:
+        import gradio.mcp  # noqa: F401
+    except ImportError as exc:
+        raise RuntimeError("Install gradio[mcp] before launching this module.") from exc
+    # ``setdefault`` keeps any value that is already present in the environment.
+    os.environ.setdefault("GRADIO_MCP_SERVER", "true")
+    os.environ.setdefault("GRADIO_SHOW_API", "true")
+
+
+def build_ui() -> gr.Blocks:
+    """Create a lightweight Gradio Blocks UI with one tab per MCP tool, for manual testing."""
+    projects = get_supported_projects()
+    languages = [lang[1] for lang in LANGUAGE_CHOICES]  # second item of each entry; presumably the language code — confirm in setting.py
+
+    with gr.Blocks(title=SETTINGS.ui_title) as demo:
+        gr.Markdown("# HuggingFace Translation Documentation MCP Server\nTest the MCP tools below.")
+
+        # --- 1) Get Project Config: get_project_config(project) ---
+        with gr.Tab("Project Config"):
+            project_input = gr.Dropdown(
+                choices=projects,
+                label="Project",
+                value=SETTINGS.default_project,
+            )
+            config_output = gr.JSON(label="Project Configuration")
+            gr.Button("Get Config").click(
+                fn=get_project_config,
+                inputs=[project_input],
+                outputs=config_output,
+                api_name="translation_get_project_config",
+            )
+
+        # --- 2) Search Translation Files: search_translation_files(project, language, limit) ---
+        with gr.Tab("Search Files"):
+            search_project = gr.Dropdown(
+                choices=projects,
+                label="Project",
+                value=SETTINGS.default_project,
+            )
+            search_language = gr.Dropdown(
+                choices=languages,
+                label="Target Language",
+                value=SETTINGS.default_language,
+            )
+            search_limit = gr.Number(
+                label="Max Files",
+                value=SETTINGS.default_limit,
+                minimum=1,
+                maximum=100,
+            )
+            search_output = gr.JSON(label="Search Results")
+            gr.Button("Search Files").click(
+                fn=search_translation_files,
+                inputs=[search_project, search_language, search_limit],
+                outputs=search_output,
+                api_name="translation_search_files",
+            )
+
+        # --- 3) Get File Content: get_file_content(project, file_path, include_metadata) ---
+        with gr.Tab("Get File Content"):
+            content_project = gr.Dropdown(
+                choices=projects,
+                label="Project",
+                value=SETTINGS.default_project,
+            )
+            content_file_path = gr.Textbox(
+                label="File Path",
+                placeholder="docs/source/en/model_doc/bert.md",
+            )
+            content_include_metadata = gr.Checkbox(
+                label="Include Metadata",
+                value=True,
+            )
+            content_output = gr.JSON(label="File Content")
+            gr.Button("Get Content").click(
+                fn=get_file_content,
+                inputs=[content_project, content_file_path, content_include_metadata],
+                outputs=content_output,
+                api_name="translation_get_file_content",
+            )
+
+        # --- 4) Generate Translation Prompt: generate_translation_prompt(language, content, extra, project, path) ---
+        with gr.Tab("Generate Prompt"):
+            prompt_target_language = gr.Dropdown(
+                choices=languages,
+                label="Target Language",
+                value=SETTINGS.default_language,
+            )
+            prompt_content = gr.Textbox(
+                label="Content to Translate",
+                placeholder="Enter markdown content...",
+                lines=5,
+            )
+            prompt_additional = gr.Textbox(
+                label="Additional Instructions",
+                placeholder="Optional additional instructions...",
+                lines=2,
+            )
+            prompt_project = gr.Dropdown(
+                choices=projects,
+                label="Project",
+                value=SETTINGS.default_project,
+            )
+            prompt_file_path = gr.Textbox(
+                label="File Path (optional)",
+                placeholder="docs/source/en/model_doc/bert.md",
+            )
+            prompt_output = gr.JSON(label="Translation Prompt")
+            gr.Button("Generate Prompt").click(
+                fn=generate_translation_prompt,
+                inputs=[prompt_target_language, prompt_content, prompt_additional, prompt_project, prompt_file_path],
+                outputs=prompt_output,
+                api_name="translation_generate_prompt",
+            )
+
+        # --- 5) Validate Translation: validate_translation(original, translated, language, path) ---
+        with gr.Tab("Validate Translation"):
+            validate_original = gr.Textbox(
+                label="Original Content",
+                placeholder="Enter original content...",
+                lines=5,
+            )
+            validate_translated = gr.Textbox(
+                label="Translated Content",
+                placeholder="Enter translated content...",
+                lines=5,
+            )
+            validate_language = gr.Dropdown(
+                choices=languages,
+                label="Target Language",
+                value=SETTINGS.default_language,
+            )
+            validate_file_path = gr.Textbox(
+                label="File Path (optional)",
+                placeholder="docs/source/en/model_doc/bert.md",
+            )
+            validate_output = gr.JSON(label="Validation Results")
+            gr.Button("Validate").click(
+                fn=validate_translation,
+                inputs=[validate_original, validate_translated, validate_language, validate_file_path],
+                outputs=validate_output,
+                api_name="translation_validate",
+            )
+
+        # --- 6) Save Translation Result: save_translation_result(project, original_path, content, language) ---
+        with gr.Tab("Save Result"):
+            save_project = gr.Dropdown(
+                choices=projects,
+                label="Project",
+                value=SETTINGS.default_project,
+            )
+            save_original_path = gr.Textbox(
+                label="Original File Path",
+                placeholder="docs/source/en/model_doc/bert.md",
+            )
+            save_content = gr.Textbox(
+                label="Translated Content",
+                placeholder="Enter translated content to save...",
+                lines=8,
+            )
+            save_language = gr.Dropdown(
+                choices=languages,
+                label="Target Language",
+                value=SETTINGS.default_language,
+            )
+            save_output = gr.JSON(label="Save Results")
+            gr.Button("Save Translation").click(
+                fn=save_translation_result,
+                inputs=[save_project, save_original_path, save_content, save_language],
+                outputs=save_output,
+                api_name="translation_save_result",
+            )
+
+    return demo
+
+
+def main():
+    """Verify MCP support, build the UI, and serve it on ``PORT`` (default 7860)."""
+    ensure_mcp_support()
+    demo = build_ui()
+    port = int(os.environ.get("PORT", "7860"))
+    # Launch with the MCP server enabled, listening on every interface.
+    launch_options = {
+        "server_name": "0.0.0.0",
+        "server_port": port,
+        "share": False,
+        "mcp_server": True,
+    }
+    demo.launch(**launch_options)
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/configs/default.yaml b/external/mcp-servers/hf-translation-docs/configs/default.yaml
new file mode 100644
index 0000000..4badb63
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/configs/default.yaml
@@ -0,0 +1 @@
+# Configuration values
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/project_config.py b/external/mcp-servers/hf-translation-docs/project_config.py
new file mode 100644
index 0000000..24b646e
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/project_config.py
@@ -0,0 +1,48 @@
+"""Project configuration for different HuggingFace repositories."""
+
+from dataclasses import dataclass
+from typing import Dict
+
+
+@dataclass
+class ProjectConfig:
+    """Configuration for a specific HuggingFace project."""
+    name: str  # display name, e.g. "Transformers"
+    repo_url: str  # https://github.com/<owner>/<repo>
+    api_url: str  # GitHub git-trees API URL for the main branch (recursive=1)
+    docs_path: str  # docs root inside the repository, e.g. "docs/source"
+    github_issues: Dict[str, str]  # language -> issue_id
+    reference_pr_url: str  # URL of a pull request in the project's repository
+
+
+# Project configurations, keyed by the project key accepted by get_project_config()
+PROJECTS = {
+    "transformers": ProjectConfig(
+        name="Transformers",
+        repo_url="https://github.com/huggingface/transformers",
+        api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
+        docs_path="docs/source",
+        github_issues={"ko": "20179"},
+        reference_pr_url="https://github.com/huggingface/transformers/pull/24968"
+    ),
+    "smolagents": ProjectConfig(
+        name="SmolAgents",
+        repo_url="https://github.com/huggingface/smolagents",
+        api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
+        docs_path="docs/source",
+        github_issues={"ko": "20179"},  # NOTE(review): same issue id as transformers — confirm it applies to smolagents
+        reference_pr_url="https://github.com/huggingface/smolagents/pull/1581"
+    )
+}
+
+
+def get_project_config(project_key: str) -> ProjectConfig:
+    """Look up a project by key, raising ``ValueError`` that lists the valid keys."""
+    if project_key in PROJECTS:
+        return PROJECTS[project_key]
+    raise ValueError(f"Unknown project: {project_key}. Available: {list(PROJECTS.keys())}")
+
+
+def get_available_projects() -> list[str]:
+    """Return the project keys in the order they are declared in ``PROJECTS``."""
+    return list(PROJECTS)
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/prompt_glossary.py b/external/mcp-servers/hf-translation-docs/prompt_glossary.py
new file mode 100644
index 0000000..261f30f
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/prompt_glossary.py
@@ -0,0 +1,126 @@
+PROMPT_WITH_GLOSSARY = """
+You have a glossary of terms with their Korean translations. When translating a sentence, you need to check if any of the words in the sentence are in the glossary, and if so, translate them according to the provided Korean terms. Here is the glossary:
+
+🔹 Glossary (English → Korean):
+- revision: 개정
+- method: 메소드
+- secrets: 비밀값
+- search helper: 검색 헬퍼
+- logging level: 로그 레벨
+- workflow: 워크플로우
+- corner case: 코너 케이스
+- tokenization: 토큰화
+- architecture: 아키텍처
+- attention mask: 어텐션 마스크
+- backbone: 백본
+- argmax: argmax
+- beam search: 빔 서치
+- clustering: 군집화
+- configuration: 구성
+- context: 문맥
+- cross entropy: 교차 엔트로피
+- cross-attention: 크로스 어텐션
+- dictionary: 딕셔너리
+- entry: 엔트리
+- few shot: 퓨샷
+- flatten: 평탄화
+- ground truth: 정답
+- head: 헤드
+- helper function: 헬퍼 함수
+- image captioning: 이미지 캡셔닝
+- image patch: 이미지 패치
+- inference: 추론
+- instance: 인스턴스
+- Instantiate: 인스턴스화
+- knowledge distillation: 지식 증류
+- labels: 레이블
+- large language models (LLM): 대규모 언어 모델
+- layer: 레이어
+- learning rate scheduler: Learning Rate Scheduler
+- localization: 로컬리제이션
+- log mel-filter bank: 로그 멜 필터 뱅크
+- look-up table: 룩업 테이블
+- loss function: 손실 함수
+- machine learning: 머신 러닝
+- mapping: 매핑
+- masked language modeling (MLM): 마스크드 언어 모델
+- malware: 악성코드
+- metric: 지표
+- mixed precision: 혼합 정밀도
+- modality: 모달리티
+- monolingual model: 단일 언어 모델
+- multi gpu: 다중 GPU
+- multilingual model: 다국어 모델
+- parsing: 파싱
+- perplexity (PPL): 펄플렉서티(Perplexity)
+- pipeline: 파이프라인
+- pixel values: 픽셀 값
+- pooling: 풀링
+- position IDs: 위치 ID
+- preprocessing: 전처리
+- prompt: 프롬프트
+- pythonic: 파이써닉
+- query: 쿼리
+- question answering: 질의 응답
+- raw audio waveform: 원시 오디오 파형
+- recurrent neural network (RNN): 순환 신경망
+- accelerator: 가속기
+- Accelerate: Accelerate
+- architecture: 아키텍처
+- arguments: 인수
+- attention mask: 어텐션 마스크
+- augmentation: 증강
+- autoencoding models: 오토인코딩 모델
+- autoregressive models: 자기회귀 모델
+- backward: 역방향
+- bounding box: 바운딩 박스
+- causal language modeling: 인과적 언어 모델링(causal language modeling)
+- channel: 채널
+- checkpoint: 체크포인트(checkpoint)
+- chunk: 묶음
+- computer vision: 컴퓨터 비전
+- convolution: 합성곱
+- crop: 자르기
+- custom: 사용자 정의
+- customize: 맞춤 설정하다
+- data collator: 데이터 콜레이터
+- dataset: 데이터 세트
+- decoder input IDs: 디코더 입력 ID
+- decoder models: 디코더 모델
+- deep learning (DL): 딥러닝
+- directory: 디렉터리
+- distributed training: 분산 학습
+- downstream: 다운스트림
+- encoder models: 인코더 모델
+- entity: 개체
+- epoch: 에폭
+- evaluation method: 평가 방법
+- feature extraction: 특성 추출
+- feature matrix: 특성 행렬(feature matrix)
+- fine-tuning: 미세 조정
+- finetuned models: 미세 조정 모델
+- hidden state: 은닉 상태
+- hyperparameter: 하이퍼파라미터
+- learning: 학습
+- load: 가져오다
+- method: 메소드
+- optimizer: 옵티마이저
+- pad (padding): 패드 (패딩)
+- parameter: 매개변수
+- pretrained model: 사전훈련된 모델
+- separator (* [SEP]를 부르는 이름): 분할 토큰
+- sequence: 시퀀스
+- silent error: 조용한 오류
+- token: 토큰
+- tokenizer: 토크나이저
+- training: 훈련
+- workflow: 워크플로우
+
+📌 Instructions:
+1. Whenever a source term from the glossary appears **in any form** (full match or partial match within a larger phrase), **replace it with the exact Korean translation** from the glossary, keeping the rest of the phrase in Korean.
+ - Example: “Attention Interface” → “어텐션 인터페이스”
+ - Example: “Architecture details” → “아키텍처 상세”
+2. Non-glossary words should be translated naturally, respecting context and technical nuance.
+
+Please revise the translated sentences accordingly using the terms provided in this glossary.
+"""
diff --git a/external/mcp-servers/hf-translation-docs/retriever.py b/external/mcp-servers/hf-translation-docs/retriever.py
new file mode 100644
index 0000000..7ef0448
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/retriever.py
@@ -0,0 +1,237 @@
+"""File retrieval and analysis for HuggingFace documentation."""
+
+import os
+import re
+from pathlib import Path
+from typing import Tuple, List, Dict, Any
+import requests
+
+from project_config import get_project_config
+
+
+def get_github_repo_files(project: str = "transformers") -> List[str]:
+ """Get github repo files."""
+ config = get_project_config(project)
+
+ # Add GitHub token if available to avoid rate limiting (optional)
+ headers = {}
+ github_token = os.environ.get("GITHUB_TOKEN")
+ if github_token:
+ headers["Authorization"] = f"token {github_token}"
+
+ response = requests.get(config.api_url, headers=headers)
+
+ # Handle rate limit with helpful message
+ if response.status_code == 403 and "rate limit" in response.text.lower():
+ raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
+
+ data = response.json()
+ all_items = data.get("tree", [])
+
+ file_paths = [
+ item["path"]
+ for item in all_items
+ if item["type"] == "blob" and (item["path"].startswith("docs"))
+ ]
+ return file_paths
+
+
+def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: List[str] = None) -> Tuple[List[str], List[str]]:
+ """Get open PR in the github issue, filtered by title containing '[i18n-KO]'."""
+ config = get_project_config(project)
+ issue_id = config.github_issues.get(lang)
+
+ # For projects without GitHub issue tracking, still search for PRs
+ if not issue_id:
+ raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
+
+ # Require all_files parameter
+ if all_files is None:
+ raise ValueError("Repository file list must be provided")
+
+ headers = {
+ "Accept": "application/vnd.github+json",
+ }
+
+ # Add GitHub token if available to avoid rate limiting (optional)
+ github_token = os.environ.get("GITHUB_TOKEN")
+ if github_token:
+ headers["Authorization"] = f"token {github_token}"
+
+ all_open_prs = []
+ page = 1
+ per_page = 100 # Maximum allowed by GitHub API
+
+ while True:
+ repo_path = config.repo_url.replace("https://github.com/", "")
+ url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
+ response = requests.get(url, headers=headers)
+
+ if response.status_code == 403 and "rate limit" in response.text.lower():
+ raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
+ elif response.status_code != 200:
+ raise Exception(f"GitHub API error: {response.status_code} {response.text}")
+
+ page_prs = response.json()
+ if not page_prs: # No more PRs
+ break
+
+ all_open_prs.extend(page_prs)
+ page += 1
+
+ # Break if we got less than per_page results (last page)
+ if len(page_prs) < per_page:
+ break
+
+ filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
+
+ # Pattern to match filenames after "Translated" keyword
+ pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")
+
+ def find_original_file_path(filename_from_title, all_files):
+ """Find the exact file path from repo files by matching filename"""
+ if not filename_from_title:
+ return None
+
+ # Remove .md extension for matching
+ base_name = filename_from_title.replace('.md', '')
+
+ # Find files that end with this base name
+ matching_files = [f for f in all_files if f.endswith(f"{base_name}.md")]
+
+ # Prefer files in English docs
+ en_files = [f for f in matching_files if "/en/" in f]
+ if en_files:
+ return en_files[0]
+ elif matching_files:
+ return matching_files[0]
+
+ return None
+
+ docs_in_progress = []
+ pr_info_list = []
+
+ for pr in filtered_prs:
+ title = pr["title"]
+ pr_url = pr["html_url"]
+
+ # Extract the filename from the title
+ match = pattern.search(title)
+ if match:
+ # Get the filename (from either backticks or without)
+ filename_from_title = match.group(1) if match.group(1) else match.group(2)
+
+ # Find the actual file path in the repository
+ original_file_path = find_original_file_path(filename_from_title, all_files)
+
+ if original_file_path:
+ docs_in_progress.append(original_file_path)
+ pr_info_list.append(pr_url)
+
+ return docs_in_progress, pr_info_list
+
+
+# Simplified translation analysis classes
+class LanguageInfo:
+ def __init__(self, code: str, name: str):
+ self.code = code
+ self.value = code
+ self.name = name
+
+# Simple language lookup
+def get_language_info(lang_code: str) -> LanguageInfo:
+ """Get language info by code."""
+ languages = {
+ "ko": LanguageInfo("ko", "Korean"),
+ "zh": LanguageInfo("zh", "Chinese"),
+ "ja": LanguageInfo("ja", "Japanese"),
+ "es": LanguageInfo("es", "Spanish"),
+ "fr": LanguageInfo("fr", "French")
+ }
+ return languages.get(lang_code, languages["ko"])
+
+
+class TranslationDoc:
+ def __init__(self, translation_lang: str, original_file: str, translation_file: str, translation_exists: bool):
+ self.translation_lang = translation_lang
+ self.original_file = original_file
+ self.translation_file = translation_file
+ self.translation_exists = translation_exists
+
+
+class Summary:
+ def __init__(self, lang: str):
+ self.lang = lang
+ self.files: List[TranslationDoc] = []
+
+ def append_file(self, doc: TranslationDoc):
+ self.files.append(doc)
+
+ @property
+ def files_analyzed(self) -> int:
+ return len(self.files)
+
+ @property
+ def files_missing_translation(self) -> int:
+ return len([f for f in self.files if not f.translation_exists])
+
+ @property
+ def percentage_missing_translation(self) -> float:
+ if self.files_analyzed == 0:
+ return 0.0
+ return (self.files_missing_translation / self.files_analyzed) * 100
+
+ def first_missing_translation_files(self, limit: int) -> List[TranslationDoc]:
+ missing = [f for f in self.files if not f.translation_exists]
+ return missing[:limit]
+
+
+def retrieve(summary: Summary, table_size: int = 10) -> Tuple[str, List[str]]:
+ """Retrieve missing docs"""
+
+ report = f"""
+| Item | Count | Percentage |
+|------|-------|------------|
+| 📂 HuggingFaces docs | {summary.files_analyzed} | - |
+| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
+"""
+ print(report)
+ first_missing_docs = []
+ for file in summary.first_missing_translation_files(table_size):
+ first_missing_docs.append(file.original_file)
+
+ print(first_missing_docs)
+ return report, first_missing_docs
+
+
+def report(project: str, target_lang: str, top_k: int = 1, docs_file: List[str] = None) -> Tuple[str, List[str]]:
+ """Generate a report for the translated docs"""
+ if docs_file is None:
+ raise ValueError("Repository file list must be provided")
+
+ base_docs_path = Path("docs/source")
+ en_docs_path = Path("docs/source/en")
+
+ lang = get_language_info(target_lang)
+ summary = Summary(lang=lang.value)
+
+ for file in docs_file:
+ if file.endswith(".md"):
+ try:
+ file_relative_path = Path(file).relative_to(en_docs_path)
+ except ValueError:
+ continue
+
+ translated_path = os.path.join(
+ base_docs_path, lang.value, file_relative_path
+ )
+ translation_exists = translated_path in docs_file
+
+ doc = TranslationDoc(
+ translation_lang=lang.value,
+ original_file=file,
+ translation_file=translated_path,
+ translation_exists=translation_exists,
+ )
+ summary.append_file(doc)
+ return retrieve(summary, top_k)
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/services.py b/external/mcp-servers/hf-translation-docs/services.py
new file mode 100644
index 0000000..8e9b65f
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/services.py
@@ -0,0 +1,406 @@
+"""Business logic for HuggingFace translation documentation."""
+
+from pathlib import Path
+from typing import Dict, Any
+
+from project_config import get_project_config as get_base_config, get_available_projects
+from retriever import report, get_github_repo_files, get_github_issue_open_pr
+from adapters import get_content, preprocess_content, get_full_prompt, get_language_name
+from datetime import datetime
+import hashlib
+
+
+def get_project_configuration(project: str) -> Dict[str, Any]:
+ """
+ Get project-specific configuration and settings.
+ """
+ if project not in get_available_projects():
+ raise ValueError(f"Unknown project: {project}. Available: {get_available_projects()}")
+
+ config = get_base_config(project)
+
+ return {
+ "project": project,
+ "repo_url": config.repo_url,
+ "docs_path": config.docs_path,
+ "supported_languages": ["ko", "zh", "ja", "es", "fr", "de", "it", "pt"],
+ "reference_pr_url": config.reference_pr_url,
+ "translation_guidelines": {
+ "preserve_code_blocks": True,
+ "keep_english_terms": ["API", "token", "embedding", "transformer", "model"],
+ "style_guide_url": f"{config.repo_url}/blob/main/docs/TRANSLATION_GUIDE.md"
+ }
+ }
+
+
+def get_supported_projects() -> list[str]:
+ """
+ Return the project names provided by get_available_projects().
+ """
+ return get_available_projects()
+
+
+def search_translation_files_data(project: str, target_language: str, max_files: int = 10) -> Dict[str, Any]:
+ """
+ Search for files that need translation in a HuggingFace project.
+ """
+ try:
+ # Get all repository files
+ all_repo_files = get_github_repo_files(project)
+
+ # Get translation status report and file list
+ status_report, missing_files = report(project, target_language, max_files * 2, all_repo_files)
+
+ # Get files already in progress (if available)
+ try:
+ docs_in_progress, pr_info_list = get_github_issue_open_pr(project, target_language, all_repo_files)
+ # Filter out files already in progress
+ available_files = [f for f in missing_files if f not in docs_in_progress]
+ except (ValueError, Exception):
+ # If no issue tracking or error, use all missing files
+ available_files = missing_files
+ docs_in_progress = []
+
+ # Limit to max_files
+ files_to_return = available_files[:max_files]
+
+ # Build file list with metadata
+ files_data = []
+ for file_path in files_to_return:
+ # Estimate file size and priority (simplified)
+ file_size = len(file_path) * 100 # Rough estimate
+ priority = "high" if "model_doc" in file_path else "medium"
+
+ files_data.append({
+ "path": file_path,
+ "size": file_size,
+ "last_modified": datetime.now().isoformat() + "Z",
+ "priority": priority,
+ "translation_status": "missing"
+ })
+
+ return {
+ "project": project,
+ "target_language": target_language,
+ "total_found": len(missing_files),
+ "files": files_data,
+ "statistics": {
+ "missing": len(missing_files),
+ "in_progress": len(docs_in_progress) if 'docs_in_progress' in locals() else 0,
+ "up_to_date": 0
+ },
+ "status_report": status_report
+ }
+
+ except Exception as e:
+ raise ValueError(f"Failed to search translation files: {str(e)}")
+
+
+def get_file_content_data(project: str, file_path: str, include_metadata: bool = True) -> Dict[str, Any]:
+ """
+ Retrieve original file content for translation.
+ """
+ try:
+ # Get raw file content
+ content = get_content(file_path, project)
+
+ # Generate metadata
+ metadata = {}
+ if include_metadata:
+ content_bytes = content.encode('utf-8')
+ metadata = {
+ "encoding": "utf-8",
+ "size": len(content_bytes),
+ "last_modified": datetime.now().isoformat() + "Z",
+ "content_hash": f"sha256:{hashlib.sha256(content_bytes).hexdigest()[:12]}..."
+ }
+
+ # Process content for translation
+ processed_content = preprocess_content(content)
+
+ # Count removed elements (simplified)
+ original_code_blocks = content.count('```')
+ processed_code_blocks = processed_content.count('```')
+ code_blocks_removed = max(0, original_code_blocks - processed_code_blocks)
+
+ original_tables = content.count('|')
+ processed_tables = processed_content.count('|')
+ tables_removed = max(0, (original_tables - processed_tables) // 4) # Rough estimate
+
+ return {
+ "file_path": file_path,
+ "content": content,
+ "metadata": metadata,
+ "processed_content": {
+ "to_translate": processed_content,
+ "code_blocks_removed": code_blocks_removed,
+ "tables_removed": tables_removed
+ }
+ }
+
+ except Exception as e:
+ raise ValueError(f"Failed to get file content: {str(e)}")
+
+
+def generate_translation_prompt_data(
+ target_language: str,
+ content: str,
+ additional_instruction: str = "",
+ project: str = "transformers",
+ file_path: str = ""
+) -> Dict[str, Any]:
+ """
+ Generate optimized translation prompt for the content.
+ """
+ try:
+ # Convert language code to full language name
+ target_language_name = get_language_name(target_language)
+
+ # Generate the complete translation prompt
+ prompt = get_full_prompt(target_language_name, content, additional_instruction)
+
+ # Determine content type and domain based on file path
+ content_type = "technical_documentation"
+ domain = "machine_learning"
+ file_type = "general_documentation"
+
+ if file_path:
+ if "model_doc" in file_path:
+ file_type = "model_documentation"
+ elif "tutorial" in file_path:
+ file_type = "tutorial"
+ elif "api" in file_path:
+ file_type = "api_reference"
+
+ # Translation guidelines based on project
+ guidelines = [
+ "Preserve markdown formatting",
+ "Keep technical terms in English where appropriate",
+ "Maintain code block integrity",
+ "Use glossary terms when available",
+ "Do not translate product names after 🤗 emoji"
+ ]
+
+ return {
+ "prompt": prompt,
+ "context": {
+ "target_language_name": target_language_name,
+ "content_type": content_type,
+ "domain": domain,
+ "file_type": file_type,
+ "project": project
+ },
+ "guidelines": guidelines,
+ "metadata": {
+ "prompt_length": len(prompt),
+ "content_length": len(content),
+ "has_additional_instruction": bool(additional_instruction.strip()),
+ "language_code": target_language
+ }
+ }
+
+ except Exception as e:
+ raise ValueError(f"Failed to generate translation prompt: {str(e)}")
+
+
+def validate_translation_data(
+ original_content: str,
+ translated_content: str,
+ target_language: str,
+ file_path: str = ""
+) -> Dict[str, Any]:
+ """
+ Validate translated content for quality and formatting.
+ """
+ try:
+ issues = []
+ suggestions = []
+ quality_score = 1.0
+
+ # Basic validation checks
+ if not translated_content.strip():
+ issues.append({
+ "type": "content",
+ "message": "Translated content is empty",
+ "severity": "error"
+ })
+ quality_score = 0.0
+
+ # Check if content length is reasonable
+ original_length = len(original_content)
+ translated_length = len(translated_content)
+ length_ratio = translated_length / original_length if original_length > 0 else 0
+
+ if length_ratio < 0.3:
+ issues.append({
+ "type": "length",
+ "message": "Translated content seems too short",
+ "severity": "warning"
+ })
+ quality_score -= 0.2
+ elif length_ratio > 3.0:
+ issues.append({
+ "type": "length",
+ "message": "Translated content seems too long",
+ "severity": "warning"
+ })
+ quality_score -= 0.1
+
+ # Markdown formatting validation
+ formatting_valid = True
+ links_preserved = True
+ code_blocks_intact = True
+
+ # Check markdown headers
+ original_headers = original_content.count('#')
+ translated_headers = translated_content.count('#')
+ if abs(original_headers - translated_headers) > 2:
+ formatting_valid = False
+ issues.append({
+ "type": "formatting",
+ "message": f"Header count mismatch: {original_headers} vs {translated_headers}",
+ "severity": "warning"
+ })
+
+ # Check code blocks
+ original_code_blocks = original_content.count('```')
+ translated_code_blocks = translated_content.count('```')
+ if original_code_blocks != translated_code_blocks:
+ code_blocks_intact = False
+ issues.append({
+ "type": "formatting",
+ "message": f"Code block count mismatch: {original_code_blocks} vs {translated_code_blocks}",
+ "severity": "error"
+ })
+ quality_score -= 0.3
+
+ # Check links
+ original_links = original_content.count('](')
+ translated_links = translated_content.count('](')
+ if abs(original_links - translated_links) > 1:
+ links_preserved = False
+ issues.append({
+ "type": "formatting",
+ "message": f"Link count mismatch: {original_links} vs {translated_links}",
+ "severity": "warning"
+ })
+ quality_score -= 0.1
+
+ # Language-specific suggestions
+ if target_language == "ko":
+ suggestions.append({
+ "type": "terminology",
+ "message": "Consider using '모델' consistently for 'model'",
+ "line": 0
+ })
+
+ # Check for common Korean translation issues
+ if "transformer" in translated_content.lower() and "트랜스포머" not in translated_content:
+ suggestions.append({
+ "type": "terminology",
+ "message": "Consider using '트랜스포머' for 'transformer'",
+ "line": 0
+ })
+
+ # Final quality score adjustment
+ quality_score = max(0.0, min(1.0, quality_score))
+
+ is_valid = quality_score >= 0.7 and not any(issue["severity"] == "error" for issue in issues)
+
+ return {
+ "is_valid": is_valid,
+ "quality_score": quality_score,
+ "issues": issues,
+ "suggestions": suggestions,
+ "formatting": {
+ "markdown_valid": formatting_valid,
+ "links_preserved": links_preserved,
+ "code_blocks_intact": code_blocks_intact
+ },
+ "statistics": {
+ "original_length": original_length,
+ "translated_length": translated_length,
+ "length_ratio": length_ratio,
+ "header_count": translated_headers,
+ "code_block_count": translated_code_blocks // 2 if translated_code_blocks % 2 == 0 else 0
+ }
+ }
+
+ except Exception as e:
+ raise ValueError(f"Failed to validate translation: {str(e)}")
+
+
+def save_translation_result_data(
+ project: str,
+ original_file_path: str,
+ translated_content: str,
+ target_language: str,
+ metadata: Dict[str, Any] = None
+) -> Dict[str, Any]:
+ """
+ Save translation result to file system.
+ """
+ try:
+ from pathlib import Path
+ import os
+ import time
+
+ if metadata is None:
+ metadata = {}
+
+ # Create target directory structure
+ base_path = Path("translation_result")
+
+ # Convert English path to target language path
+ original_path = Path(original_file_path)
+ if "docs/source/en/" in original_file_path:
+ # Replace /en/ with target language
+ target_path = Path(original_file_path.replace("/en/", f"/{target_language}/"))
+ else:
+ # Add language prefix to filename
+ target_path = original_path.parent / f"{target_language}_{original_path.name}"
+
+ # Full save path
+ save_path = base_path / target_path
+ save_path.parent.mkdir(parents=True, exist_ok=True)
+
+ # Create backup if file exists
+ backup_path = None
+ if save_path.exists():
+ timestamp = time.strftime("%Y%m%d_%H%M%S")
+ backup_path = save_path.parent / f"{save_path.stem}_backup_{timestamp}.md"
+ backup_path.write_text(save_path.read_text(encoding='utf-8'), encoding='utf-8')
+
+ # Write translated content
+ save_path.write_text(translated_content, encoding='utf-8')
+
+ # Calculate file info
+ file_size = len(translated_content.encode('utf-8'))
+ checksum = hashlib.sha256(translated_content.encode('utf-8')).hexdigest()
+
+ # Create metadata file
+ metadata_info = {
+ "project": project,
+ "original_file": original_file_path,
+ "target_language": target_language,
+ "translation_date": datetime.now().isoformat(),
+ "file_size": file_size,
+ "checksum": checksum,
+ **metadata
+ }
+
+ metadata_path = save_path.with_suffix('.meta.json')
+ import json
+ metadata_path.write_text(json.dumps(metadata_info, indent=2), encoding='utf-8')
+
+ return {
+ "saved_path": str(save_path),
+ "backup_path": str(backup_path) if backup_path else None,
+ "file_size": file_size,
+ "checksum": f"sha256:{checksum[:12]}...",
+ "created_directories": [str(save_path.parent)],
+ "metadata_path": str(metadata_path)
+ }
+
+ except Exception as e:
+ raise ValueError(f"Failed to save translation result: {str(e)}")
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/setting.py b/external/mcp-servers/hf-translation-docs/setting.py
new file mode 100644
index 0000000..3cae441
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/setting.py
@@ -0,0 +1,34 @@
+"""Configuration settings for HF Translation Docs MCP Server."""
+
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class Settings:
+ """Settings for the MCP server: UI title plus default project, language, limit and language list."""
+ ui_title: str = "HuggingFace Translation Documentation MCP Server"
+ default_project: str = "transformers"
+ default_language: str = "ko"
+ default_limit: int = 10
+ supported_languages: List[str] = None  # None is replaced in __post_init__
+
+ def __post_init__(self):
+ if self.supported_languages is None:  # fill in the default list per instance
+ self.supported_languages = ["ko", "zh", "ja", "es", "fr", "de", "it", "pt"]
+
+
+# Global settings instance
+SETTINGS = Settings()
+
+# Language choices for UI
+LANGUAGE_CHOICES = [
+ ("한국어 (Korean)", "ko"),
+ ("中文 (Chinese)", "zh"),
+ ("日本語 (Japanese)", "ja"),
+ ("Español (Spanish)", "es"),
+ ("Français (French)", "fr"),
+ ("Deutsch (German)", "de"),
+ ("Italiano (Italian)", "it"),
+ ("Português (Portuguese)", "pt"),
+]
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-docs/tools.py b/external/mcp-servers/hf-translation-docs/tools.py
new file mode 100644
index 0000000..4e7b170
--- /dev/null
+++ b/external/mcp-servers/hf-translation-docs/tools.py
@@ -0,0 +1,203 @@
+"""MCP tool endpoints for HuggingFace translation documentation."""
+
+from __future__ import annotations
+from typing import Any, Dict
+
+from services import (
+ get_project_configuration,
+ search_translation_files_data,
+ get_file_content_data,
+ generate_translation_prompt_data,
+ validate_translation_data,
+ save_translation_result_data,
+)
+
+
+def get_project_config(project: str) -> Dict[str, Any]:
+ """
+ Get project-specific configuration and settings.
+
+ MCP endpoint: translation_get_project_config
+ """
+ try:
+ return {
+ "status": "success",
+ "data": get_project_configuration(project)
+ }
+ except Exception as e:
+ return {
+ "status": "error",
+ "error": {
+ "code": "PROJECT_CONFIG_ERROR",
+ "message": str(e),
+ "details": {"project": project}
+ }
+ }
+
+
+def search_translation_files(
+ project: str,
+ target_language: str,
+ max_files: int = 10
+) -> Dict[str, Any]:
+ """
+ Search for files that need translation.
+
+ MCP endpoint: translation_search_files
+ """
+ try:
+ data = search_translation_files_data(project, target_language, max_files)
+ return {
+ "status": "success",
+ "data": data
+ }
+ except Exception as e:
+ return {
+ "status": "error",
+ "error": {
+ "code": "SEARCH_FILES_ERROR",
+ "message": str(e),
+ "details": {
+ "project": project,
+ "target_language": target_language,
+ "max_files": max_files
+ }
+ }
+ }
+
+
+def get_file_content(
+ project: str,
+ file_path: str,
+ include_metadata: bool = True
+) -> Dict[str, Any]:
+ """
+ Get file content for translation.
+
+ MCP endpoint: translation_get_file_content
+ """
+ try:
+ data = get_file_content_data(project, file_path, include_metadata)
+ return {
+ "status": "success",
+ "data": data
+ }
+ except Exception as e:
+ return {
+ "status": "error",
+ "error": {
+ "code": "FILE_CONTENT_ERROR",
+ "message": str(e),
+ "details": {
+ "project": project,
+ "file_path": file_path
+ }
+ }
+ }
+
+
+def generate_translation_prompt(
+ target_language: str,
+ content: str,
+ additional_instruction: str = "",
+ project: str = "transformers",
+ file_path: str = ""
+) -> Dict[str, Any]:
+ """
+ Generate translation prompt for content.
+
+ MCP endpoint: translation_generate_prompt
+ """
+ try:
+ data = generate_translation_prompt_data(
+ target_language, content, additional_instruction, project, file_path
+ )
+ return {
+ "status": "success",
+ "data": data
+ }
+ except Exception as e:
+ return {
+ "status": "error",
+ "error": {
+ "code": "PROMPT_GENERATION_ERROR",
+ "message": str(e),
+ "details": {
+ "target_language": target_language,
+ "content_length": len(content),
+ "project": project,
+ "file_path": file_path
+ }
+ }
+ }
+
+
+def validate_translation(
+ original_content: str,
+ translated_content: str,
+ target_language: str,
+ file_path: str = ""
+) -> Dict[str, Any]:
+ """
+ Validate translated content for quality and formatting.
+
+ MCP endpoint: translation_validate
+ """
+ try:
+ data = validate_translation_data(
+ original_content, translated_content, target_language, file_path
+ )
+ return {
+ "status": "success",
+ "data": data
+ }
+ except Exception as e:
+ return {
+ "status": "error",
+ "error": {
+ "code": "VALIDATION_ERROR",
+ "message": str(e),
+ "details": {
+ "original_content_length": len(original_content),
+ "translated_content_length": len(translated_content),
+ "target_language": target_language,
+ "file_path": file_path
+ }
+ }
+ }
+
+
+def save_translation_result(
+ project: str,
+ original_file_path: str,
+ translated_content: str,
+ target_language: str,
+ metadata: Dict[str, Any] = None
+) -> Dict[str, Any]:
+ """
+ Save translation result to file system.
+
+ MCP endpoint: translation_save_result
+ """
+ try:
+ data = save_translation_result_data(
+ project, original_file_path, translated_content, target_language, metadata
+ )
+ return {
+ "status": "success",
+ "data": data
+ }
+ except Exception as e:
+ return {
+ "status": "error",
+ "error": {
+ "code": "SAVE_RESULT_ERROR",
+ "message": str(e),
+ "details": {
+ "project": project,
+ "original_file_path": original_file_path,
+ "target_language": target_language,
+ "content_length": len(translated_content)
+ }
+ }
+ }
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-reviewer/.gitattributes b/external/mcp-servers/hf-translation-reviewer/.gitattributes
new file mode 100644
index 0000000..a6344aa
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/external/mcp-servers/hf-translation-reviewer/README.md b/external/mcp-servers/hf-translation-reviewer/README.md
new file mode 100644
index 0000000..4c2d84e
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/README.md
@@ -0,0 +1,13 @@
+---
+title: LLM Translation Reviewer
+emoji: 🦀
+colorFrom: blue
+colorTo: gray
+sdk: gradio
+sdk_version: 5.49.1
+app_file: app.py
+pinned: false
+license: mit
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/external/mcp-servers/hf-translation-reviewer/adapters.py b/external/mcp-servers/hf-translation-reviewer/adapters.py
new file mode 100644
index 0000000..e3e9be1
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/adapters.py
@@ -0,0 +1,210 @@
+from __future__ import annotations
+
+import base64
+import os
+from typing import Dict, Optional
+
+import requests
+from setting import SETTINGS
+
+# Optional provider SDKs
+try:
+ import openai # type: ignore
+except Exception:
+ openai = None
+
+try:
+ import anthropic # type: ignore
+except Exception:
+ anthropic = None
+
+try:
+ import google.generativeai as genai # type: ignore
+except Exception:
+ genai = None
+
+
+# ---------------- Token resolution (Space Secrets fallback) -----------------
+
+_PROVIDER_ENV_KEY = {
+ "openai": "OPENAI_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ "gemini": "GEMINI_API_KEY",
+}
+
+
+def _resolve_token(explicit: str, env_key: str) -> str:
+ """
+ 1) MCP tool arguments로 넘어온 token이 있으면 사용
+ 2) 없으면 Space Secrets(환경변수)에서 읽어서 사용
+ """
+ t = (explicit or "").strip()
+ if t:
+ return t
+
+ t = (os.getenv(env_key) or "").strip()
+ if t:
+ return t
+
+ raise RuntimeError(
+ f"Missing token. Provide '{env_key}' as a HuggingFace Space Secret "
+ "or pass it explicitly to the tool."
+ )
+
+
+def resolve_github_token(explicit: str) -> str:
+ return _resolve_token(explicit, "GITHUB_TOKEN")  # explicit value first, then the GITHUB_TOKEN env var
+
+
+def resolve_provider_token(provider: str, explicit: str) -> str:
+ if provider not in _PROVIDER_ENV_KEY:
+ raise ValueError(
+ f"Unknown provider '{provider}'. Choose from: {', '.join(_PROVIDER_ENV_KEY)}"
+ )
+ return _resolve_token(explicit, _PROVIDER_ENV_KEY[provider])
+
+
+# ---------------- GitHub HTTP adapters -----------------
+
+def github_request(
+ url: str,
+ token: str,
+ params: Optional[Dict[str, str]] = None,
+) -> Dict:
+ token = resolve_github_token(token)
+
+ headers = {
+ "Accept": "application/vnd.github.v3+json",
+ "Authorization": f"token {token}",
+ }
+
+ response = requests.get(url, headers=headers, params=params, timeout=30)
+
+ if response.status_code == 404:
+ raise FileNotFoundError(f"GitHub resource not found: {url}")
+ if response.status_code == 401:
+ raise PermissionError("GitHub token is invalid or lacks necessary scopes.")
+ if response.status_code >= 400:
+ raise RuntimeError(
+ f"GitHub API request failed with status {response.status_code}: {response.text}"
+ )
+
+ return response.json()
+
+
+def fetch_file_from_pr(
+ repo_name: str,
+ pr_number: int,
+ path: str,
+ head_sha: str,
+ github_token: str,
+) -> str:
+ url = f"{SETTINGS.github_api_base}/repos/{repo_name}/contents/{path}"
+ data = github_request(url, github_token, params={"ref": head_sha})
+
+ content = data.get("content")
+ encoding = data.get("encoding")
+
+ if content is None or encoding != "base64":
+ raise ValueError(
+ f"Unexpected content response for '{path}' (encoding={encoding!r})."
+ )
+
+ decoded = base64.b64decode(content)
+ try:
+ return decoded.decode("utf-8")
+ except UnicodeDecodeError as exc:
+ raise ValueError(
+ f"File '{path}' in PR {pr_number} is not valid UTF-8 text"
+ ) from exc
+
+
+# ---------------- LLM provider adapters -----------------
+
+def call_openai(
+ token: str,
+ system_prompt: str,
+ user_prompt: str,
+ model_name: str = "gpt-5",
+) -> str:
+ if openai is None:
+ raise RuntimeError("openai package not installed. Install with `pip install openai`.")
+
+ client = openai.OpenAI(api_key=token)
+
+ params = {
+ "model": model_name,
+ "messages": [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_prompt},
+ ],
+ }
+
+ # Some models may not allow custom temperature.
+ if model_name not in {"gpt-5"}:
+ params["temperature"] = 0.2
+
+ response = client.chat.completions.create(**params)
+ return response.choices[0].message.content.strip()
+
+
+def call_anthropic(
+ token: str,
+ system_prompt: str,
+ user_prompt: str,
+ model_name: str = "claude-3-5-sonnet-20240620",
+) -> str:
+ if anthropic is None:
+ raise RuntimeError(
+ "anthropic package not installed. Install with `pip install anthropic`."
+ )
+
+ client = anthropic.Anthropic(api_key=token)
+ response = client.messages.create(
+ model=model_name,
+ system=system_prompt,
+ max_tokens=1500,
+ temperature=0.2,
+ messages=[{"role": "user", "content": user_prompt}],
+ )
+ return "".join(block.text for block in response.content if hasattr(block, "text")).strip()
+
+
+def call_gemini(
+ token: str,
+ system_prompt: str,
+ user_prompt: str,
+ model_name: str = "gemini-1.5-pro",
+) -> str:
+ if genai is None:
+ raise RuntimeError(
+ "google-generativeai package not installed. Install with `pip install google-generativeai`."
+ )
+
+ genai.configure(api_key=token)
+ model = genai.GenerativeModel(model_name)
+
+ prompt = f"{system_prompt}\n\n{user_prompt}"
+ response = model.generate_content(prompt, generation_config={"temperature": 0.2})
+ return response.text.strip()
+
+
+PROVIDERS = {
+ "openai": call_openai,
+ "anthropic": call_anthropic,
+ "gemini": call_gemini,
+}
+
+
+def dispatch_review(
+ provider: str,
+ token: str,
+ system_prompt: str,
+ user_prompt: str,
+ model_name: str,
+) -> str:
+ if provider not in PROVIDERS:
+ raise ValueError(f"Unknown provider '{provider}'. Choose from: {', '.join(PROVIDERS)}")
+
+ token = resolve_provider_token(provider, token)
+ return PROVIDERS[provider](token, system_prompt, user_prompt, model_name)
diff --git a/external/mcp-servers/hf-translation-reviewer/app.py b/external/mcp-servers/hf-translation-reviewer/app.py
new file mode 100644
index 0000000..f043049
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/app.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+Gradio + MCP server app for LLM translation review on GitHub PRs.
+
+- UI만 담당하고, 실제 로직은 tools/services/adapters 로 분리.
+"""
+
+from __future__ import annotations
+
+import json
+
+import gradio as gr
+
+from setting import SETTINGS
+from tools import (
+ tool_prepare,
+ tool_review_and_emit,
+ tool_submit_review,
+ tool_end_to_end,
+)
+
+
def build_ui() -> gr.Blocks:
    """Build the Gradio Blocks UI that exposes the four review tools.

    The layout consists of a shared input area (PR URL, provider, model,
    tokens, file paths) followed by one accordion per tool. Each button is
    wired to the corresponding ``tool_*`` function, either directly or
    through a small proxy that adapts the widget values.

    Returns:
        The assembled ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title=SETTINGS.ui_title) as demo:
        gr.Markdown(
            "# LLM Translation Reviewer for GitHub PRs (MCP-enabled)\n"
            "Only **PR URL** + fields below are required. Repo/PR number are parsed."
        )

        # Shared input area used by every tool below.
        with gr.Row():
            pr_url = gr.Textbox(
                label="PR URL",
                placeholder="https://github.com/owner/repo/pull/123",
                scale=2,
            )
            provider = gr.Dropdown(
                label="Provider",
                choices=["openai", "anthropic", "gemini"],
                value=SETTINGS.default_provider,
            )
            model_name = gr.Textbox(
                label="Model name",
                value=SETTINGS.default_model,
                placeholder=(
                    "e.g., gpt-5 / gpt-4o / claude-3-5-sonnet-20240620 / gemini-1.5-pro"
                ),
            )
        with gr.Row():
            provider_token = gr.Textbox(
                label="Provider API Token",
                type="password",
            )
            github_token = gr.Textbox(
                label="GitHub Token",
                type="password",
            )
        with gr.Row():
            original_path = gr.Textbox(
                label="Original File Path (in repo)",
                placeholder="docs/source/en/xxx.md",
            )
            translated_path = gr.Textbox(
                label="Translated File Path (in repo)",
                placeholder="docs/source/ko/xxx.md",
            )

        gr.Markdown("---")

        # Tool 1: Prepare
        with gr.Accordion(
            "Tool 1: Prepare (Fetch Files + Build Prompts)", open=False
        ):
            prepare_btn = gr.Button("tool_prepare")
            prepare_out = gr.JSON(label="Prepare result (files + prompts)")

            prepare_btn.click(
                fn=tool_prepare,
                inputs=[github_token, pr_url, original_path, translated_path],
                outputs=[prepare_out],
            )

        # Tool 2: Review + Emit Payload
        with gr.Accordion("Tool 2: Review + Emit Payload", open=False):
            review_btn = gr.Button("tool_review_and_emit")

            original_text = gr.Textbox(
                label="Original (for review)",
                lines=6,
            )
            translated_text = gr.Textbox(
                label="Translated (for review)",
                lines=10,
            )

            review_out = gr.JSON(
                label="Review result (verdict/summary/comments/event)"
            )
            payload_out = gr.JSON(label="Payload JSON (for GitHub)")

            def _review_emit_proxy(
                provider_: str,
                provider_token_: str,
                model_name_: str,
                pr_url_: str,
                translated_path_: str,
                original_text_: str,
                translated_text_: str,
            ):
                """Run the review and split the result for the two JSON panels."""
                result = tool_review_and_emit(
                    provider=provider_,
                    provider_token=provider_token_,
                    model_name=model_name_,
                    pr_url=pr_url_,
                    translated_path=translated_path_,
                    original=original_text_,
                    translated=translated_text_,
                )
                return result, result.get("payload", {})

            review_btn.click(
                fn=_review_emit_proxy,
                inputs=[
                    provider,
                    provider_token,
                    model_name,
                    pr_url,
                    translated_path,
                    original_text,
                    translated_text,
                ],
                outputs=[review_out, payload_out],
            )

        # Tool 3: Submit Review
        with gr.Accordion("Tool 3: Submit Review", open=False):
            submit_btn = gr.Button("tool_submit_review")
            payload_in = gr.Textbox(
                label="Payload or Review JSON (from Tool 2)",
                lines=6,
            )
            submit_out = gr.JSON(label="Submission result")

            def _submit_proxy(
                github_token_: str,
                pr_url_: str,
                translated_path_: str,
                payload_json_: str,
            ):
                """Decode the pasted JSON text and forward it to ``tool_submit_review``.

                Raises:
                    ValueError: If the text is not valid JSON, or decodes to
                        something other than a JSON object.
                """
                try:
                    payload_obj = json.loads(payload_json_) if payload_json_ else {}
                except (TypeError, ValueError) as e:
                    # json.JSONDecodeError is a ValueError; keep the cause chained.
                    raise ValueError(f"Invalid JSON: {e}") from e
                # Downstream code calls ``.get`` on the payload, so a list or
                # scalar must be rejected here with a readable message.
                if not isinstance(payload_obj, dict):
                    raise ValueError(
                        "Invalid JSON: expected an object, got "
                        f"{type(payload_obj).__name__}"
                    )
                return tool_submit_review(
                    github_token=github_token_,
                    pr_url=pr_url_,
                    translated_path=translated_path_,
                    payload_or_review=payload_obj,
                    allow_self_request_changes=True,
                )

            submit_btn.click(
                fn=_submit_proxy,
                inputs=[github_token, pr_url, translated_path, payload_in],
                outputs=[submit_out],
            )

        gr.Markdown("---")

        # Tool 4: End-to-End
        with gr.Accordion("Tool 4: End-to-End", open=True):
            e2e_btn = gr.Button("tool_end_to_end")
            save_review = gr.Checkbox(
                label="Save review JSON to file", value=True
            )
            save_path = gr.Textbox(
                label="Save path", value="review.json"
            )
            submit_flag = gr.Checkbox(
                label="Submit to GitHub", value=False
            )
            e2e_out = gr.JSON(label="E2E result")

            # Input order must match the positional parameters of tool_end_to_end.
            e2e_btn.click(
                fn=tool_end_to_end,
                inputs=[
                    provider,
                    provider_token,
                    model_name,
                    github_token,
                    pr_url,
                    original_path,
                    translated_path,
                    save_review,
                    save_path,
                    submit_flag,
                ],
                outputs=[e2e_out],
            )

        gr.Markdown(
            """
            **Notes**
            - Tool 1: PR에서 파일을 읽고 프롬프트까지 준비합니다.
            - Tool 2: LLM으로 리뷰한 뒤, GitHub 리뷰 payload까지 생성합니다.
            - Tool 3: Tool 2에서 만든 payload JSON을 그대로 넣고 GitHub에 전송합니다.
            - Tool 4: 파일 로드부터 리뷰/저장/제출까지 한 번에 처리하는 end-to-end 툴입니다.
            - `launch(mcp_server=True)` 이므로 각 `tool_*` 버튼은 MCP 툴로도 사용 가능합니다.
            """
        )
    return demo
+
+
if __name__ == "__main__":
    # Launch flags come from the loaded settings rather than being hard-coded.
    launch_options = {
        "share": SETTINGS.ui_share,
        "mcp_server": SETTINGS.ui_launch_mcp_server,
    }
    ui = build_ui()
    ui.launch(**launch_options)
diff --git a/external/mcp-servers/hf-translation-reviewer/configs/default.yaml b/external/mcp-servers/hf-translation-reviewer/configs/default.yaml
new file mode 100644
index 0000000..685606c
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/configs/default.yaml
@@ -0,0 +1,11 @@
+provider:
+ default: "openai"
+ model: "gpt-5"
+
+github:
+ api_base: "https://api.github.com"
+
+ui:
+ title: "LLM Translation Reviewer (PR) — MCP Tools"
+ share: true
+ launch_mcp_server: true
\ No newline at end of file
diff --git a/external/mcp-servers/hf-translation-reviewer/requirements.txt b/external/mcp-servers/hf-translation-reviewer/requirements.txt
new file mode 100644
index 0000000..a8a28b4
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/requirements.txt
@@ -0,0 +1,14 @@
+# Core dependencies
+requests>=2.31.0
+gradio>=5.0.0
+
+# LLM providers (optional, choose what you use)
+openai>=1.12.0
+anthropic>=0.34.0
+google-generativeai>=0.5.0
+
+# Typing helpers (optional, for static analysis)
+typing-extensions>=4.8.0
+
+# Python version note
+# Python >=3.9 recommended
diff --git a/external/mcp-servers/hf-translation-reviewer/services.py b/external/mcp-servers/hf-translation-reviewer/services.py
new file mode 100644
index 0000000..f12cc00
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/services.py
@@ -0,0 +1,562 @@
+from __future__ import annotations
+
+import json
+import re
+import textwrap
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+from urllib.parse import urlparse
+
+import requests
+from setting import SETTINGS
+
+from adapters import github_request, fetch_file_from_pr, dispatch_review, resolve_github_token
+
+PROMPT_TEMPLATE = textwrap.dedent(
+ """
+ You are a meticulous bilingual reviewer checking a translation PR.
+
+ PR number: {pr_number}
+ PR URL: {pr_url}
+
+ Review the translated text against the original and focus on:
+ 1. Are there any typos or spelling mistakes?
+ 2. Are any sentences difficult to understand?
+ 3. Is the overall content hard to comprehend?
+
+ Always respond with strict JSON using this schema:
+ {{
+ "verdict": "request_changes" | "comment" | "approve",
+ "summary": "",
+ "comments": [
+ {{
+ "line": <1-based line number in the translated file>,
+ "issue": "",
+ "suggested_edit": "",
+ "context": ""
+ }},
+ ...
+ ]
+ }}
+
+ Guidelines:
+ - Only include comments for issues that warrant direct feedback.
+ - When a concrete rewrite is possible, populate "suggested_edit" with the full replacement line exactly as it should appear after fixing the issue.
+ - Keep edits scoped to the referenced line; do not span multiple lines.
+ - Always copy the current text of that line verbatim into "context".
+ - Omit the "suggested_edit" field or set it to an empty string if no suggestion is available.
+ - Use "request_changes" when the identified problems must be fixed before merging.
+ - Use "approve" only when the translation is correct and clear with no changes needed.
+ - For optional improvements or general observations, use "comment".
+ - Keep suggestions tightly scoped so they can be applied as GitHub suggestions.
+ - Do not output partial fragments in "suggested_edit"; always provide the entire replacement line including unchanged portions.
+ - Use the line numbers from the "TRANSLATED TEXT WITH LINE NUMBERS" section.
+ """
+).strip()
+
+
+# --------------------- Core helpers ------------------
+
def parse_pr_url(pr_url: str) -> Tuple[str, int]:
    """Split a GitHub pull-request URL into ``("owner/repo", number)``.

    Only the URL path is inspected; it must look like
    ``/owner/repo/pull/<number>`` and may have further segments after it.

    Raises:
        ValueError: If the URL is empty, the path does not have the
            expected shape, or the number segment is not all digits.
    """
    if not pr_url:
        raise ValueError("PR URL is required")

    segments = [seg for seg in urlparse(pr_url).path.split("/") if seg]

    # Expected layout: owner / repo / "pull" / number [/ ...]
    has_pr_shape = len(segments) >= 4 and segments[2] == "pull"
    if not has_pr_shape:
        raise ValueError(f"Not a valid GitHub PR URL: {pr_url}")

    owner, repo, number = segments[0], segments[1], segments[3]
    if not number.isdigit():
        raise ValueError(f"PR number not found in URL: {pr_url}")

    return f"{owner}/{repo}", int(number)
+
+
def add_line_numbers(text: str) -> str:
    """Prefix every line of ``text`` with a 1-based, zero-padded ``NNNN: `` tag."""
    numbered = []
    for number, content in enumerate(text.splitlines(), start=1):
        numbered.append(f"{number:04d}: {content}")
    return "\n".join(numbered)
+
+
def load_pr_files(
    github_token: str,
    pr_url: str,
    original_path: str,
    translated_path: str,
) -> Tuple[str, int, str, str]:
    """Fetch the original and translated files at the head commit of a PR.

    Args:
        github_token: Token forwarded to the GitHub helper functions.
        pr_url: Pull-request URL, parsed with ``parse_pr_url``.
        original_path: Repository path of the source-language file.
        translated_path: Repository path of the translated file.

    Returns:
        ``(repo_name, pr_number, original_text, translated_text)``.

    Raises:
        RuntimeError: If the PR data carries no ``head.sha``.
    """
    repo_name, pr_number = parse_pr_url(pr_url)

    pull_endpoint = f"{SETTINGS.github_api_base}/repos/{repo_name}/pulls/{pr_number}"
    pull_info = github_request(pull_endpoint, github_token)

    head_sha = pull_info.get("head", {}).get("sha")
    if not head_sha:
        raise RuntimeError(
            f"Unable to determine head SHA for PR {pr_number} in {repo_name}."
        )

    def _fetch(path: str) -> str:
        # Both files are read at the same head commit.
        return fetch_file_from_pr(repo_name, pr_number, path, head_sha, github_token)

    original = _fetch(original_path)
    translated = _fetch(translated_path)
    return repo_name, pr_number, original, translated
+
+
def build_messages(
    original: str,
    translated: str,
    pr_number: int,
    pr_url: str,
) -> Tuple[str, str]:
    """Build the system and user prompts for a translation review.

    Args:
        original: Source-language text.
        translated: Translated text under review.
        pr_number: Pull-request number, substituted into the template.
        pr_url: Pull-request URL, substituted into the template.

    Returns:
        ``(system_prompt, user_prompt)``. The user prompt is the filled-in
        ``PROMPT_TEMPLATE`` followed by the original text, the translated
        text, and the translated text again with line numbers.
    """
    system_prompt = (
        "You are an expert translation reviewer ensuring clarity, accuracy, "
        "and readability of localized documentation."
    )

    # PROMPT_TEMPLATE is a str.format template: it contains {pr_number} and
    # {pr_url} placeholders and escapes its literal JSON braces as {{ }}.
    # Without this call the model would see the raw placeholders and doubled
    # braces instead of the PR details and a well-formed schema.
    instructions = PROMPT_TEMPLATE.format(pr_number=pr_number, pr_url=pr_url)

    user_prompt = (
        f"{instructions}\n\n"
        "----- ORIGINAL TEXT -----\n"
        f"{original}\n\n"
        "----- TRANSLATED TEXT -----\n"
        f"{translated}\n\n"
        "----- TRANSLATED TEXT WITH LINE NUMBERS -----\n"
        f"{add_line_numbers(translated)}"
    )

    return system_prompt, user_prompt
+
+
def normalize_summary_for_body(summary: str) -> str:
    """Tidy a summary so it can be used as the body of a GitHub review.

    An empty summary becomes a fixed placeholder. A summary that looks like
    JSON and decodes to an object with a non-blank string ``"summary"``
    field is replaced by that inner text. Anything else is returned
    stripped but otherwise unchanged.
    """
    text = (summary or "").strip()
    if not text:
        return "LLM translation review"

    if not text.startswith(("{", "[")):
        return text

    try:
        decoded = json.loads(text)
    except Exception:
        # Not decodable after all: use the text as it is.
        return text

    if isinstance(decoded, dict):
        inner = decoded.get("summary")
        if isinstance(inner, str) and inner.strip():
            return inner.strip()

    return text
+
+
+# ----------------------- Parsing & GitHub glue ----------------------
+
+def _extract_json_candidates(raw_response: str) -> List[str]:
+ candidates: List[str] = []
+
+ for match in re.finditer(r"```(?:json)?\s*(\{.*?\})\s*```", raw_response, re.DOTALL):
+ snippet = match.group(1).strip()
+ if snippet:
+ candidates.append(snippet)
+
+ stripped = raw_response.strip()
+ if stripped:
+ candidates.append(stripped)
+
+ return candidates
+
+
+def parse_review_response(raw_response: str) -> Tuple[str, str, List[Dict[str, object]]]:
+ parsed: Optional[Dict[str, object]] = None
+
+ for candidate in _extract_json_candidates(raw_response):
+ try:
+ parsed_candidate = json.loads(candidate)
+ except json.JSONDecodeError:
+ continue
+ if isinstance(parsed_candidate, dict):
+ parsed = parsed_candidate
+ break
+
+ if parsed is None:
+ return "comment", raw_response.strip(), []
+
+ verdict = parsed.get("verdict", "comment")
+ summary = str(parsed.get("summary", "")).strip()
+ comments = parsed.get("comments", [])
+
+ if not isinstance(verdict, str):
+ verdict = "comment"
+ verdict = verdict.lower()
+ if verdict not in {"request_changes", "comment", "approve"}:
+ verdict = "comment"
+
+ if not summary:
+ summary = raw_response.strip()
+
+ if not isinstance(comments, list):
+ comments = []
+
+ normalized_comments: List[Dict[str, object]] = []
+ for comment in comments:
+ if not isinstance(comment, dict):
+ continue
+
+ line = comment.get("line")
+ issue = str(comment.get("issue", "")).strip()
+ suggested_edit = str(comment.get("suggested_edit", "")).strip()
+ context = str(comment.get("context", "")).strip()
+
+ if not isinstance(line, int) or line <= 0:
+ continue
+ if not issue:
+ continue
+
+ normalized_comments.append(
+ {
+ "line": line,
+ "issue": issue,
+ "suggested_edit": suggested_edit,
+ "context": context,
+ }
+ )
+
+ return verdict, summary, normalized_comments
+
+
def review_event_from_verdict(verdict: str) -> str:
    """Translate a lowercase verdict into the GitHub review event name.

    Verdicts that are not recognised map to ``"COMMENT"``.
    """
    event_by_verdict = {
        "request_changes": "REQUEST_CHANGES",
        "comment": "COMMENT",
        "approve": "APPROVE",
    }
    try:
        return event_by_verdict[verdict]
    except KeyError:
        return "COMMENT"
+
+
def build_review_comments(
    translated_path: str,
    comments: List[Dict[str, object]],
) -> List[Dict[str, object]]:
    """Turn normalized review comments into GitHub inline-comment dicts.

    Each result targets ``translated_path`` on the ``RIGHT`` side at the
    comment's line. The body holds the issue text, then (when present) a
    quote of the current text and a ``suggestion`` block with the edit.
    """

    def _render(comment: Dict[str, object]) -> Dict[str, object]:
        line_number = int(comment["line"])
        issue_text = str(comment["issue"]).strip()

        suggestion = comment.get("suggested_edit", "")
        if not isinstance(suggestion, str):
            # Non-string values are stringified only when truthy.
            suggestion = str(suggestion) if suggestion else ""
        suggestion = suggestion.rstrip("\r\n")

        current_text = str(comment.get("context", "")).rstrip("\n")

        sections = [issue_text]
        if current_text:
            sections.append(f"> _Current text_: {current_text}")
        if suggestion:
            sections.append("```suggestion\n" + suggestion + "\n```")

        return {
            "path": translated_path,
            "side": "RIGHT",
            "line": line_number,
            "body": "\n\n".join(sections).strip(),
        }

    return [_render(comment) for comment in comments]
+
+
def attach_translated_line_context(
    translated_text: str,
    comments: List[Dict[str, object]],
) -> None:
    """Fill in a missing ``context`` on each comment from the translated text.

    Comments are modified in place. A comment is left alone when its
    ``line`` is not an int, falls outside the text, or when it already has
    a non-empty ``context``.
    """
    if not comments:
        return

    translated_lines = translated_text.splitlines()
    line_count = len(translated_lines)

    for entry in comments:
        number = entry.get("line")
        if (
            isinstance(number, int)
            and 1 <= number <= line_count
            and not entry.get("context")
        ):
            # Line numbers are 1-based; list indices are 0-based.
            entry["context"] = translated_lines[number - 1]
+
+
def build_github_review_payload(
    body: str,
    event: str = "COMMENT",
    comments: Optional[List[Dict[str, object]]] = None,
) -> Dict[str, object]:
    """Assemble the JSON payload for a pull-request review.

    The ``comments`` key is included only when ``comments`` is non-empty.
    """
    if comments:
        return {"event": event, "body": body, "comments": comments}
    return {"event": event, "body": body}
+
+
def submit_pr_review(
    repo_name: str,
    pr_number: int,
    github_token: str,
    body: str,
    event: str,
    comments: Optional[List[Dict[str, object]]] = None,
    allow_self_request_changes: bool = True,
) -> Tuple[Dict, str]:
    """Post a review to a pull request, with a fallback for self-reviews.

    When a ``REQUEST_CHANGES`` review is rejected with HTTP 422 and the
    error text mentions "own pull request", the review is re-sent as a
    ``COMMENT`` whose body is prefixed with a marker, provided
    ``allow_self_request_changes`` is true.

    Returns:
        ``(response_json, final_event)``. ``final_event`` is the event that
        was requested, or ``"REQUEST_CHANGES_SELF"`` when the fallback ran.

    Raises:
        PermissionError: On HTTP 401 from the first request.
        RuntimeError: On any other HTTP status >= 400, on a failed fallback
            request, or when the fallback is needed but not allowed.
    """
    github_token = resolve_github_token(github_token)

    reviews_url = f"{SETTINGS.github_api_base}/repos/{repo_name}/pulls/{pr_number}/reviews"
    request_headers = {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"token {github_token}",
    }

    def _send(event_name: str, review_body: str) -> requests.Response:
        review_payload = build_github_review_payload(
            body=review_body, event=event_name, comments=comments
        )
        return requests.post(reviews_url, headers=request_headers, json=review_payload, timeout=30)

    # First attempt, exactly as requested.
    first_reply = _send(event, body)
    status = first_reply.status_code

    if status == 401:
        raise PermissionError("GitHub token is invalid or lacks permission to submit a review.")

    # REQUEST_CHANGES rejected with 422: check for the "own pull request" case.
    if status == 422 and event == "REQUEST_CHANGES":
        try:
            details = first_reply.json()
        except ValueError:
            details = {"message": first_reply.text}

        message = str(details.get("message", ""))
        errors = " ".join(str(item) for item in details.get("errors", []))
        combined_error = f"{message} {errors}".strip()

        if "own pull request" in combined_error.lower():
            if not allow_self_request_changes:
                raise RuntimeError(
                    "GitHub does not allow REQUEST_CHANGES on your own pull request: "
                    + combined_error
                )

            marked_body = "[REQUEST_CHANGES (self-review)]\n\n" + (body or "").strip()
            fallback_reply = _send("COMMENT", marked_body)
            if fallback_reply.status_code >= 400:
                raise RuntimeError(
                    "Failed to submit fallback self-review comment: "
                    f"HTTP {fallback_reply.status_code} - {fallback_reply.text}"
                )
            return fallback_reply.json(), "REQUEST_CHANGES_SELF"

    if status >= 400:
        raise RuntimeError(
            "Failed to submit review: "
            f"HTTP {status} - {first_reply.text}"
        )

    return first_reply.json(), event
+
+
+# --------------------- High-level domain services ------------------
+
def prepare_translation_context(
    github_token: str,
    pr_url: str,
    original_path: str,
    translated_path: str,
) -> Dict[str, object]:
    """Fetch both files from the PR and build the system/user prompts.

    Returns:
        A dict with the keys ``repo``, ``pr_number``, ``original``,
        ``translated``, ``system_prompt`` and ``user_prompt``.
    """
    repo_name, pr_number, original, translated = load_pr_files(
        github_token, pr_url, original_path, translated_path
    )
    system_prompt, user_prompt = build_messages(original, translated, pr_number, pr_url)

    context: Dict[str, object] = {}
    context["repo"] = repo_name
    context["pr_number"] = pr_number
    context["original"] = original
    context["translated"] = translated
    context["system_prompt"] = system_prompt
    context["user_prompt"] = user_prompt
    return context
+
+
def review_and_emit_payload(
    provider: str,
    provider_token: str,
    model_name: str,
    pr_url: str,
    translated_path: str,
    original: str,
    translated: str,
) -> Dict[str, object]:
    """Run the LLM review and build the matching GitHub review payload.

    Args:
        provider: Provider key understood by ``dispatch_review``.
        provider_token: Token for that provider.
        model_name: Model identifier forwarded to the provider.
        pr_url: Pull-request URL; only the PR number is taken from it here.
        translated_path: Repository path the inline comments are attached to.
        original: Source-language text.
        translated: Translated text under review.

    Returns:
        A dict with ``verdict``, ``summary``, ``comments`` (normalized
        review comments), ``event`` (GitHub event name) and ``payload``
        (the review payload for GitHub).
    """
    _, pr_number = parse_pr_url(pr_url)

    system_prompt, user_prompt = build_messages(
        original=original,
        translated=translated,
        pr_number=pr_number,
        pr_url=pr_url,
    )

    raw = dispatch_review(
        provider=provider,
        token=provider_token,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        model_name=model_name,
    )

    verdict, summary, comments = parse_review_response(raw)
    attach_translated_line_context(translated, comments)

    event = review_event_from_verdict(verdict)
    github_comments = build_review_comments(translated_path, comments)

    # The review body must never be empty: submit_review_to_github treats a
    # dict with an empty "body" as review-format input and would then
    # re-render the already-rendered inline comments. The helper supplies a
    # placeholder for blank summaries and unwraps JSON-wrapped ones.
    payload = build_github_review_payload(
        body=normalize_summary_for_body(summary),
        event=event,
        comments=github_comments,
    )

    return {
        "verdict": verdict,
        "summary": summary,
        "comments": comments,
        "event": event,
        "payload": payload,
    }
+
+
def submit_review_to_github(
    github_token: str,
    pr_url: str,
    translated_path: str,
    payload_or_review: Dict[str, object],
    allow_self_request_changes: bool = True,
) -> Dict[str, object]:
    """Submit a review given either a GitHub payload or a review dict.

    The input counts as a GitHub payload when it has a string ``event`` and
    a truthy ``body``; otherwise it is read as a review dict with
    ``verdict``, ``summary`` and ``comments``, and the inline comments are
    rendered with ``build_review_comments``.

    Returns:
        A dict with ``final_event`` and ``response`` as returned by
        ``submit_pr_review``.

    Raises:
        ValueError: For a ``REQUEST_CHANGES`` review with neither a body
            nor comments.
    """
    repo, pr_number = parse_pr_url(pr_url)

    event = payload_or_review.get("event")
    body = payload_or_review.get("body")
    comments: Optional[List[Dict[str, object]]] = None

    looks_like_payload = isinstance(event, str) and bool(body)
    if looks_like_payload:
        # Already in GitHub payload form: pass the values through.
        event_str = event
        payload_comments = payload_or_review.get("comments")
        if isinstance(payload_comments, list):
            comments = payload_comments
        body_str = str(body)
    else:
        # Review form (verdict / summary / comments): convert it first.
        verdict = str(payload_or_review.get("verdict", "comment")).lower()
        summary = str(payload_or_review.get("summary", "")).strip()
        review_comments = payload_or_review.get("comments", [])
        if not isinstance(review_comments, list):
            review_comments = []

        event_str = review_event_from_verdict(verdict)
        body_str = summary or "LLM translation review"
        comments = build_review_comments(translated_path, review_comments)

    nothing_to_say = not body_str.strip() and not comments
    if event_str == "REQUEST_CHANGES" and nothing_to_say:
        raise ValueError(
            "REQUEST_CHANGES를 보내려면 review 본문 또는 코멘트가 하나 이상 필요합니다."
        )

    response, final_event = submit_pr_review(
        repo_name=repo,
        pr_number=pr_number,
        github_token=github_token,
        body=body_str,
        event=event_str,
        comments=comments,
        allow_self_request_changes=allow_self_request_changes,
    )

    return {"final_event": final_event, "response": response}
+
+
def run_end_to_end(
    provider: str,
    provider_token: str,
    model_name: str,
    github_token: str,
    pr_url: str,
    original_path: str,
    translated_path: str,
    save_review: bool = False,
    save_path: str = "review.json",
    submit_review_flag: bool = False,
) -> Dict[str, object]:
    """Load the PR files, review them, then optionally save and submit.

    Args:
        provider: Provider key understood by ``dispatch_review``.
        provider_token: Token for that provider.
        model_name: Model identifier forwarded to the provider.
        github_token: Token used to read the PR and to submit the review.
        pr_url: Pull-request URL.
        original_path: Repository path of the source-language file.
        translated_path: Repository path of the translated file.
        save_review: Write the review result as JSON when true.
        save_path: Destination file for the saved review; a blank value
            falls back to ``"review.json"``.
        submit_review_flag: Submit the review to GitHub when true.

    Returns:
        A dict with ``repo``, ``pr_number`` and ``review``, plus
        ``saved_to`` and/or ``submission`` when those steps ran.
    """
    repo, pr_number, original, translated = load_pr_files(
        github_token=github_token,
        pr_url=pr_url,
        original_path=original_path,
        translated_path=translated_path,
    )

    review = review_and_emit_payload(
        provider=provider,
        provider_token=provider_token,
        model_name=model_name,
        pr_url=pr_url,
        translated_path=translated_path,
        original=original,
        translated=translated,
    )

    out: Dict[str, object] = {
        "repo": repo,
        "pr_number": pr_number,
        "review": review,
    }

    if save_review:
        # A blank path or a missing directory would otherwise raise here,
        # after the LLM call has completed and before submission.
        # NOTE(review): save_path is caller-supplied and written as given;
        # confirm whether it should be confined to a fixed directory.
        target = Path((save_path or "").strip() or "review.json")
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(json.dumps(review, ensure_ascii=False, indent=2), encoding="utf-8")
        out["saved_to"] = str(target)

    if submit_review_flag:
        # Prefer the ready-made GitHub payload; fall back to the review dict.
        payload = review.get("payload")
        submission = submit_review_to_github(
            github_token=github_token,
            pr_url=pr_url,
            translated_path=translated_path,
            payload_or_review=payload if isinstance(payload, dict) else review,
        )
        out["submission"] = submission

    return out
diff --git a/external/mcp-servers/hf-translation-reviewer/setting.py b/external/mcp-servers/hf-translation-reviewer/setting.py
new file mode 100644
index 0000000..c1acbb9
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/setting.py
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, Optional
+
+import os
+
+try:
+ import yaml # type: ignore
+except Exception:
+ yaml = None
+
+
@dataclass
class AppSettings:
    """Application settings with built-in defaults for every field."""

    # --- LLM provider ---
    default_provider: str = "openai"  # provider preselected in the UI
    default_model: str = "gpt-5"  # model name prefilled in the UI

    # --- GitHub ---
    github_api_base: str = "https://api.github.com"  # base URL for API calls

    # --- Gradio UI ---
    ui_title: str = "LLM Translation Reviewer (PR) — MCP Tools"
    ui_share: bool = True  # passed to launch(share=...)
    ui_launch_mcp_server: bool = True  # passed to launch(mcp_server=...)
+
+
+def _load_yaml(path: Path) -> Dict[str, Any]:
+ if not path.is_file():
+ return {}
+ if yaml is None:
+ # yaml 없으면 config 없이 동작
+ return {}
+ with path.open("r", encoding="utf-8") as f:
+ data = yaml.safe_load(f) or {}
+ if not isinstance(data, dict):
+ return {}
+ return data
+
+
def load_settings(config_path: str = "configs/default.yaml") -> AppSettings:
    """Build ``AppSettings`` from the YAML config and environment variables.

    Provider, model and GitHub API base are taken from the environment
    (``DEFAULT_PROVIDER``, ``DEFAULT_MODEL``, ``GITHUB_API_BASE``) when
    set, otherwise from the config file, otherwise from built-in defaults.
    The UI options are read from the config file only.
    """
    data = _load_yaml(Path(config_path))

    def _section(name: str) -> Dict[str, Any]:
        # A missing or non-mapping section is treated as empty.
        section = data.get(name)
        return section if isinstance(section, dict) else {}

    provider_cfg = _section("provider")
    github_cfg = _section("github")
    ui_cfg = _section("ui")

    return AppSettings(
        default_provider=os.getenv("DEFAULT_PROVIDER", provider_cfg.get("default", "openai")),
        default_model=os.getenv("DEFAULT_MODEL", provider_cfg.get("model", "gpt-5")),
        github_api_base=os.getenv(
            "GITHUB_API_BASE", github_cfg.get("api_base", "https://api.github.com")
        ),
        ui_title=ui_cfg.get("title", "LLM Translation Reviewer (PR) — MCP Tools"),
        ui_share=bool(ui_cfg.get("share", True)),
        ui_launch_mcp_server=bool(ui_cfg.get("launch_mcp_server", True)),
    )


# Module-wide settings instance, loaded once at import time.
SETTINGS: AppSettings = load_settings()
diff --git a/external/mcp-servers/hf-translation-reviewer/tools.py b/external/mcp-servers/hf-translation-reviewer/tools.py
new file mode 100644
index 0000000..2ce9027
--- /dev/null
+++ b/external/mcp-servers/hf-translation-reviewer/tools.py
@@ -0,0 +1,101 @@
+from __future__ import annotations
+
+from typing import Dict
+
+from services import (
+ prepare_translation_context,
+ review_and_emit_payload,
+ submit_review_to_github,
+ run_end_to_end,
+)
+
+
def tool_prepare(
    github_token: str = "",
    pr_url: str = "",
    original_path: str = "",
    translated_path: str = "",
) -> Dict[str, object]:
    """Tool 1: fetch the PR files and build the review prompts.

    Args:
        github_token: GitHub token used to read the pull request.
        pr_url: URL of the pull request.
        original_path: Repository path of the source-language file.
        translated_path: Repository path of the translated file.

    Returns:
        The dict produced by ``prepare_translation_context``.
    """
    return prepare_translation_context(
        github_token,
        pr_url,
        original_path,
        translated_path,
    )
+
+
def tool_review_and_emit(
    provider: str,
    provider_token: str = "",
    model_name: str = "",
    pr_url: str = "",
    translated_path: str = "",
    original: str = "",
    translated: str = "",
) -> Dict[str, object]:
    """Tool 2: review the translation with an LLM and emit a GitHub payload.

    Args:
        provider: LLM provider key.
        provider_token: API token for the provider.
        model_name: Model identifier to use.
        pr_url: URL of the pull request.
        translated_path: Repository path of the translated file.
        original: Source-language text.
        translated: Translated text under review.

    Returns:
        The dict produced by ``review_and_emit_payload``.
    """
    return review_and_emit_payload(
        provider,
        provider_token,
        model_name,
        pr_url,
        translated_path,
        original,
        translated,
    )
+
+
def tool_submit_review(
    github_token: str = "",
    pr_url: str = "",
    translated_path: str = "",
    payload_or_review: Dict[str, object] = None,  # type: ignore[assignment]
    allow_self_request_changes: bool = True,
) -> Dict[str, object]:
    """Tool 3: submit a review to GitHub.

    Args:
        github_token: GitHub token used to post the review.
        pr_url: URL of the pull request.
        translated_path: Repository path of the translated file.
        payload_or_review: GitHub payload or review dict to submit.
        allow_self_request_changes: Forwarded to ``submit_review_to_github``.

    Returns:
        The dict produced by ``submit_review_to_github``.

    Raises:
        ValueError: If ``payload_or_review`` is ``None``.
    """
    if payload_or_review is None:
        raise ValueError("payload_or_review is required")

    return submit_review_to_github(
        github_token,
        pr_url,
        translated_path,
        payload_or_review,
        allow_self_request_changes,
    )
+
+
def tool_end_to_end(
    provider: str,
    provider_token: str = "",
    model_name: str = "",
    github_token: str = "",
    pr_url: str = "",
    original_path: str = "",
    translated_path: str = "",
    save_review: bool = False,
    save_path: str = "review.json",
    submit_review_flag: bool = False,
) -> Dict[str, object]:
    """Tool 4: run file loading, review, saving and submission in one call.

    Args:
        provider: LLM provider key.
        provider_token: API token for the provider.
        model_name: Model identifier to use.
        github_token: GitHub token for reading the PR and posting the review.
        pr_url: URL of the pull request.
        original_path: Repository path of the source-language file.
        translated_path: Repository path of the translated file.
        save_review: Whether to save the review JSON to a file.
        save_path: File path for the saved review.
        submit_review_flag: Whether to submit the review to GitHub.

    Returns:
        The dict produced by ``run_end_to_end``.
    """
    options = {
        "provider": provider,
        "provider_token": provider_token,
        "model_name": model_name,
        "github_token": github_token,
        "pr_url": pr_url,
        "original_path": original_path,
        "translated_path": translated_path,
        "save_review": save_review,
        "save_path": save_path,
        "submit_review_flag": submit_review_flag,
    }
    return run_end_to_end(**options)
diff --git a/logger/github_logger.py b/logger/github_logger.py
index 159123d..b054e54 100644
--- a/logger/github_logger.py
+++ b/logger/github_logger.py
@@ -1,71 +1,71 @@
-import os
-import base64
-from typing import Optional
-
-try:
- from github import Github, GithubException
- LIBS_OK = True
-except ImportError:
- LIBS_OK = False
-
-class GitHubLogger:
- """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.
-
- Env vars:
- - LOG_GITHUB_TOKEN (fallback: GITHUB_TOKEN)
- - LOG_REPO (format: owner/repo)
- - LOG_BRANCH (default: 'log_event')
- - LOG_FILE_PATH (default: 'pr_success.log')
- """
-
- def __init__(self):
- if not LIBS_OK:
- raise ImportError("PyGithub not installed. Please install PyGithub.")
- token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
- if not token:
- raise ValueError("Missing LOG_GITHUB_TOKEN or GITHUB_TOKEN for logging")
- self._client = Github(token)
-
- repo_spec = os.environ.get("LOG_REPO")
- if not repo_spec or "/" not in repo_spec:
- raise ValueError("Missing or invalid LOG_REPO. Expected 'owner/repo'.")
- self.owner, self.repo_name = repo_spec.split("/", 1)
-
- self.branch = os.environ.get("LOG_BRANCH", "log_event")
- self.path = os.environ.get("LOG_FILE_PATH", "pr_success.log")
-
- def _ensure_branch(self, repo):
- try:
- repo.get_branch(self.branch)
- except GithubException as e:
- if e.status == 404:
- base = repo.get_branch(repo.default_branch)
- repo.create_git_ref(ref=f"refs/heads/{self.branch}", sha=base.commit.sha)
- else:
- raise
-
- def append_jsonl(self, jsonl_line: str, commit_message: str = "chore(log): append entry") -> str:
- repo = self._client.get_repo(f"{self.owner}/{self.repo_name}")
- self._ensure_branch(repo)
- try:
- existing = repo.get_contents(self.path, ref=self.branch)
- existing_content = base64.b64decode(existing.content).decode("utf-8")
- new_content = existing_content + jsonl_line
- repo.update_file(
- path=self.path,
- message=commit_message,
- content=new_content,
- sha=existing.sha,
- branch=self.branch,
- )
- return "SUCCESS: Log appended"
- except GithubException as e:
- if e.status == 404:
- repo.create_file(
- path=self.path,
- message=commit_message,
- content=jsonl_line,
- branch=self.branch,
- )
- return "SUCCESS: Log file created and first entry appended"
- raise
+import os
+import base64
+from typing import Optional
+
+try:
+ from github import Github, GithubException
+ LIBS_OK = True
+except ImportError:
+ LIBS_OK = False
+
class GitHubLogger:
    """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.

    Env vars:
    - LOG_GITHUB_TOKEN (fallback: GITHUB_TOKEN)
    - LOG_REPO (format: owner/repo)
    - LOG_BRANCH (default: 'log_event')
    - LOG_FILE_PATH (default: 'pr_success.log')
    """

    def __init__(self):
        """Read the configuration from the environment and create the client.

        Raises:
            ImportError: If PyGithub could not be imported.
            ValueError: If no token is set, or LOG_REPO is missing or has
                no ``/`` in it.
        """
        if not LIBS_OK:
            raise ImportError("PyGithub not installed. Please install PyGithub.")
        token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
        if not token:
            raise ValueError("Missing LOG_GITHUB_TOKEN or GITHUB_TOKEN for logging")
        self._client = Github(token)

        repo_spec = os.environ.get("LOG_REPO")
        if not repo_spec or "/" not in repo_spec:
            raise ValueError("Missing or invalid LOG_REPO. Expected 'owner/repo'.")
        self.owner, self.repo_name = repo_spec.split("/", 1)

        self.branch = os.environ.get("LOG_BRANCH", "log_event")
        self.path = os.environ.get("LOG_FILE_PATH", "pr_success.log")

    def _ensure_branch(self, repo):
        """Create the log branch from the default branch if it is missing."""
        try:
            repo.get_branch(self.branch)
        except GithubException as e:
            if e.status == 404:
                base = repo.get_branch(repo.default_branch)
                repo.create_git_ref(ref=f"refs/heads/{self.branch}", sha=base.commit.sha)
            else:
                raise

    def append_jsonl(self, jsonl_line: str, commit_message: str = "chore(log): append entry") -> str:
        """Append one entry to the log file, creating the file if needed.

        The entry is always written newline-terminated, and a newline is
        inserted first when the existing content does not end with one, so
        each entry stays on its own line.

        Args:
            jsonl_line: The serialized entry, with or without a trailing
                newline.
            commit_message: Commit message used for the update or creation.

        Returns:
            A status string describing whether the file was appended to or
            created.

        Raises:
            GithubException: For any GitHub error other than the 404 that
                signals a missing log file.
        """
        entry = jsonl_line if jsonl_line.endswith("\n") else jsonl_line + "\n"

        repo = self._client.get_repo(f"{self.owner}/{self.repo_name}")
        self._ensure_branch(repo)
        try:
            existing = repo.get_contents(self.path, ref=self.branch)
            existing_content = base64.b64decode(existing.content).decode("utf-8")
            if existing_content and not existing_content.endswith("\n"):
                # Keep the previous entry and the new one on separate lines.
                existing_content += "\n"
            repo.update_file(
                path=self.path,
                message=commit_message,
                content=existing_content + entry,
                sha=existing.sha,
                branch=self.branch,
            )
            return "SUCCESS: Log appended"
        except GithubException as e:
            if e.status == 404:
                repo.create_file(
                    path=self.path,
                    message=commit_message,
                    content=entry,
                    branch=self.branch,
                )
                return "SUCCESS: Log file created and first entry appended"
            raise
diff --git a/pr_generator/agent.py b/pr_generator/agent.py
index 8e43d5c..88650d1 100644
--- a/pr_generator/agent.py
+++ b/pr_generator/agent.py
@@ -1,596 +1,596 @@
-"""
-GitHub PR creation agent using Langchain.
-This code integrates with the actual GitHub API using the PyGithub library.
-Please set the GITHUB_TOKEN environment variable and install required libraries before running.
-"""
-
-import os
-import re
-import json
-from typing import Optional, Dict, List, Tuple, Any
-
-# Load environment variables from .env file
-from dotenv import load_dotenv
-from translator.content import llm_translate
-
-load_dotenv()
-
-# Constants definition
-ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
-DEFAULT_TEMPERATURE = 0.0
-
-# Library imports and error handling
-try:
- from github import Github, GithubException
- from github.GitRef import GitRef
- from langchain_anthropic import ChatAnthropic
-
- REQUIRED_LIBS_AVAILABLE = True
-except ImportError as e:
- print(f"Required libraries are not installed: {e}")
- print("Please run: pip install PyGithub boto3 langchain-anthropic")
- REQUIRED_LIBS_AVAILABLE = False
-
-
-class GitHubPRAgent:
- """Agent class for GitHub PR creation"""
-
- def __init__(self, user_owner: str = None, user_repo: str = None, base_owner: str = None, base_repo: str = None):
- self._github_client = None
- self._llm = None
- self.user_owner = user_owner
- self.user_repo = user_repo
- self.base_owner = base_owner
- self.base_repo = base_repo
-
- @property
- def github_client(self) -> Optional[Github]:
- """Return GitHub API client with lazy initialization."""
- if not REQUIRED_LIBS_AVAILABLE:
- raise ImportError("Required libraries not found.")
-
- if self._github_client is None:
- token = os.environ.get("GITHUB_TOKEN")
- if not token:
- print("Warning: GITHUB_TOKEN environment variable not set.")
- return Github() # Limited access
- self._github_client = Github(token)
-
- return self._github_client
-
- @property
- def llm(self):
- """Return LLM client with lazy initialization."""
- if not REQUIRED_LIBS_AVAILABLE:
- raise ImportError("Required libraries not found.")
-
- if self._llm is None:
- self._llm = ChatAnthropic(
- model=ANTHROPIC_MODEL_ID,
- temperature=DEFAULT_TEMPERATURE,
- )
- return self._llm
-
- def _handle_github_error(self, e: Exception, operation: str) -> str:
- """Handle GitHub API errors consistently."""
- if isinstance(e, GithubException):
- return f"{operation} failed: {e.status} {e.data.get('message', e.data)}"
- return f"Unexpected error during {operation}: {str(e)}"
-
- def create_pull_request(
- self,
- owner: str,
- repo_name: str,
- title: str,
- head: str,
- base: str,
- body: str = "",
- draft: bool = False,
- maintainer_can_modify: bool = True,
- ) -> str:
- """Create a new Pull Request."""
- try:
- # 1. Check if head and base are the same
- if head == base:
- return f"ERROR: head branch ({head}) and base branch ({base}) are identical."
-
- # 2. Check for existing PR
- existing_pr = self.check_existing_pr(owner, repo_name, head, base)
- if existing_pr:
- return f"ERROR: {existing_pr}"
-
- # 3. Verify head and base branches exist
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
- try:
- # For fork-to-upstream PR, head format is "fork_owner:branch_name"
- if ":" in head:
- fork_owner, branch_name = head.split(":", 1)
- fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
- head_branch = fork_repo.get_branch(branch_name)
- else:
- head_branch = repo.get_branch(head)
-
- base_branch = repo.get_branch(base)
-
- # 4. Check if head and base branches point to the same commit
- if head_branch.commit.sha == base_branch.commit.sha:
- return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."
-
- except GithubException as e:
- if e.status == 404:
- return f"ERROR: Branch not found. head: {head}, base: {base}"
-
- # 5. Create PR
- pr = repo.create_pull(
- title=title,
- body=body,
- head=head,
- base=base,
- draft=draft,
- maintainer_can_modify=maintainer_can_modify,
- )
- return f"PR creation successful: {pr.html_url}"
- except GithubException as e:
- if e.status == 422:
- error_msg = e.data.get("message", "Unknown error")
- errors = e.data.get("errors", [])
-
- error_details = []
- for error in errors:
- if "message" in error:
- error_details.append(error["message"])
-
- detail_msg = " | ".join(error_details) if error_details else ""
- return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
- return self._handle_github_error(e, "PR creation")
- except Exception as e:
- return self._handle_github_error(e, "PR creation")
-
- def create_branch(
- self, owner: str, repo_name: str, branch_name: str, source_sha: str
- ) -> str:
- """Create a new branch."""
- try:
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
- ref_name = f"refs/heads/{branch_name}"
- new_ref = repo.create_git_ref(ref=ref_name, sha=source_sha)
-
- if isinstance(new_ref, GitRef):
- return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {new_ref.ref})"
- return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
- except GithubException as e:
- if e.status == 422 and "Reference already exists" in str(e.data):
- return f"WARNING: Branch '{branch_name}' already exists."
- return self._handle_github_error(e, "branch creation")
- except Exception as e:
- return self._handle_github_error(e, "branch creation")
-
- def check_existing_pr(
- self, owner: str, repo_name: str, head: str, base: str
- ) -> Optional[str]:
- """Check if there's an existing PR with the same head and base."""
- try:
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
- # For head parameter, use exactly what was passed (could be "fork_owner:branch" or just "branch")
- search_head = head if ":" in head else f"{owner}:{head}"
- pulls = repo.get_pulls(state="open", head=search_head, base=base)
- for pr in pulls:
- return f"Existing PR found: {pr.html_url}"
- return None
- except Exception as e:
- print(f"⚠️ Error checking existing PR: {str(e)}")
- return None
-
- def create_or_update_file(
- self,
- owner: str,
- repo_name: str,
- path: str,
- message: str,
- content: str,
- branch_name: Optional[str] = None,
- sha_blob: Optional[str] = None,
- ) -> str:
- """Create or update a single file."""
- try:
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
-
- args = {
- "path": path,
- "message": message,
- "content": content,
- }
- if branch_name:
- args["branch"] = branch_name
-
- # Try to update file
- if sha_blob:
- args["sha"] = sha_blob
- repo.update_file(**args)
- return f"SUCCESS: File updated - {path}"
-
- # Try to create file
- repo.create_file(**args)
- return f"SUCCESS: File created - {path}"
-
- except GithubException as e:
- # Try to update if file already exists
- if e.status == 422:
- try:
- existing_file = repo.get_contents(
- path, ref=branch_name or repo.default_branch
- )
- args["sha"] = existing_file.sha
- repo.update_file(**args)
- return f"SUCCESS: File updated - {path}"
- except:
- pass
- return f"ERROR: File processing failed - {path}"
- except Exception:
- return f"ERROR: File processing failed - {path}"
-
- def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
- """Analyze reference PR to extract style information."""
- try:
- # Parse PR URL
- match = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
- if not match:
- return {"error": f"Invalid PR URL format: {pr_url}"}
-
- owner, repo_name, pr_number = match.groups()
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
- pr = repo.get_pull(int(pr_number))
-
- return {
- "title": pr.title,
- "body": pr.body,
- "head_branch": pr.head.ref,
- "base_branch": pr.base.ref,
- "files_changed": [f.filename for f in pr.get_files()],
- "commits": [
- {"message": c.commit.message, "sha": c.sha}
- for c in pr.get_commits()
- ],
- }
- except Exception as e:
- return {"error": f"Error occurred during PR analysis: {str(e)}"}
-
- def _generate_with_llm(
- self, prompt: str, fallback_value: str, operation: str
- ) -> str:
- """Generate text using LLM."""
- try:
- _usage_info, generated = llm_translate(prompt)
- generated = generated.strip()
- print(f"LLM generated {operation}: {generated}")
- return generated
- except Exception as e:
- print(f"❌ Error generating {operation} with LLM: {e}")
- print(f"Using fallback value: {fallback_value}")
- return fallback_value
-
- def generate_branch_name_from_reference(
- self, reference_branch_name: str, target_language: str, file_name: str
- ) -> str:
- """Generate branch name using simple template."""
- # Keep .md extension and make branch-safe
- branch_safe_name = file_name.replace('_', '-')
- return f"{target_language}-{branch_safe_name}"
-
- def generate_pr_content_from_reference(
- self,
- reference_title: str,
- reference_body: str,
- target_language: str,
- filepath: str,
- target_filepath: str,
- file_name: str,
- ) -> Tuple[str, str]:
- """Use LLM to analyze reference PR title and body and generate appropriate PR content."""
- prompt = f"""Here is the reference PR information:
-
-Reference PR title: {reference_title}
-
-Reference PR body:
-{reference_body}
-
-Now I need to generate PR title and body for a new translation task:
-- Target language: {target_language}
-- Original file: {filepath}
-- Translation file: {target_filepath}
-- File name: {file_name}
-
-Please analyze the style and format of the reference PR to generate consistent new PR title and body.
-
-Requirements:
-1. Follow the title format and pattern of the reference PR
-2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
-3. Appropriately reflect the target language ({target_language}) and file paths
-4. If there are user mentions (@username), change them to general text instead of actual mentions
-5. Adjust the content to fit the translation task
-
-Response format:
-Title: [PR title here]
-Body: [PR body here, maintaining the exact markdown format and structure of the original]"""
-
- try:
- _usage_info, generated_content = llm_translate(prompt)
- generated_content = generated_content.strip()
-
- # Separate title and body from response
- lines = generated_content.split("\n")
- title_line = ""
- body_lines = []
- parsing_body = False
-
- for line in lines:
- if line.startswith("Title:"):
- title_line = line.replace("Title:", "").strip()
- elif line.startswith("Body:"):
- parsing_body = True
- body_content = line.replace("Body:", "").strip()
- if body_content:
- body_lines.append(body_content)
- elif parsing_body:
- body_lines.append(line)
-
- generated_title = title_line if title_line else reference_title
- generated_body = (
- "\n".join(body_lines)
- if body_lines
- else f"Add {target_language} translation for `{filepath}`."
- )
-
- print(f"LLM generated PR title: {generated_title}")
- print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")
-
- return generated_title, generated_body
-
- except Exception as e:
- print(f"❌ Error generating PR content with LLM: {e}")
- return self._generate_default_pr_content(
- target_language, filepath, target_filepath, file_name
- )
-
- def _generate_default_pr_content(
- self, target_language: str, filepath: str, target_filepath: str, file_name: str
- ) -> Tuple[str, str]:
- """Generate default PR content."""
- title = f"🌐 [i18n-{target_language}] Translated `{file_name}` to {target_language}"
- body = f"""# What does this PR do?
-
-Translated the `{filepath}` file of the documentation to {target_language} 😄
-Thank you in advance for your review!
-
-Part of https://github.com/huggingface/transformers/issues/20179
-
-## Before reviewing
-- [x] Check for missing / redundant translations (번역 누락/중복 검사)
-- [x] Grammar Check (맞춤법 검사)
-- [x] Review or Add new terms to glossary (용어 확인 및 추가)
-- [x] Check Inline TOC (e.g. `[[lowercased-header]]`)
-- [x] Check live-preview for gotchas (live-preview로 정상작동 확인)
-
-## Who can review? (Initial)
-{target_language} translation reviewers
-
-## Before submitting
-- [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
-- [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
- Pull Request section?
-- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
- to it if that's the case.
-- [x] Did you make sure to update the documentation with your changes? Here are the
- [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
- [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
-- [ ] Did you write any new necessary tests?
-
-## Who can review? (Final)
- May you please review this PR?
-Documentation maintainers
-"""
- return title, body
-
- def generate_commit_message_from_reference(
- self, commit_messages: List[str], target_language: str, file_name: str
- ) -> str:
- """Generate simple commit message using template."""
- return f"docs: {target_language}: {file_name}"
-
- def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
- """Get information about an existing branch."""
- try:
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
- branch = repo.get_branch(branch_name)
- commit = branch.commit
- commit_info = commit.commit
-
- return f"""
-📋 Existing branch information:
- - Branch name: {branch_name}
- - Latest commit: {commit.sha[:8]}
- - Commit message: {commit_info.message.split(chr(10))[0][:80]}...
- - Author: {commit_info.author.name}
- - Date: {commit_info.author.date.strftime('%Y-%m-%d %H:%M:%S')}
- """
- except Exception as e:
- return f"Failed to retrieve branch information: {str(e)}"
-
- def run_translation_pr_workflow(
- self,
- reference_pr_url: str,
- target_language: str,
- filepath: str,
- translated_doc: str,
- base_branch: str = "main",
- ) -> Dict[str, Any]:
- """Execute translation document PR creation workflow."""
- try:
- # 1. Analyze reference PR
- print(f"🔍 Analyzing reference PR: {reference_pr_url}")
- pr_analysis = self.analyze_reference_pr(reference_pr_url)
-
- if "error" in pr_analysis:
- return {"status": "error", "message": pr_analysis["error"]}
-
- print("Reference PR analysis completed")
-
- # 2. Generate translation file path and branch name
- target_filepath = filepath.replace("/en/", f"/{target_language}/")
- file_name = filepath.split("/")[-1] # Keep .md extension
-
- print(f"🌿 Generating branch name...")
- branch_name = self.generate_branch_name_from_reference(
- pr_analysis["head_branch"], target_language, file_name
- )
-
- # 3. Get main branch SHA from upstream and create branch in fork
- upstream_repo = self.github_client.get_repo(f"{self.base_owner}/{self.base_repo}")
- main_branch = upstream_repo.get_branch(base_branch)
- main_sha = main_branch.commit.sha
-
- print(f"🌿 Creating branch: {branch_name} in fork repository")
- branch_result = self.create_branch(self.user_owner, self.user_repo, branch_name, main_sha)
-
- # Check branch creation result
- if branch_result.startswith("ERROR"):
- return {
- "status": "error",
- "message": f"Branch creation failed: {branch_result}\n\nTarget: {self.user_owner}/{self.user_repo}\nBranch: {branch_name}\nBase SHA: {main_sha[:8]}",
- "branch": branch_name,
- "error_details": branch_result,
- }
- elif branch_result.startswith("WARNING"):
- print(f"⚠️ {branch_result}")
- # Continue if branch already exists
- elif branch_result.startswith("SUCCESS"):
- print(f"✅ {branch_result}")
- else:
- print(f"⚠️ Unexpected branch creation result: {branch_result}")
- # Continue anyway, might still work
-
- # 4. Generate commit message and save file
- commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
- commit_message = self.generate_commit_message_from_reference(
- commit_messages, target_language, file_name
- )
-
- print(f"📄 Saving file: {target_filepath}")
- file_result = self.create_or_update_file(
- self.user_owner,
- self.user_repo,
- target_filepath,
- commit_message,
- translated_doc,
- branch_name,
- )
-
- if not file_result.startswith("SUCCESS"):
- return {
- "status": "error",
- "message": f"File save failed: {file_result}\n\n🎯 Target: {self.user_owner}/{self.user_repo} (expected: {target_language} fork of {self.base_owner}/{self.base_repo})\n🌿 Branch: {branch_name}\n📁 File: {target_filepath}",
- "branch": branch_name,
- "file_path": target_filepath,
- "error_details": file_result,
- }
-
- print(f"{file_result}")
-
- # 5. Create PR
- pr_title, pr_body = self.generate_pr_content_from_reference(
- pr_analysis["title"],
- pr_analysis["body"],
- target_language,
- filepath,
- target_filepath,
- file_name,
- )
-
- print(f"🔄 Creating PR: {pr_title}")
- print(f" Head: {self.user_owner}:{branch_name} → Base: {self.base_owner}:{base_branch}")
-
- # Create PR from fork to upstream repository
- pr_result = self.create_pull_request(
- self.base_owner, self.base_repo, pr_title, f"{self.user_owner}:{branch_name}", base_branch, pr_body, draft=True
- )
-
- if pr_result.startswith("ERROR"):
- print(f"❌ {pr_result}")
- return {
- "status": "partial_success",
- "branch": branch_name,
- "file_path": target_filepath,
- "message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
- "error_details": pr_result,
- }
- elif "successful" in pr_result and "http" in pr_result:
- print(f"{pr_result}")
- return {
- "status": "success",
- "branch": branch_name,
- "file_path": target_filepath,
- "pr_url": pr_result.split(": ")[-1],
- "message": "Translation document PR created successfully!",
- }
- else:
- return {
- "status": "partial_success",
- "branch": branch_name,
- "file_path": target_filepath,
- "message": "File was saved but PR creation failed.",
- }
-
- except Exception as e:
- return {
- "status": "error",
- "message": f"Workflow execution failed: {str(e)}\n\nConfig: {self.user_owner}/{self.user_repo} → {self.base_owner}/{self.base_repo}\nFile: {filepath if 'filepath' in locals() else 'Unknown'}",
- "error_details": str(e),
- }
-
-
-# Backward compatibility functions (maintain compatibility with existing code)
-_agent = GitHubPRAgent()
-
-
-def get_github_client():
- return _agent.github_client
-
-
-def create_pull_request_func(*args, **kwargs):
- return _agent.create_pull_request(*args, **kwargs)
-
-
-def create_branch_func(*args, **kwargs):
- return _agent.create_branch(*args, **kwargs)
-
-
-def create_or_update_file_func(*args, **kwargs):
- return _agent.create_or_update_file(*args, **kwargs)
-
-
-def analyze_reference_pr_func(*args, **kwargs):
- return _agent.analyze_reference_pr(*args, **kwargs)
-
-
-def generate_branch_name_from_reference(*args, **kwargs):
- return _agent.generate_branch_name_from_reference(*args, **kwargs)
-
-
-def generate_pr_content_from_reference(*args, **kwargs):
- return _agent.generate_pr_content_from_reference(*args, **kwargs)
-
-
-def generate_default_pr_content(*args, **kwargs):
- return _agent._generate_default_pr_content(*args, **kwargs)
-
-
-def generate_commit_message_from_reference(*args, **kwargs):
- return _agent.generate_commit_message_from_reference(*args, **kwargs)
-
-
-def get_branch_info(*args, **kwargs):
- return _agent.get_branch_info(*args, **kwargs)
-
-
-def run_translation_pr_agent_simple(*args, **kwargs):
- return _agent.run_translation_pr_workflow(*args, **kwargs)
+"""
+GitHub PR creation agent using Langchain.
+This code integrates with the actual GitHub API using the PyGithub library.
+Please set the GITHUB_TOKEN environment variable and install required libraries before running.
+"""
+
+import os
+import re
+import json
+from typing import Optional, Dict, List, Tuple, Any
+
+# Load environment variables from .env file
+from dotenv import load_dotenv
+from translator.content import llm_translate
+
+load_dotenv()
+
+# Constants definition
+ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
+DEFAULT_TEMPERATURE = 0.0
+
+# Library imports and error handling
+try:
+ from github import Github, GithubException
+ from github.GitRef import GitRef
+ from langchain_anthropic import ChatAnthropic
+
+ REQUIRED_LIBS_AVAILABLE = True
+except ImportError as e:
+ print(f"Required libraries are not installed: {e}")
+ print("Please run: pip install PyGithub boto3 langchain-anthropic")
+ REQUIRED_LIBS_AVAILABLE = False
+
+
+class GitHubPRAgent:
+ """Agent class for GitHub PR creation"""
+
+    def __init__(self, user_owner: Optional[str] = None, user_repo: Optional[str] = None, base_owner: Optional[str] = None, base_repo: Optional[str] = None):
+        """Store the fork (user_*) and upstream (base_*) repository coordinates; API clients are created lazily."""
+        self.user_owner, self.user_repo = user_owner, user_repo
+        self.base_owner, self.base_repo = base_owner, base_repo
+        # Populated on first access of the `github_client` / `llm` properties.
+        self._github_client = None
+        self._llm = None
+
+    @property
+    def github_client(self) -> "Optional[Github]":
+        """Return the PyGithub client, created lazily; only a token-authenticated client is cached."""
+        # The return annotation is quoted so the class still defines when the PyGithub import failed.
+        # Raises ImportError when the required libraries are missing.
+        if not REQUIRED_LIBS_AVAILABLE:
+            raise ImportError("Required libraries not found.")
+        if self._github_client is None:
+            token = os.environ.get("GITHUB_TOKEN")
+            if not token:
+                print("Warning: GITHUB_TOKEN environment variable not set.")
+                return Github()  # Limited access
+            self._github_client = Github(token)
+        return self._github_client
+
+    @property
+    def llm(self):
+        """Return the shared ChatAnthropic client, building it on first access."""
+        if not REQUIRED_LIBS_AVAILABLE:
+            raise ImportError("Required libraries not found.")
+        # Reuse the cached instance when one exists.
+        cached = self._llm
+        if cached is not None:
+            return cached
+        cached = ChatAnthropic(model=ANTHROPIC_MODEL_ID, temperature=DEFAULT_TEMPERATURE)
+        self._llm = cached
+        return cached
+
+    def _handle_github_error(self, e: Exception, operation: str) -> str:
+        """Turn an exception raised during `operation` into a one-line message; tolerates a non-dict `e.data`."""
+        if isinstance(e, GithubException):
+            return f"{operation} failed: {e.status} {e.data.get('message', e.data) if isinstance(e.data, dict) else e.data}"
+        return f"Unexpected error during {operation}: {str(e)}"
+
+    def create_pull_request(
+        self,
+        owner: str,
+        repo_name: str,
+        title: str,
+        head: str,
+        base: str,
+        body: str = "",
+        draft: bool = False,
+        maintainer_can_modify: bool = True,
+    ) -> str:
+        """Open a pull request on `owner/repo_name` from `head` into `base`.
+
+        `head` is either a branch name or "fork_owner:branch" for a fork-to-upstream PR.
+        Returns "PR creation successful: <url>" on success; failures are returned as message
+        strings ("ERROR: ..." for the pre-checks and 422 responses) instead of being raised.
+        """
+        try:
+            # 1. Check if head and base are the same
+            if head == base:
+                return f"ERROR: head branch ({head}) and base branch ({base}) are identical."
+            # 2. Check for existing PR
+            existing_pr = self.check_existing_pr(owner, repo_name, head, base)
+            if existing_pr:
+                return f"ERROR: {existing_pr}"
+            # 3. Verify head and base branches exist
+            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
+            try:
+                # For fork-to-upstream PR, head format is "fork_owner:branch_name"
+                if ":" in head:
+                    fork_owner, branch_name = head.split(":", 1)
+                    fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
+                    head_branch = fork_repo.get_branch(branch_name)
+                else:
+                    head_branch = repo.get_branch(head)
+                base_branch = repo.get_branch(base)
+                # 4. Check if head and base branches point to the same commit
+                if head_branch.commit.sha == base_branch.commit.sha:
+                    return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."
+            except GithubException as e:
+                if e.status == 404:
+                    return f"ERROR: Branch not found. head: {head}, base: {base}"
+                # Other lookup failures fall through: create_pull below is still attempted.
+            # 5. Create PR
+            pr = repo.create_pull(
+                title=title,
+                body=body,
+                head=head,
+                base=base,
+                draft=draft,
+                maintainer_can_modify=maintainer_can_modify,
+            )
+            return f"PR creation successful: {pr.html_url}"
+        except GithubException as e:
+            if e.status == 422:
+                # e.data is not guaranteed to be a dict, and "errors" entries may be plain strings.
+                data = e.data if isinstance(e.data, dict) else {}
+                error_msg = data.get("message", "Unknown error")
+                error_details = [
+                    err["message"] if isinstance(err, dict) else str(err)
+                    for err in data.get("errors") or []
+                    if not isinstance(err, dict) or "message" in err
+                ]
+                detail_msg = " | ".join(error_details) if error_details else ""
+                return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
+            return self._handle_github_error(e, "PR creation")
+        except Exception as e:
+            return self._handle_github_error(e, "PR creation")
+
+    def create_branch(
+        self, owner: str, repo_name: str, branch_name: str, source_sha: str
+    ) -> str:
+        """Create branch `branch_name` at commit `source_sha`; the outcome is returned as a message string."""
+        try:
+            repository = self.github_client.get_repo(f"{owner}/{repo_name}")
+            created = repository.create_git_ref(ref=f"refs/heads/{branch_name}", sha=source_sha)
+            if not isinstance(created, GitRef):
+                return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
+            return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {created.ref})"
+        except GithubException as e:
+            # A 422 that mentions an existing reference is reported as a warning, not a failure.
+            already_exists = e.status == 422 and "Reference already exists" in str(e.data)
+            if already_exists:
+                return f"WARNING: Branch '{branch_name}' already exists."
+            return self._handle_github_error(e, "branch creation")
+        except Exception as e:
+            return self._handle_github_error(e, "branch creation")
+
+    def check_existing_pr(
+        self, owner: str, repo_name: str, head: str, base: str
+    ) -> Optional[str]:
+        """Return "Existing PR found: <url>" for the first open PR with this head/base, else None (also on any error)."""
+        try:
+            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
+            # A bare branch name is qualified with `owner`; an already-qualified head is used as passed.
+            qualified_head = head if ":" in head else f"{owner}:{head}"
+            open_prs = repo.get_pulls(state="open", head=qualified_head, base=base)
+            # Only the first match matters, so stop after fetching it.
+            first_match = next(iter(open_prs), None)
+            return None if first_match is None else f"Existing PR found: {first_match.html_url}"
+        except Exception as e:
+            print(f"⚠️ Error checking existing PR: {str(e)}")
+            return None
+
+    def create_or_update_file(
+        self,
+        owner: str,
+        repo_name: str,
+        path: str,
+        message: str,
+        content: str,
+        branch_name: Optional[str] = None,
+        sha_blob: Optional[str] = None,
+    ) -> str:
+        """Create `path` on the branch, or update it when `sha_blob` is given or a 422 is returned.
+
+        Returns "SUCCESS: File created/updated - <path>" or "ERROR: File processing failed - <path>";
+        the underlying error is printed rather than raised.
+        """
+        try:
+            repo = self.github_client.get_repo(f"{owner}/{repo_name}")
+            args = {
+                "path": path,
+                "message": message,
+                "content": content,
+            }
+            if branch_name:
+                args["branch"] = branch_name
+            # An explicit blob SHA selects the update path straight away.
+            if sha_blob:
+                args["sha"] = sha_blob
+                repo.update_file(**args)
+                return f"SUCCESS: File updated - {path}"
+            repo.create_file(**args)
+            return f"SUCCESS: File created - {path}"
+        except GithubException as e:
+            # A 422 is treated as "file already exists": look up its SHA and update instead.
+            if e.status == 422:
+                try:
+                    existing_file = repo.get_contents(path, ref=branch_name or repo.default_branch)
+                    args["sha"] = existing_file.sha
+                    repo.update_file(**args)
+                    return f"SUCCESS: File updated - {path}"
+                except Exception as retry_error:  # was a bare `except: pass`, which hid the cause
+                    print(f"❌ Update after 422 failed for {path}: {retry_error}")
+            else:
+                print(f"❌ GitHub error while saving {path}: {e.status} {e.data}")
+            return f"ERROR: File processing failed - {path}"
+        except Exception as e:
+            print(f"❌ Unexpected error while saving {path}: {e}")
+            return f"ERROR: File processing failed - {path}"
+
+    def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
+        """Fetch title, body, branches, changed files and commits of the PR at `pr_url`.
+
+        Returns a dict with those fields, or {"error": <message>} for a malformed URL or any failure.
+        """
+        try:
+            parsed = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
+            if parsed is None:
+                return {"error": f"Invalid PR URL format: {pr_url}"}
+            owner, repo_name, pr_number = parsed.groups()
+            pull = self.github_client.get_repo(f"{owner}/{repo_name}").get_pull(int(pr_number))
+            # Fill the summary key by key, reading each value from the PR in turn.
+            summary: Dict[str, Any] = {}
+            summary["title"] = pull.title
+            summary["body"] = pull.body
+            summary["head_branch"] = pull.head.ref
+            summary["base_branch"] = pull.base.ref
+            summary["files_changed"] = [changed.filename for changed in pull.get_files()]
+            summary["commits"] = [
+                {"message": entry.commit.message, "sha": entry.sha}
+                for entry in pull.get_commits()
+            ]
+            return summary
+        except Exception as e:
+            return {"error": f"Error occurred during PR analysis: {str(e)}"}
+
+    def _generate_with_llm(
+        self, prompt: str, fallback_value: str, operation: str
+    ) -> str:
+        """Send `prompt` through llm_translate and return the stripped text, or `fallback_value` on any error."""
+        try:
+            _, raw_text = llm_translate(prompt)
+            cleaned = raw_text.strip()
+            print(f"LLM generated {operation}: {cleaned}")
+            return cleaned
+        except Exception as e:
+            print(f"❌ Error generating {operation} with LLM: {e}")
+            print(f"Using fallback value: {fallback_value}")
+            return fallback_value
+
+    def generate_branch_name_from_reference(
+        self, reference_branch_name: str, target_language: str, file_name: str
+    ) -> str:
+        """Build "<target_language>-<file_name>" with underscores in the file name turned into hyphens."""
+        # `reference_branch_name` is not used by this template; the file extension is kept.
+        hyphenated = "-".join(file_name.split("_"))
+        return f"{target_language}-{hyphenated}"
+
+    def generate_pr_content_from_reference(
+        self,
+        reference_title: str,
+        reference_body: str,
+        target_language: str,
+        filepath: str,
+        target_filepath: str,
+        file_name: str,
+    ) -> Tuple[str, str]:
+        """Ask the LLM for a PR title and body styled after the reference PR; returns (title, body), or the default content if the call fails."""
+        prompt = f"""Here is the reference PR information:
+
+Reference PR title: {reference_title}
+
+Reference PR body:
+{reference_body}
+
+Now I need to generate PR title and body for a new translation task:
+- Target language: {target_language}
+- Original file: {filepath}
+- Translation file: {target_filepath}
+- File name: {file_name}
+
+Please analyze the style and format of the reference PR to generate consistent new PR title and body.
+
+Requirements:
+1. Follow the title format and pattern of the reference PR
+2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
+3. Appropriately reflect the target language ({target_language}) and file paths
+4. If there are user mentions (@username), change them to general text instead of actual mentions
+5. Adjust the content to fit the translation task
+
+Response format:
+Title: [PR title here]
+Body: [PR body here, maintaining the exact markdown format and structure of the original]"""
+
+        try:
+            _usage_info, generated_content = llm_translate(prompt)
+            generated_content = generated_content.strip()
+            # Split the reply into title and body. Once "Body:" has been seen every later line is
+            # body text, even one that itself starts with "Title:" or "Body:".
+            title_line = ""
+            body_lines = []
+            parsing_body = False
+
+            for line in generated_content.split("\n"):
+                if parsing_body:
+                    body_lines.append(line)
+                elif line.startswith("Title:"):
+                    # Drop only the leading marker; str.replace would also remove later occurrences.
+                    title_line = line[len("Title:"):].strip()
+                elif line.startswith("Body:"):
+                    parsing_body = True
+                    body_content = line[len("Body:"):].strip()
+                    if body_content:
+                        body_lines.append(body_content)
+
+            generated_title = title_line if title_line else reference_title
+            generated_body = (
+                "\n".join(body_lines)
+                if body_lines
+                else f"Add {target_language} translation for `{filepath}`."
+            )
+
+            print(f"LLM generated PR title: {generated_title}")
+            print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")
+
+            return generated_title, generated_body
+
+        except Exception as e:
+            print(f"❌ Error generating PR content with LLM: {e}")
+            return self._generate_default_pr_content(
+                target_language, filepath, target_filepath, file_name
+            )
+
+    def _generate_default_pr_content(
+        self, target_language: str, filepath: str, target_filepath: str, file_name: str
+    ) -> Tuple[str, str]:
+        """Return the built-in (title, body) PR template filled with the language, source path and file name; `target_filepath` is not used."""
+        title = f"🌐 [i18n-{target_language}] Translated `{file_name}` to {target_language}"
+        body = f"""# What does this PR do?
+
+Translated the `{filepath}` file of the documentation to {target_language} 😄
+Thank you in advance for your review!
+
+Part of https://github.com/huggingface/transformers/issues/20179
+
+## Before reviewing
+- [x] Check for missing / redundant translations (번역 누락/중복 검사)
+- [x] Grammar Check (맞춤법 검사)
+- [x] Review or Add new terms to glossary (용어 확인 및 추가)
+- [x] Check Inline TOC (e.g. `[[lowercased-header]]`)
+- [x] Check live-preview for gotchas (live-preview로 정상작동 확인)
+
+## Who can review? (Initial)
+{target_language} translation reviewers
+
+## Before submitting
+- [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
+- [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
+ Pull Request section?
+- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
+ to it if that's the case.
+- [x] Did you make sure to update the documentation with your changes? Here are the
+ [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
+ [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
+- [ ] Did you write any new necessary tests?
+
+## Who can review? (Final)
+ May you please review this PR?
+Documentation maintainers
+"""
+        return title, body  # NOTE: the checklist keeps its fixed Korean glosses whatever the target language
+
+ def generate_commit_message_from_reference(
+ self, commit_messages: List[str], target_language: str, file_name: str
+ ) -> str:
+ """Return the commit message ``docs: <target_language>: <file_name>``.
+
+ ``commit_messages`` is accepted but not consulted; the message is a
+ fixed template.
+ """
+ return f"docs: {target_language}: {file_name}"
+
+ def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
+ """Return a human-readable summary of a branch's latest commit.
+
+ Looks up ``owner/repo_name`` through ``self.github_client`` and reports the
+ branch name, the first 8 characters of the commit SHA, the first line of
+ the commit message, the author name and the author date.
+
+ Any exception is caught and returned as a
+ "Failed to retrieve branch information: ..." string instead of being raised.
+ """
+ try:
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
+ branch = repo.get_branch(branch_name)
+ commit = branch.commit
+ commit_info = commit.commit
+
+ # chr(10) is "\n": only the first line of the commit message is kept,
+ # capped at 80 characters; the "..." suffix is appended unconditionally.
+ return f"""
+📋 Existing branch information:
+ - Branch name: {branch_name}
+ - Latest commit: {commit.sha[:8]}
+ - Commit message: {commit_info.message.split(chr(10))[0][:80]}...
+ - Author: {commit_info.author.name}
+ - Date: {commit_info.author.date.strftime('%Y-%m-%d %H:%M:%S')}
+ """
+ except Exception as e:
+ return f"Failed to retrieve branch information: {str(e)}"
+
+ def run_translation_pr_workflow(
+ self,
+ reference_pr_url: str,
+ target_language: str,
+ filepath: str,
+ translated_doc: str,
+ base_branch: str = "main",
+ ) -> Dict[str, Any]:
+ """Create a branch, commit the translated document and open a draft PR.
+
+ Steps, in order:
+ 1. Analyze the reference PR (``analyze_reference_pr``).
+ 2. Derive the target path by replacing "/en/" with "/<target_language>/"
+ in ``filepath`` and derive a branch name from the reference PR.
+ 3. Read the head SHA of ``base_branch`` from the upstream repository
+ (``self.base_owner/self.base_repo``) and create the branch in the fork
+ (``self.user_owner/self.user_repo``).
+ 4. Commit ``translated_doc`` to the target path on that branch.
+ 5. Generate the PR title/body and open a draft PR from the fork branch
+ to the upstream ``base_branch``.
+
+ The helper methods report their outcome as strings; this method branches
+ on their "ERROR" / "WARNING" / "SUCCESS" prefixes.
+
+ Returns:
+ A dict whose "status" is "success", "partial_success" (file committed
+ but no PR) or "error". It always carries "message" and, depending on
+ how far the workflow got, "branch", "file_path", "pr_url" and
+ "error_details". Exceptions are caught and reported as status "error".
+ """
+ try:
+ # 1. Analyze reference PR
+ print(f"🔍 Analyzing reference PR: {reference_pr_url}")
+ pr_analysis = self.analyze_reference_pr(reference_pr_url)
+
+ if "error" in pr_analysis:
+ return {"status": "error", "message": pr_analysis["error"]}
+
+ print("Reference PR analysis completed")
+
+ # 2. Generate translation file path and branch name
+ target_filepath = filepath.replace("/en/", f"/{target_language}/")
+ file_name = filepath.split("/")[-1] # Keep .md extension
+
+ print(f"🌿 Generating branch name...")
+ branch_name = self.generate_branch_name_from_reference(
+ pr_analysis["head_branch"], target_language, file_name
+ )
+
+ # 3. Get main branch SHA from upstream and create branch in fork
+ upstream_repo = self.github_client.get_repo(f"{self.base_owner}/{self.base_repo}")
+ main_branch = upstream_repo.get_branch(base_branch)
+ main_sha = main_branch.commit.sha
+
+ print(f"🌿 Creating branch: {branch_name} in fork repository")
+ branch_result = self.create_branch(self.user_owner, self.user_repo, branch_name, main_sha)
+
+ # Check branch creation result
+ if branch_result.startswith("ERROR"):
+ return {
+ "status": "error",
+ "message": f"Branch creation failed: {branch_result}\n\nTarget: {self.user_owner}/{self.user_repo}\nBranch: {branch_name}\nBase SHA: {main_sha[:8]}",
+ "branch": branch_name,
+ "error_details": branch_result,
+ }
+ elif branch_result.startswith("WARNING"):
+ print(f"⚠️ {branch_result}")
+ # Continue if branch already exists
+ elif branch_result.startswith("SUCCESS"):
+ print(f"✅ {branch_result}")
+ else:
+ print(f"⚠️ Unexpected branch creation result: {branch_result}")
+ # Continue anyway, might still work
+
+ # 4. Generate commit message and save file
+ commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
+ commit_message = self.generate_commit_message_from_reference(
+ commit_messages, target_language, file_name
+ )
+
+ print(f"📄 Saving file: {target_filepath}")
+ file_result = self.create_or_update_file(
+ self.user_owner,
+ self.user_repo,
+ target_filepath,
+ commit_message,
+ translated_doc,
+ branch_name,
+ )
+
+ # Anything other than a "SUCCESS"-prefixed result aborts before the PR step.
+ if not file_result.startswith("SUCCESS"):
+ return {
+ "status": "error",
+ "message": f"File save failed: {file_result}\n\n🎯 Target: {self.user_owner}/{self.user_repo} (expected: {target_language} fork of {self.base_owner}/{self.base_repo})\n🌿 Branch: {branch_name}\n📁 File: {target_filepath}",
+ "branch": branch_name,
+ "file_path": target_filepath,
+ "error_details": file_result,
+ }
+
+ print(f"{file_result}")
+
+ # 5. Create PR
+ pr_title, pr_body = self.generate_pr_content_from_reference(
+ pr_analysis["title"],
+ pr_analysis["body"],
+ target_language,
+ filepath,
+ target_filepath,
+ file_name,
+ )
+
+ print(f"🔄 Creating PR: {pr_title}")
+ print(f" Head: {self.user_owner}:{branch_name} → Base: {self.base_owner}:{base_branch}")
+
+ # Create PR from fork to upstream repository
+ pr_result = self.create_pull_request(
+ self.base_owner, self.base_repo, pr_title, f"{self.user_owner}:{branch_name}", base_branch, pr_body, draft=True
+ )
+
+ if pr_result.startswith("ERROR"):
+ print(f"❌ {pr_result}")
+ return {
+ "status": "partial_success",
+ "branch": branch_name,
+ "file_path": target_filepath,
+ "message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
+ "error_details": pr_result,
+ }
+ elif "successful" in pr_result and "http" in pr_result:
+ print(f"{pr_result}")
+ return {
+ "status": "success",
+ "branch": branch_name,
+ "file_path": target_filepath,
+ # The URL is taken to be whatever follows the last ": " in the result string.
+ "pr_url": pr_result.split(": ")[-1],
+ "message": "Translation document PR created successfully!",
+ }
+ else:
+ return {
+ "status": "partial_success",
+ "branch": branch_name,
+ "file_path": target_filepath,
+ "message": "File was saved but PR creation failed.",
+ }
+
+ except Exception as e:
+ # NOTE(review): ``filepath`` is a parameter, so the locals() check below
+ # always succeeds and the 'Unknown' fallback is never used.
+ return {
+ "status": "error",
+ "message": f"Workflow execution failed: {str(e)}\n\nConfig: {self.user_owner}/{self.user_repo} → {self.base_owner}/{self.base_repo}\nFile: {filepath if 'filepath' in locals() else 'Unknown'}",
+ "error_details": str(e),
+ }
+
+
+# Backward compatibility functions (maintain compatibility with existing code)
+# Each function below forwards its arguments unchanged to the same-named method
+# on one shared GitHubPRAgent, which is constructed when this module is imported.
+_agent = GitHubPRAgent()
+
+
+def get_github_client():
+ """Return the shared agent's ``github_client``."""
+ return _agent.github_client
+
+
+def create_pull_request_func(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.create_pull_request`` on the shared agent."""
+ return _agent.create_pull_request(*args, **kwargs)
+
+
+def create_branch_func(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.create_branch`` on the shared agent."""
+ return _agent.create_branch(*args, **kwargs)
+
+
+def create_or_update_file_func(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.create_or_update_file`` on the shared agent."""
+ return _agent.create_or_update_file(*args, **kwargs)
+
+
+def analyze_reference_pr_func(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.analyze_reference_pr`` on the shared agent."""
+ return _agent.analyze_reference_pr(*args, **kwargs)
+
+
+def generate_branch_name_from_reference(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.generate_branch_name_from_reference``."""
+ return _agent.generate_branch_name_from_reference(*args, **kwargs)
+
+
+def generate_pr_content_from_reference(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.generate_pr_content_from_reference``."""
+ return _agent.generate_pr_content_from_reference(*args, **kwargs)
+
+
+def generate_default_pr_content(*args, **kwargs):
+ """Forward to the agent's private ``_generate_default_pr_content``."""
+ return _agent._generate_default_pr_content(*args, **kwargs)
+
+
+def generate_commit_message_from_reference(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.generate_commit_message_from_reference``."""
+ return _agent.generate_commit_message_from_reference(*args, **kwargs)
+
+
+def get_branch_info(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.get_branch_info`` on the shared agent."""
+ return _agent.get_branch_info(*args, **kwargs)
+
+
+def run_translation_pr_agent_simple(*args, **kwargs):
+ """Forward to ``GitHubPRAgent.run_translation_pr_workflow``."""
+ return _agent.run_translation_pr_workflow(*args, **kwargs)
diff --git a/pr_generator/searcher.py b/pr_generator/searcher.py
index 7ade3a4..fb4d38d 100644
--- a/pr_generator/searcher.py
+++ b/pr_generator/searcher.py
@@ -1,238 +1,238 @@
-"""
-GitHub PR Search Agent
-An agent that finds a suitable reference PR when a reference PR URL is not provided.
-"""
-
-import os
-import re
-import logging
-from typing import List, Dict, Any, Optional
-
-# Load environment variables
-from dotenv import load_dotenv
-
-load_dotenv()
-
-# Setup logging
-logging.basicConfig(
- level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-# Langchain imports
-try:
- from langchain_anthropic import ChatAnthropic
- from langchain.tools import StructuredTool
- from langchain.agents import AgentExecutor, create_tool_calling_agent
- from langchain_core.prompts import ChatPromptTemplate
- from github import Github
-
- REQUIRED_LIBS_AVAILABLE = True
-except ImportError as e:
- print(f"Required libraries are not installed: {e}")
- REQUIRED_LIBS_AVAILABLE = False
-
-# Constants
-ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
-DEFAULT_TEMPERATURE = 0.0
-# Fallback PR URL to ensure a PR is always returned
-DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"
-
-
-class GitHubPRSearcher:
- """GitHub PR Searcher - now using a LangChain agent."""
-
- def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
- """
- Searches GitHub for pull requests matching the query and returns the top 5 results.
- The query should be a valid GitHub search query.
- """
- logger.info(f"Executing GitHub search with query: {query}")
- try:
- issues = self.github_client.search_issues(query=query)
- # Take top 5 to keep context small for the agent
- top_issues = issues.get_page(0)[:5]
-
- if not top_issues:
- return []
-
- return [
- {"title": issue.title, "url": issue.html_url, "number": issue.number}
- for issue in top_issues
- ]
- except Exception as e:
- logger.error(f"Error during GitHub search: {e}", exc_info=True)
- # Return an error message that the agent can understand
- return [{"error": f"An error occurred during search: {e}"}]
-
- def __init__(self):
- if not REQUIRED_LIBS_AVAILABLE:
- raise ImportError("Required libraries for agent could not be found.")
-
- self._github_client = None
- self.llm = ChatAnthropic(
- model=ANTHROPIC_MODEL_ID,
- temperature=DEFAULT_TEMPERATURE,
- )
-
- search_tool = StructuredTool.from_function(
- func=self._search_github_prs,
- name="search_github_prs",
- description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
- )
- tools = [search_tool]
-
- prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.
-
-You need to find a merged PR in the repository: {owner}/{repo_name}.
-The PR should be for a documentation translation into **{target_language}**.
-The context for the translation is: **{context}**.
-
-Use the tools at your disposal to search for relevant PRs.
-Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.
-
-Here is an example of a good search query you could use:
-`repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`
-
-After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""
-
- prompt = ChatPromptTemplate.from_messages(
- [
- ("system", prompt_string),
- (
- "human",
- "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
- ),
- ("placeholder", "{agent_scratchpad}"),
- ]
- )
-
- agent = create_tool_calling_agent(self.llm, tools, prompt)
- self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
-
- @property
- def github_client(self) -> Optional[Github]:
- """Lazy initialization of the GitHub API client."""
- if not REQUIRED_LIBS_AVAILABLE:
- raise ImportError("Required libraries could not be found.")
-
- if self._github_client is None:
- token = os.environ.get("GITHUB_TOKEN")
- if not token:
- print("Warning: GITHUB_TOKEN environment variable is not set.")
- self._github_client = Github() # Limited access
- else:
- self._github_client = Github(token)
- return self._github_client
-
- def find_best_reference_pr(
- self, owner: str, repo_name: str, target_language: str, context: str
- ):
- """
- Finds the best reference PR using a LangChain agent.
- Yields progress and returns the final PR URL.
- """
- message = "🤖 Agent is searching for the best reference PR..."
- logger.info(message)
- yield message
-
- try:
- agent_input = {
- "owner": owner,
- "repo_name": repo_name,
- "target_language": target_language,
- "context": context,
- }
-
- agent_output = None
- for event in self.agent_executor.stream(agent_input):
- if "actions" in event and event["actions"]:
- action = event["actions"][0]
- tool_query = action.tool_input.get("query", str(action.tool_input))
- message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
- logger.info(message)
- yield message
- elif "steps" in event and event["steps"]:
- message = "📊 Agent is analyzing the results from the tool..."
- logger.info(message)
- yield message
- elif "output" in event and event["output"]:
- agent_output = event["output"]
-
- if not agent_output:
- message = "⚠️ Agent failed to find a suitable PR. Using default PR."
- logger.warning(message)
- yield message
- return DEFAULT_FALLBACK_PR_URL
-
- # The agent's final output can be a string, a list of tool results,
- # or a list of content blocks from the LLM. We'll find the URL
- # by searching for it in the string representation of the output.
- output_text = str(agent_output)
- urls = re.findall(r"https?://github.com/[^/]+/[^/]+/pull/\d+", output_text)
-
- final_url = ""
- if urls:
- final_url = urls[-1] # Take the last URL found
-
- if not final_url:
- message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
- logger.warning(message)
- yield message
- return DEFAULT_FALLBACK_PR_URL
-
- message = f"✅ Selected the best PR:\n`{final_url}`"
- logger.info(f"Selected the best PR: {final_url}")
- yield message
- return final_url
-
- except Exception as e:
- message = f"❌ Error during agent execution: {e}\nUsing default PR."
- logger.error(message, exc_info=True)
- yield message
- return DEFAULT_FALLBACK_PR_URL
-
-
-def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
- """
- A simple function to find a reference PR, streaming progress.
- This function always searches in the 'huggingface/transformers' repository.
- """
- searcher = GitHubPRSearcher()
- stream_generator = searcher.find_best_reference_pr(
- "huggingface", "transformers", target_language, context
- )
- # The handler will receive the final URL from the generator's return statement
- final_url = yield from stream_generator
-
- # Format the final result as expected by the handler
- return {
- "status": "success",
- "result": f"Recommended PR URL: {final_url}",
- "repository": "huggingface/transformers",
- "target_language": target_language,
- }
-
-
-# Example usage
-if __name__ == "__main__":
- # Example execution for streaming
- # In a real application, a generator consumer (like the one in handler.py)
- # would process the yielded values. This script simulates that.
- print("--- Running Streaming Search Simulation ---")
-
- def run_simulation():
- """Simulates the consumption of the streaming generator."""
- test_gen = find_reference_pr_simple_stream(
- target_language="korean", context="docs"
- )
- try:
- while True:
- # This will print progress messages
- print(next(test_gen))
- except StopIteration as e:
- # When the generator is exhausted, the final result is in e.value
- print("\n--- FINAL RESULT ---")
- print(e.value)
-
- run_simulation()
+"""
+GitHub PR Search Agent
+An agent that finds a suitable reference PR when a reference PR URL is not provided.
+"""
+
+import os
+import re
+import logging
+from typing import List, Dict, Any, Optional
+
+# Load environment variables
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Setup logging
+logging.basicConfig(
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Langchain imports
+try:
+ from langchain_anthropic import ChatAnthropic
+ from langchain.tools import StructuredTool
+ from langchain.agents import AgentExecutor, create_tool_calling_agent
+ from langchain_core.prompts import ChatPromptTemplate
+ from github import Github
+
+ REQUIRED_LIBS_AVAILABLE = True
+except ImportError as e:
+ print(f"Required libraries are not installed: {e}")
+ REQUIRED_LIBS_AVAILABLE = False
+
+# Constants
+ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
+DEFAULT_TEMPERATURE = 0.0
+# Fallback PR URL to ensure a PR is always returned
+DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"
+
+
+class GitHubPRSearcher:
+    """Find a reference translation PR with a LangChain tool-calling agent.
+
+    The agent is given a single tool, ``search_github_prs``, and is prompted
+    to answer with the URL of the best merged translation PR.
+    """
+
+    def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Searches GitHub for pull requests matching the query and returns the top 5 results.
+        The query should be a valid GitHub search query.
+        """
+        logger.info(f"Executing GitHub search with query: {query}")
+        try:
+            issues = self.github_client.search_issues(query=query)
+            # Take top 5 to keep context small for the agent
+            top_issues = issues.get_page(0)[:5]
+
+            if not top_issues:
+                return []
+
+            return [
+                {"title": issue.title, "url": issue.html_url, "number": issue.number}
+                for issue in top_issues
+            ]
+        except Exception as e:
+            logger.error(f"Error during GitHub search: {e}", exc_info=True)
+            # Return an error message that the agent can understand
+            return [{"error": f"An error occurred during search: {e}"}]
+
+    def __init__(self):
+        """Build the LLM, the search tool, the prompt and the agent executor.
+
+        Raises:
+            ImportError: If the guarded optional imports at module top failed.
+        """
+        if not REQUIRED_LIBS_AVAILABLE:
+            raise ImportError("Required libraries for agent could not be found.")
+
+        self._github_client = None
+        self.llm = ChatAnthropic(
+            model=ANTHROPIC_MODEL_ID,
+            temperature=DEFAULT_TEMPERATURE,
+        )
+
+        search_tool = StructuredTool.from_function(
+            func=self._search_github_prs,
+            name="search_github_prs",
+            description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
+        )
+        tools = [search_tool]
+
+        prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.
+
+You need to find a merged PR in the repository: {owner}/{repo_name}.
+The PR should be for a documentation translation into **{target_language}**.
+The context for the translation is: **{context}**.
+
+Use the tools at your disposal to search for relevant PRs.
+Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.
+
+Here is an example of a good search query you could use:
+`repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`
+
+After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""
+
+        prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", prompt_string),
+                (
+                    "human",
+                    "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
+                ),
+                ("placeholder", "{agent_scratchpad}"),
+            ]
+        )
+
+        agent = create_tool_calling_agent(self.llm, tools, prompt)
+        self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
+
+    # The return annotation is a string on purpose: ``Github`` is imported inside
+    # a try/except at module top, and an unquoted annotation is evaluated when the
+    # class body runs, which would raise NameError if that import had failed.
+    @property
+    def github_client(self) -> Optional["Github"]:
+        """Lazy initialization of the GitHub API client.
+
+        Uses ``GITHUB_TOKEN`` from the environment when set; otherwise prints a
+        warning and creates an unauthenticated client. The client is cached on
+        the instance after the first access.
+
+        Raises:
+            ImportError: If the guarded optional imports at module top failed.
+        """
+        if not REQUIRED_LIBS_AVAILABLE:
+            raise ImportError("Required libraries could not be found.")
+
+        if self._github_client is None:
+            token = os.environ.get("GITHUB_TOKEN")
+            if not token:
+                print("Warning: GITHUB_TOKEN environment variable is not set.")
+                self._github_client = Github()  # Limited access
+            else:
+                self._github_client = Github(token)
+        return self._github_client
+
+    def find_best_reference_pr(
+        self, owner: str, repo_name: str, target_language: str, context: str
+    ):
+        """
+        Finds the best reference PR using a LangChain agent.
+        Yields progress and returns the final PR URL.
+
+        This is a generator: progress messages are yielded as strings and the
+        chosen URL is the generator's return value. Whenever the agent yields
+        no output, no pull-request URL can be parsed from it, or an exception
+        is raised, ``DEFAULT_FALLBACK_PR_URL`` is returned instead.
+        """
+        message = "🤖 Agent is searching for the best reference PR..."
+        logger.info(message)
+        yield message
+
+        try:
+            agent_input = {
+                "owner": owner,
+                "repo_name": repo_name,
+                "target_language": target_language,
+                "context": context,
+            }
+
+            agent_output = None
+            for event in self.agent_executor.stream(agent_input):
+                if "actions" in event and event["actions"]:
+                    action = event["actions"][0]
+                    # tool_input may be a plain string instead of a dict; calling
+                    # .get() on it would raise and abandon the whole search.
+                    raw_input = action.tool_input
+                    if isinstance(raw_input, dict):
+                        tool_query = raw_input.get("query", str(raw_input))
+                    else:
+                        tool_query = str(raw_input)
+                    message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
+                    logger.info(message)
+                    yield message
+                elif "steps" in event and event["steps"]:
+                    message = "📊 Agent is analyzing the results from the tool..."
+                    logger.info(message)
+                    yield message
+                elif "output" in event and event["output"]:
+                    agent_output = event["output"]
+
+            if not agent_output:
+                message = "⚠️ Agent failed to find a suitable PR. Using default PR."
+                logger.warning(message)
+                yield message
+                return DEFAULT_FALLBACK_PR_URL
+
+            # The agent's final output can be a string, a list of tool results,
+            # or a list of content blocks from the LLM. We'll find the URL
+            # by searching for it in the string representation of the output.
+            output_text = str(agent_output)
+            # The dot is escaped so only the literal host "github.com" matches.
+            urls = re.findall(r"https?://github\.com/[^/]+/[^/]+/pull/\d+", output_text)
+
+            final_url = ""
+            if urls:
+                final_url = urls[-1]  # Take the last URL found
+
+            if not final_url:
+                message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
+                logger.warning(message)
+                yield message
+                return DEFAULT_FALLBACK_PR_URL
+
+            message = f"✅ Selected the best PR:\n`{final_url}`"
+            logger.info(f"Selected the best PR: {final_url}")
+            yield message
+            return final_url
+
+        except Exception as e:
+            message = f"❌ Error during agent execution: {e}\nUsing default PR."
+            logger.error(message, exc_info=True)
+            yield message
+            return DEFAULT_FALLBACK_PR_URL
+
+
+def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
+ """
+ A simple function to find a reference PR, streaming progress.
+ This function always searches in the 'huggingface/transformers' repository.
+
+ This is a generator. It re-yields the progress messages produced by
+ GitHubPRSearcher.find_best_reference_pr, and its return value (delivered
+ through ``yield from`` or ``StopIteration.value``) is a result dict with the
+ keys "status", "result", "repository" and "target_language".
+
+ "status" is always "success", including when the searcher fell back to
+ DEFAULT_FALLBACK_PR_URL.
+ """
+ searcher = GitHubPRSearcher()
+ stream_generator = searcher.find_best_reference_pr(
+ "huggingface", "transformers", target_language, context
+ )
+ # The handler will receive the final URL from the generator's return statement
+ final_url = yield from stream_generator
+
+ # Format the final result as expected by the handler
+ return {
+ "status": "success",
+ "result": f"Recommended PR URL: {final_url}",
+ "repository": "huggingface/transformers",
+ "target_language": target_language,
+ }
+
+
+# Example usage
+# Running this module directly constructs a GitHubPRSearcher, so it raises
+# ImportError when the optional agent libraries could not be imported.
+if __name__ == "__main__":
+ # Example execution for streaming
+ # In a real application, a generator consumer (like the one in handler.py)
+ # would process the yielded values. This script simulates that.
+ print("--- Running Streaming Search Simulation ---")
+
+ def run_simulation():
+ """Drive the streaming generator by hand and print everything it produces."""
+ test_gen = find_reference_pr_simple_stream(
+ target_language="korean", context="docs"
+ )
+ try:
+ while True:
+ # This will print progress messages
+ print(next(test_gen))
+ except StopIteration as e:
+ # When the generator is exhausted, the final result is in e.value
+ print("\n--- FINAL RESULT ---")
+ print(e.value)
+
+ run_simulation()
diff --git a/requirements.txt b/requirements.txt
index 46938ad..fac62e9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,11 +1,11 @@
-gradio==5.33.0
-requests
-pydantic
-langchain-anthropic
-python-dotenv
-langchain
-PyGithub
-langchain-core
-langchain-community
-boto3
+gradio==5.33.0
+requests
+pydantic
+langchain-anthropic
+python-dotenv
+langchain
+PyGithub
+langchain-core
+langchain-community
+boto3
PyYAML
\ No newline at end of file
diff --git a/test/test_final_translate.md b/test/test_final_translate.md
index e18ce36..132e501 100644
--- a/test/test_final_translate.md
+++ b/test/test_final_translate.md
@@ -1,127 +1,127 @@
-
-
-# 가속기 선택 [[accelerator-selection]]
-
-분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
-
-이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
-
-## 가속기 개수 [[number-of-accelerators]]
-
-예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
-
-
-
-
-사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
-
-```bash
-torchrun --nproc_per_node=2 trainer-program.py ...
-```
-
-
-
-
-사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
-
-```bash
-accelerate launch --num_processes 2 trainer-program.py ...
-```
-
-
-
-
-사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
-
-```bash
-deepspeed --num_gpus 2 trainer-program.py ...
-```
-
-
-
-
-## 가속기 순서 [[order-of-accelerators]]
-사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
-
-예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
-
-
-
-
-```bash
-CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
-```
-
-GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
-순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
-
-
-```bash
-CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
-```
-
-GPU 없이 실행하려면:
-
-```bash
-CUDA_VISIBLE_DEVICES= python trainer-program.py ...
-```
-
-`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
-
-- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
-
- ```bash
-$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
- ```
-
-- 컴퓨팅 성능 순서(가장 빠른 것부터):
-
- ```bash
- export CUDA_DEVICE_ORDER=FASTEST_FIRST
- ```
-
-
-
-
-```bash
-ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
-```
-
-XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
-순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
-
-```bash
-ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
-```
-
-
-다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
-
-```bash
-export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-```
-
-Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
-
-
-
-
-
-
-> [!WARNING]
-> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
-
+
+
+# 가속기 선택 [[accelerator-selection]]
+
+분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
+
+이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
+
+## 가속기 개수 [[number-of-accelerators]]
+
+예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
+
+
+
+
+사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
+
+```bash
+torchrun --nproc_per_node=2 trainer-program.py ...
+```
+
+
+
+
+사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
+
+```bash
+accelerate launch --num_processes 2 trainer-program.py ...
+```
+
+
+
+
+사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
+
+```bash
+deepspeed --num_gpus 2 trainer-program.py ...
+```
+
+
+
+
+## 가속기 순서 [[order-of-accelerators]]
+사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
+
+예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
+
+
+
+
+```bash
+CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
+```
+
+GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
+순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
+
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
+```
+
+GPU 없이 실행하려면:
+
+```bash
+CUDA_VISIBLE_DEVICES= python trainer-program.py ...
+```
+
+`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
+
+- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
+
+ ```bash
+ export CUDA_DEVICE_ORDER=PCI_BUS_ID
+ ```
+
+- 컴퓨팅 성능 순서(가장 빠른 것부터):
+
+ ```bash
+ export CUDA_DEVICE_ORDER=FASTEST_FIRST
+ ```
+
+
+
+
+```bash
+ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
+```
+
+XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
+순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
+
+```bash
+ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
+```
+
+
+다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
+
+```bash
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+```
+
+Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
+
+
+
+
+
+
+> [!WARNING]
+> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
+
diff --git a/test/test_prompt.py b/test/test_prompt.py
index 57a3071..f84edf6 100644
--- a/test/test_prompt.py
+++ b/test/test_prompt.py
@@ -1,71 +1,71 @@
-output = """
-What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
-```md
-# Accelerator selection
-
-During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
-
-This guide will show you how to select the number of accelerators to use and the order to use them in.
-
-## Number of accelerators
-
-For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
-
-
-
-
-Use the `--nproc_per_node` to select how many accelerators to use.
-
-
-
-
-Use `--num_processes` to select how many accelerators to use.
-
-
-
-
-Use `--num_gpus` to select how many GPUs to use.
-
-
-
-
-## Order of accelerators
-To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
-
-For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
-
-
-
-
-Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
-To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
-
-To run without any GPUs:
-
-You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
-
-- Order by PCIe bus ID (matches `nvidia-smi`):
-
-
-
-- Order by compute capability (fastest first):
-
-
-
-
-
-
-Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
-To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
-
-You can also control the order of Intel XPUs with:
-
-For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
-
-
-
-
-> [!WARNING]
-> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
-```
-"""
+output = """
+What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
+```md
+# Accelerator selection
+
+During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
+
+This guide will show you how to select the number of accelerators to use and the order to use them in.
+
+## Number of accelerators
+
+For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
+
+
+
+
+Use the `--nproc_per_node` to select how many accelerators to use.
+
+
+
+
+Use `--num_processes` to select how many accelerators to use.
+
+
+
+
+Use `--num_gpus` to select how many GPUs to use.
+
+
+
+
+## Order of accelerators
+To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
+
+For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
+
+
+
+
+Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
+To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
+
+To run without any GPUs:
+
+You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
+
+- Order by PCIe bus ID (matches `nvidia-smi`):
+
+
+
+- Order by compute capability (fastest first):
+
+
+
+
+
+
+Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
+To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
+
+You can also control the order of Intel XPUs with:
+
+For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
+
+
+
+
+> [!WARNING]
+> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
+```
+"""
diff --git a/test/test_translate.py b/test/test_translate.py
index 388a9f2..7cb4ce2 100644
--- a/test/test_translate.py
+++ b/test/test_translate.py
@@ -1,68 +1,68 @@
-translated_content = """
-# 가속기 선택
-
-분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
-
-이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
-
-## 가속기 개수
-
-예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
-
-
-
-
-사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
-
-
-
-
-사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
-
-
-
-
-사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
-
-
-
-
-## 가속기 순서
-사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
-
-예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
-
-
-
-
-GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
-순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
-
-GPU 없이 실행하려면:
-
-`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
-
-- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
-
-
-
-- 컴퓨팅 성능 순서(가장 빠른 것부터):
-
-
-
-
-
-
-XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
-순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
-
-다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
-
-Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
-
-
-
-
-> [!WARNING]
-> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
-"""
+translated_content = """
+# 가속기 선택
+
+분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
+
+이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
+
+## 가속기 개수
+
+예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
+
+
+
+
+사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
+
+
+
+
+사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
+
+
+
+
+사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
+
+
+
+
+## 가속기 순서
+사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
+
+예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
+
+
+
+
+GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
+순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
+
+GPU 없이 실행하려면:
+
+`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
+
+- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
+
+
+
+- 컴퓨팅 성능 순서(가장 빠른 것부터):
+
+
+
+
+
+
+XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
+순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
+
+다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
+
+Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
+
+
+
+
+> [!WARNING]
+> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
+"""
diff --git a/translation_result/docs/source/en/accelerator_selection.md b/translation_result/docs/source/en/accelerator_selection.md
index 58cbfa1..f7e7c46 100644
--- a/translation_result/docs/source/en/accelerator_selection.md
+++ b/translation_result/docs/source/en/accelerator_selection.md
@@ -1,127 +1,127 @@
-
-
-# 가속기 선택 [[accelerator-selection]]
-
-분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)는 필요하지 않습니다.
-
-이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
-
-## 가속기 수 [[number-of-accelerators]]
-
-예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.
-
-
-
-
-`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
-
-```bash
-torchrun --nproc_per_node=2 trainer-program.py ...
-```
-
-
-
-
-`--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
-
-```bash
-accelerate launch --num_processes 2 trainer-program.py ...
-```
-
-
-
-
-`--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
-
-```bash
-deepspeed --num_gpus 2 trainer-program.py ...
-```
-
-
-
-
-## 가속기 순서 [[order-of-accelerators]]
-사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
-
-예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
-
-
-
-
-```bash
-CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
-```
-
-GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
-순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
-
-
-```bash
-CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
-```
-
-GPU 없이 실행하려면:
-
-```bash
-CUDA_VISIBLE_DEVICES= python trainer-program.py ...
-```
-
-`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
-
-- PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
-
- ```bash
-$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
- ```
-
-- 컴퓨팅 성능 순서 (가장 빠른 것부터):
-
- ```bash
- export CUDA_DEVICE_ORDER=FASTEST_FIRST
- ```
-
-
-
-
-```bash
-ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
-```
-
-XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
-순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
-
-```bash
-ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
-```
-
-
-다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
-
-```bash
-export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
-```
-
-Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
-
-
-
-
-
-
-> [!WARNING]
-> 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
+
+
+# 가속기 선택 [[accelerator-selection]]
+
+분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)는 필요하지 않습니다.
+
+이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
+
+## 가속기 수 [[number-of-accelerators]]
+
+예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.
+
+
+
+
+`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
+
+```bash
+torchrun --nproc_per_node=2 trainer-program.py ...
+```
+
+
+
+
+`--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
+
+```bash
+accelerate launch --num_processes 2 trainer-program.py ...
+```
+
+
+
+
+`--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
+
+```bash
+deepspeed --num_gpus 2 trainer-program.py ...
+```
+
+
+
+
+## 가속기 순서 [[order-of-accelerators]]
+사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
+
+예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
+
+
+
+
+```bash
+CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
+```
+
+GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
+순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
+
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
+```
+
+GPU 없이 실행하려면:
+
+```bash
+CUDA_VISIBLE_DEVICES= python trainer-program.py ...
+```
+
+`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
+
+- PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
+
+ ```bash
+$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
+ ```
+
+- 컴퓨팅 성능 순서 (가장 빠른 것부터):
+
+ ```bash
+ export CUDA_DEVICE_ORDER=FASTEST_FIRST
+ ```
+
+
+
+
+```bash
+ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
+```
+
+XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
+순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
+
+```bash
+ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
+```
+
+
+다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
+
+```bash
+export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
+```
+
+Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
+
+
+
+
+
+
+> [!WARNING]
+> 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
```
\ No newline at end of file
diff --git a/translator/content.py b/translator/content.py
index cb7e2a3..00b80ea 100644
--- a/translator/content.py
+++ b/translator/content.py
@@ -1,214 +1,214 @@
-import os
-import re
-import string
-
-import requests
-from langchain.callbacks import get_openai_callback
-from langchain_anthropic import ChatAnthropic
-import boto3
-import json
-
-from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
-from translator.project_config import get_project_config
-
-
-def get_content(filepath: str, project: str = "transformers") -> str:
- if filepath == "":
- raise ValueError("No files selected for translation.")
-
- config = get_project_config(project)
- # Extract repo path from repo_url (e.g., "huggingface/transformers")
- repo_path = config.repo_url.replace("https://github.com/", "")
-
- url = f"https://raw.githubusercontent.com/{repo_path}/main/{filepath}"
- response = requests.get(url)
- if response.status_code == 200:
- content = response.text
- return content
- else:
- raise ValueError("Failed to retrieve content from the URL.", url)
-
-
-def preprocess_content(content: str) -> str:
- # Extract text to translate from document
-
- ## ignore top license comment
- to_translate = content[content.find("#") :]
- ## remove code blocks from text
- # to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
- ## remove markdown tables from text
- # to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
- ## remove empty lines from text
- to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
- return to_translate
-
-
-def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
- base_prompt = string.Template(
- "What do these sentences about Hugging Face Transformers "
- "(a machine learning library) mean in $language? "
- "Please do not translate the word after a 🤗 emoji "
- "as it is a product name. Output the complete markdown file**, with prose translated and all other content intact"
- "No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
- ).safe_substitute(language=language)
-
- base_prompt += "\n\n```md"
-
- full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
-
- if additional_instruction.strip():
- full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"
-
- return full_prompt
-
-
-def split_markdown_sections(markdown: str) -> list:
- # Find all titles using regular expressions
- return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
- # format is like [level, title, content, level, title, content, ...]
-
-
-def get_anchors(divided: list) -> list:
- anchors = []
- # from https://github.com/huggingface/doc-builder/blob/01b262bae90d66e1150cdbf58c83c02733ed4366/src/doc_builder/build_doc.py#L300-L302
- for title in divided[1::3]:
- anchor = re.sub(r"[^a-z0-9\s]+", "", title.lower())
- anchor = re.sub(r"\s{2,}", " ", anchor.strip()).replace(" ", "-")
- anchors.append(f"[[{anchor}]]")
- return anchors
-
-
-def make_scaffold(content: str, to_translate: str) -> string.Template:
- scaffold = content
- for i, text in enumerate(to_translate.split("\n\n")):
- scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
- print("inner scaffold:")
- print(scaffold)
- return string.Template(scaffold)
-
-
-def is_in_code_block(text: str, position: int) -> bool:
- """Check if a position in text is inside a code block"""
- text_before = text[:position]
- code_block_starts = text_before.count("```")
- return code_block_starts % 2 == 1
-
-
-def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
- scaffold = make_scaffold(content, to_translate)
- print("scaffold:")
- print(scaffold.template)
-
- # Get original text sections to maintain structure
- original_sections = to_translate.split("\n\n")
-
- # Split markdown sections to get headers and anchors
- divided = split_markdown_sections(to_translate)
- print("divided:")
- print(divided)
- anchors = get_anchors(divided)
-
- # Split translated content by markdown sections
- translated_divided = split_markdown_sections(translated)
- print("translated divided:")
- print(translated_divided)
-
- # Ensure we have the same number of headers as the original
- if len(translated_divided[1::3]) != len(anchors):
- print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
- # Adjust anchors list to match translated headers
- if len(translated_divided[1::3]) < len(anchors):
- anchors = anchors[:len(translated_divided[1::3])]
- else:
- # Add empty anchors for extra headers
- anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))
-
- # Add anchors to translated headers only if they're not in code blocks
- for i, korean_title in enumerate(translated_divided[1::3]):
- if i < len(anchors):
- # Find the position of this header in the original translated text
- header_pos = translated.find(korean_title.strip())
- if header_pos != -1 and not is_in_code_block(translated, header_pos):
- translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
- else:
- translated_divided[1 + i * 3] = korean_title
-
- # Reconstruct translated content with proper structure
- reconstructed_translated = "".join([
- "".join(translated_divided[i * 3 : i * 3 + 3])
- for i in range(len(translated_divided) // 3)
- ])
-
- # Split by double newlines to match original structure
- translated_sections = reconstructed_translated.split("\n\n")
-
- print("scaffold template count:")
- print(scaffold.template.count("$hf_i18n_placeholder"))
- print("original sections length:")
- print(len(original_sections))
- print("translated sections length:")
- print(len(translated_sections))
-
- # Ensure section counts match
- placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
-
- if len(translated_sections) < placeholder_count:
- # Add empty sections if translated has fewer sections
- translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
- elif len(translated_sections) > placeholder_count:
- # Truncate if translated has more sections
- translated_sections = translated_sections[:placeholder_count]
-
- # Final check
- if len(translated_sections) != placeholder_count:
- return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"
-
- translated_doc = scaffold.safe_substitute(
- {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
- )
- return translated_doc
-
-
-def llm_translate(to_translate: str) -> tuple[str, str]:
- anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
- aws_bearer_token_bedrock = os.environ.get("AWS_BEARER_TOKEN_BEDROCK")
-
- if anthropic_api_key:
- # Use Anthropic API Key
- model = ChatAnthropic(
- model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
- )
- ai_message = model.invoke(to_translate)
- cb = "Anthropic API Key used"
- return str(cb), ai_message.content
-
- elif aws_bearer_token_bedrock:
- # Use AWS Bedrock with bearer token (assuming standard AWS credential chain is configured)
- # Note: boto3 does not directly use a 'bearer_token' named environment variable for SigV4 authentication.
- # It relies on AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, or IAM roles.
- # If AWS_BEARER_TOKEN_BEDROCK is meant to be one of these, it should be renamed accordingly.
- # For now, we proceed assuming standard AWS credential chain is configured to pick up credentials.
- client = boto3.client("bedrock-runtime", region_name="eu-north-1")
-
- body = {
- "messages": [
- {"role": "user", "content": to_translate}
- ],
- "max_tokens": 128000,
- "anthropic_version": "bedrock-2023-05-31"
- }
-
- response = client.invoke_model(
- modelId="arn:aws:bedrock:eu-north-1:235729104418:inference-profile/eu.anthropic.claude-3-7-sonnet-20250219-v1:0",
- contentType="application/json",
- accept="application/json",
- body=json.dumps(body),
- )
- result = json.loads(response["body"].read())
- cb = result["usage"]
- ai_message = result["content"][0]["text"]
-
- return str(cb), ai_message
-
- else:
- raise ValueError("No API key found for translation. Please set ANTHROPIC_API_KEY or AWS_BEARER_TOKEN_BEDROCK environment variable.")
+import os
+import re
+import string
+
+import requests
+from langchain.callbacks import get_openai_callback
+from langchain_anthropic import ChatAnthropic
+import boto3
+import json
+
+from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
+from translator.project_config import get_project_config
+
+
+def get_content(filepath: str, project: str = "transformers") -> str:
+ if filepath == "":
+ raise ValueError("No files selected for translation.")
+
+ config = get_project_config(project)
+ # Extract repo path from repo_url (e.g., "huggingface/transformers")
+ repo_path = config.repo_url.replace("https://github.com/", "")
+
+ url = f"https://raw.githubusercontent.com/{repo_path}/main/{filepath}"
+ response = requests.get(url)
+ if response.status_code == 200:
+ content = response.text
+ return content
+ else:
+ raise ValueError("Failed to retrieve content from the URL.", url)
+
+
+def preprocess_content(content: str) -> str:
+ # Extract text to translate from document
+
+ ## ignore top license comment
+ to_translate = content[content.find("#") :]
+ ## remove code blocks from text
+ # to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
+ ## remove markdown tables from text
+ # to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
+ ## remove empty lines from text
+ to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
+ return to_translate
+
+
+def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
+ base_prompt = string.Template(
+ "What do these sentences about Hugging Face Transformers "
+ "(a machine learning library) mean in $language? "
+ "Please do not translate the word after a 🤗 emoji "
+ "as it is a product name. Output the complete markdown file**, with prose translated and all other content intact"
+ "No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
+ ).safe_substitute(language=language)
+
+ base_prompt += "\n\n```md"
+
+ full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
+
+ if additional_instruction.strip():
+ full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"
+
+ return full_prompt
+
+
+def split_markdown_sections(markdown: str) -> list:
+ # Find all titles using regular expressions
+ return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
+ # format is like [level, title, content, level, title, content, ...]
+
+
+def get_anchors(divided: list) -> list:
+ anchors = []
+ # from https://github.com/huggingface/doc-builder/blob/01b262bae90d66e1150cdbf58c83c02733ed4366/src/doc_builder/build_doc.py#L300-L302
+ for title in divided[1::3]:
+ anchor = re.sub(r"[^a-z0-9\s]+", "", title.lower())
+ anchor = re.sub(r"\s{2,}", " ", anchor.strip()).replace(" ", "-")
+ anchors.append(f"[[{anchor}]]")
+ return anchors
+
+
+def make_scaffold(content: str, to_translate: str) -> string.Template:
+ scaffold = content
+ for i, text in enumerate(to_translate.split("\n\n")):
+ scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
+ print("inner scaffold:")
+ print(scaffold)
+ return string.Template(scaffold)
+
+
+def is_in_code_block(text: str, position: int) -> bool:
+ """Check if a position in text is inside a code block"""
+ text_before = text[:position]
+ code_block_starts = text_before.count("```")
+ return code_block_starts % 2 == 1
+
+
+def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
+ scaffold = make_scaffold(content, to_translate)
+ print("scaffold:")
+ print(scaffold.template)
+
+ # Get original text sections to maintain structure
+ original_sections = to_translate.split("\n\n")
+
+ # Split markdown sections to get headers and anchors
+ divided = split_markdown_sections(to_translate)
+ print("divided:")
+ print(divided)
+ anchors = get_anchors(divided)
+
+ # Split translated content by markdown sections
+ translated_divided = split_markdown_sections(translated)
+ print("translated divided:")
+ print(translated_divided)
+
+ # Ensure we have the same number of headers as the original
+ if len(translated_divided[1::3]) != len(anchors):
+ print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
+ # Adjust anchors list to match translated headers
+ if len(translated_divided[1::3]) < len(anchors):
+ anchors = anchors[:len(translated_divided[1::3])]
+ else:
+ # Add empty anchors for extra headers
+ anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))
+
+ # Add anchors to translated headers only if they're not in code blocks
+ for i, korean_title in enumerate(translated_divided[1::3]):
+ if i < len(anchors):
+ # Find the position of this header in the original translated text
+ header_pos = translated.find(korean_title.strip())
+ if header_pos != -1 and not is_in_code_block(translated, header_pos):
+ translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
+ else:
+ translated_divided[1 + i * 3] = korean_title
+
+ # Reconstruct translated content with proper structure
+ reconstructed_translated = "".join([
+ "".join(translated_divided[i * 3 : i * 3 + 3])
+ for i in range(len(translated_divided) // 3)
+ ])
+
+ # Split by double newlines to match original structure
+ translated_sections = reconstructed_translated.split("\n\n")
+
+ print("scaffold template count:")
+ print(scaffold.template.count("$hf_i18n_placeholder"))
+ print("original sections length:")
+ print(len(original_sections))
+ print("translated sections length:")
+ print(len(translated_sections))
+
+ # Ensure section counts match
+ placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
+
+ if len(translated_sections) < placeholder_count:
+ # Add empty sections if translated has fewer sections
+ translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
+ elif len(translated_sections) > placeholder_count:
+ # Truncate if translated has more sections
+ translated_sections = translated_sections[:placeholder_count]
+
+ # Final check
+ if len(translated_sections) != placeholder_count:
+ return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"
+
+ translated_doc = scaffold.safe_substitute(
+ {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
+ )
+ return translated_doc
+
+
+def llm_translate(to_translate: str) -> tuple[str, str]:
+ anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
+ aws_bearer_token_bedrock = os.environ.get("AWS_BEARER_TOKEN_BEDROCK")
+
+ if anthropic_api_key:
+ # Use Anthropic API Key
+ model = ChatAnthropic(
+ model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
+ )
+ ai_message = model.invoke(to_translate)
+ cb = "Anthropic API Key used"
+ return str(cb), ai_message.content
+
+ elif aws_bearer_token_bedrock:
+ # Use AWS Bedrock with bearer token (assuming standard AWS credential chain is configured)
+ # Note: boto3 does not directly use a 'bearer_token' named environment variable for SigV4 authentication.
+ # It relies on AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, or IAM roles.
+ # If AWS_BEARER_TOKEN_BEDROCK is meant to be one of these, it should be renamed accordingly.
+ # For now, we proceed assuming standard AWS credential chain is configured to pick up credentials.
+ client = boto3.client("bedrock-runtime", region_name="eu-north-1")
+
+ body = {
+ "messages": [
+ {"role": "user", "content": to_translate}
+ ],
+ "max_tokens": 128000,
+ "anthropic_version": "bedrock-2023-05-31"
+ }
+
+ response = client.invoke_model(
+ modelId="arn:aws:bedrock:eu-north-1:235729104418:inference-profile/eu.anthropic.claude-3-7-sonnet-20250219-v1:0",
+ contentType="application/json",
+ accept="application/json",
+ body=json.dumps(body),
+ )
+ result = json.loads(response["body"].read())
+ cb = result["usage"]
+ ai_message = result["content"][0]["text"]
+
+ return str(cb), ai_message
+
+ else:
+ raise ValueError("No API key found for translation. Please set ANTHROPIC_API_KEY or AWS_BEARER_TOKEN_BEDROCK environment variable.")
diff --git a/translator/model.py b/translator/model.py
index ce268f3..f428211 100644
--- a/translator/model.py
+++ b/translator/model.py
@@ -1,70 +1,70 @@
-from enum import Enum, unique
-
-from pydantic import BaseModel, computed_field
-
-
-@unique
-class Languages(Enum):
- az = "az"
- bn = "bn"
- de = "de"
- em = "em"
- es = "es"
- fa = "fa"
- fr = "fr"
- he = "he"
- hu = "hu"
- id = "id"
- it = "it"
- ja = "ja"
- ko = "ko"
- pl = "pl"
- pt = "pt"
- ru = "ru"
- tr = "tr"
- uk = "uk"
- ur = "ur"
- vi = "vi"
- yo = "yo"
- zh = "zh"
- zh_hant = "zh-hant"
-
-
-class TranslationDoc(BaseModel):
- official_lang: str = "en"
- translation_lang: str
- original_file: str
- translation_file: str | None = None
- translation_exists: bool
-
-
-class Summary(BaseModel):
- lang: str
- files_analyzed: int = 0
- files_translated: int = 0
- files_outdated: int = 0
- files_missing_translation: int = 0
- files: list[TranslationDoc] = []
-
- @computed_field # type: ignore
- @property
- def percentage_missing_translation(self) -> float:
- try:
- return (
- 100 * float(self.files_missing_translation) / float(self.files_analyzed)
- )
- except Exception:
- return 0.0
-
- def append_file(self, doc: TranslationDoc) -> None:
- self.files.append(doc)
- self.files_analyzed += 1
-
- if doc.translation_exists:
- self.files_translated += 1
-
- if not doc.translation_exists:
- self.files_missing_translation += 1
-
- def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
- return list(filter(lambda d: not d.translation_exists, self.files))[:length]
+from enum import Enum, unique
+
+from pydantic import BaseModel, computed_field
+
+
+@unique
+class Languages(Enum):
+ az = "az"
+ bn = "bn"
+ de = "de"
+ em = "em"
+ es = "es"
+ fa = "fa"
+ fr = "fr"
+ he = "he"
+ hu = "hu"
+ id = "id"
+ it = "it"
+ ja = "ja"
+ ko = "ko"
+ pl = "pl"
+ pt = "pt"
+ ru = "ru"
+ tr = "tr"
+ uk = "uk"
+ ur = "ur"
+ vi = "vi"
+ yo = "yo"
+ zh = "zh"
+ zh_hant = "zh-hant"
+
+
+class TranslationDoc(BaseModel):
+ official_lang: str = "en"
+ translation_lang: str
+ original_file: str
+ translation_file: str | None = None
+ translation_exists: bool
+
+
+class Summary(BaseModel):
+ lang: str
+ files_analyzed: int = 0
+ files_translated: int = 0
+ files_outdated: int = 0
+ files_missing_translation: int = 0
+ files: list[TranslationDoc] = []
+
+ @computed_field # type: ignore
+ @property
+ def percentage_missing_translation(self) -> float:
+ try:
+ return (
+ 100 * float(self.files_missing_translation) / float(self.files_analyzed)
+ )
+ except Exception:
+ return 0.0
+
+ def append_file(self, doc: TranslationDoc) -> None:
+ self.files.append(doc)
+ self.files_analyzed += 1
+
+ if doc.translation_exists:
+ self.files_translated += 1
+
+ if not doc.translation_exists:
+ self.files_missing_translation += 1
+
+ def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
+ return list(filter(lambda d: not d.translation_exists, self.files))[:length]
diff --git a/translator/project_config.py b/translator/project_config.py
index 17a6594..9fab291 100644
--- a/translator/project_config.py
+++ b/translator/project_config.py
@@ -1,48 +1,48 @@
-"""Project configuration for different HuggingFace repositories."""
-
-from dataclasses import dataclass
-from typing import Dict
-
-
-@dataclass
-class ProjectConfig:
- """Configuration for a specific HuggingFace project."""
- name: str
- repo_url: str
- api_url: str
- docs_path: str
- github_issues: Dict[str, str] # language -> issue_id
- reference_pr_url: str
-
-
-# Project configurations
-PROJECTS = {
- "transformers": ProjectConfig(
- name="Transformers",
- repo_url="https://github.com/huggingface/transformers",
- api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
- docs_path="docs/source",
- github_issues={"ko": "20179"},
- reference_pr_url="https://github.com/huggingface/transformers/pull/24968"
- ),
- "smolagents": ProjectConfig(
- name="SmolAgents",
- repo_url="https://github.com/huggingface/smolagents",
- api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
- docs_path="docs/source",
- github_issues={"ko": "20179"}, # To be filled when issue is created
- reference_pr_url="https://github.com/huggingface/smolagents/pull/1581" # To be filled with actual PR URL
- )
-}
-
-
-def get_project_config(project_key: str) -> ProjectConfig:
- """Get project configuration by key."""
- if project_key not in PROJECTS:
- raise ValueError(f"Unknown project: {project_key}. Available: {list(PROJECTS.keys())}")
- return PROJECTS[project_key]
-
-
-def get_available_projects() -> list[str]:
- """Get list of available project keys."""
+"""Project configuration for different HuggingFace repositories."""
+
+from dataclasses import dataclass
+from typing import Dict
+
+
+@dataclass
+class ProjectConfig:
+ """Configuration for a specific HuggingFace project."""
+ name: str
+ repo_url: str
+ api_url: str
+ docs_path: str
+ github_issues: Dict[str, str] # language -> issue_id
+ reference_pr_url: str
+
+
+# Project configurations
+PROJECTS = {
+ "transformers": ProjectConfig(
+ name="Transformers",
+ repo_url="https://github.com/huggingface/transformers",
+ api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
+ docs_path="docs/source",
+ github_issues={"ko": "20179"},
+ reference_pr_url="https://github.com/huggingface/transformers/pull/24968"
+ ),
+ "smolagents": ProjectConfig(
+ name="SmolAgents",
+ repo_url="https://github.com/huggingface/smolagents",
+ api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
+ docs_path="docs/source",
+ github_issues={"ko": "20179"}, # To be filled when issue is created
+ reference_pr_url="https://github.com/huggingface/smolagents/pull/1581" # To be filled with actual PR URL
+ )
+}
+
+
+def get_project_config(project_key: str) -> ProjectConfig:
+ """Get project configuration by key."""
+ if project_key not in PROJECTS:
+ raise ValueError(f"Unknown project: {project_key}. Available: {list(PROJECTS.keys())}")
+ return PROJECTS[project_key]
+
+
+def get_available_projects() -> list[str]:
+ """Get list of available project keys."""
return list(PROJECTS.keys())
\ No newline at end of file
diff --git a/translator/prompt_glossary.py b/translator/prompt_glossary.py
index 859f3cd..261f30f 100644
--- a/translator/prompt_glossary.py
+++ b/translator/prompt_glossary.py
@@ -1,126 +1,126 @@
-PROMPT_WITH_GLOSSARY = """
-You have a glossary of terms with their Korean translations. When translating a sentence, you need to check if any of the words in the sentence are in the glossary, and if so, translate them according to the provided Korean terms. Here is the glossary:
-
-🔹 Glossary (English → Korean):
-- revision: 개정
-- method: 메소드
-- secrets: 비밀값
-- search helper: 검색 헬퍼
-- logging level: 로그 레벨
-- workflow: 워크플로우
-- corner case: 코너 케이스
-- tokenization: 토큰화
-- architecture: 아키텍처
-- attention mask: 어텐션 마스크
-- backbone: 백본
-- argmax: argmax
-- beam search: 빔 서치
-- clustering: 군집화
-- configuration: 구성
-- context: 문맥
-- cross entropy: 교차 엔트로피
-- cross-attention: 크로스 어텐션
-- dictionary: 딕셔너리
-- entry: 엔트리
-- few shot: 퓨샷
-- flatten: 평탄화
-- ground truth: 정답
-- head: 헤드
-- helper function: 헬퍼 함수
-- image captioning: 이미지 캡셔닝
-- image patch: 이미지 패치
-- inference: 추론
-- instance: 인스턴스
-- Instantiate: 인스턴스화
-- knowledge distillation: 지식 증류
-- labels: 레이블
-- large language models (LLM): 대규모 언어 모델
-- layer: 레이어
-- learning rate scheduler: Learning Rate Scheduler
-- localization: 로컬리제이션
-- log mel-filter bank: 로그 멜 필터 뱅크
-- look-up table: 룩업 테이블
-- loss function: 손실 함수
-- machine learning: 머신 러닝
-- mapping: 매핑
-- masked language modeling (MLM): 마스크드 언어 모델
-- malware: 악성코드
-- metric: 지표
-- mixed precision: 혼합 정밀도
-- modality: 모달리티
-- monolingual model: 단일 언어 모델
-- multi gpu: 다중 GPU
-- multilingual model: 다국어 모델
-- parsing: 파싱
-- perplexity (PPL): 펄플렉서티(Perplexity)
-- pipeline: 파이프라인
-- pixel values: 픽셀 값
-- pooling: 풀링
-- position IDs: 위치 ID
-- preprocessing: 전처리
-- prompt: 프롬프트
-- pythonic: 파이써닉
-- query: 쿼리
-- question answering: 질의 응답
-- raw audio waveform: 원시 오디오 파형
-- recurrent neural network (RNN): 순환 신경망
-- accelerator: 가속기
-- Accelerate: Accelerate
-- architecture: 아키텍처
-- arguments: 인수
-- attention mask: 어텐션 마스크
-- augmentation: 증강
-- autoencoding models: 오토인코딩 모델
-- autoregressive models: 자기회귀 모델
-- backward: 역방향
-- bounding box: 바운딩 박스
-- causal language modeling: 인과적 언어 모델링(causal language modeling)
-- channel: 채널
-- checkpoint: 체크포인트(checkpoint)
-- chunk: 묶음
-- computer vision: 컴퓨터 비전
-- convolution: 합성곱
-- crop: 자르기
-- custom: 사용자 정의
-- customize: 맞춤 설정하다
-- data collator: 데이터 콜레이터
-- dataset: 데이터 세트
-- decoder input IDs: 디코더 입력 ID
-- decoder models: 디코더 모델
-- deep learning (DL): 딥러닝
-- directory: 디렉터리
-- distributed training: 분산 학습
-- downstream: 다운스트림
-- encoder models: 인코더 모델
-- entity: 개체
-- epoch: 에폭
-- evaluation method: 평가 방법
-- feature extraction: 특성 추출
-- feature matrix: 특성 행렬(feature matrix)
-- fine-tunning: 미세 조정
-- finetuned models: 미세 조정 모델
-- hidden state: 은닉 상태
-- hyperparameter: 하이퍼파라미터
-- learning: 학습
-- load: 가져오다
-- method: 메소드
-- optimizer: 옵티마이저
-- pad (padding): 패드 (패딩)
-- parameter: 매개변수
-- pretrained model: 사전훈련된 모델
-- separator (* [SEP]를 부르는 이름): 분할 토큰
-- sequence: 시퀀스
-- silent error: 조용한 오류
-- token: 토큰
-- tokenizer: 토크나이저
-- training: 훈련
-- workflow: 워크플로우
-
-📌 Instructions:
-1. Whenever a source term from the glossary appears **in any form** (full match or partial match within a larger phrase), **replace it with the exact Korean translation** from the glossary, keeping the rest of the phrase in Korean.
- - Example: “Attention Interface” → “어텐션 인터페이스”
- - Example: “Architecture details” → “아키텍처 상세”
-2. Non-glossary words should be translated naturally, respecting context and technical nuance.
-
-Please revise the translated sentences accordingly using the terms provided in this glossary.
-"""
+PROMPT_WITH_GLOSSARY = """
+You have a glossary of terms with their Korean translations. When translating a sentence, you need to check if any of the words in the sentence are in the glossary, and if so, translate them according to the provided Korean terms. Here is the glossary:
+
+🔹 Glossary (English → Korean):
+- revision: 개정
+- method: 메소드
+- secrets: 비밀값
+- search helper: 검색 헬퍼
+- logging level: 로그 레벨
+- workflow: 워크플로우
+- corner case: 코너 케이스
+- tokenization: 토큰화
+- architecture: 아키텍처
+- attention mask: 어텐션 마스크
+- backbone: 백본
+- argmax: argmax
+- beam search: 빔 서치
+- clustering: 군집화
+- configuration: 구성
+- context: 문맥
+- cross entropy: 교차 엔트로피
+- cross-attention: 크로스 어텐션
+- dictionary: 딕셔너리
+- entry: 엔트리
+- few shot: 퓨샷
+- flatten: 평탄화
+- ground truth: 정답
+- head: 헤드
+- helper function: 헬퍼 함수
+- image captioning: 이미지 캡셔닝
+- image patch: 이미지 패치
+- inference: 추론
+- instance: 인스턴스
+- Instantiate: 인스턴스화
+- knowledge distillation: 지식 증류
+- labels: 레이블
+- large language models (LLM): 대규모 언어 모델
+- layer: 레이어
+- learning rate scheduler: Learning Rate Scheduler
+- localization: 로컬리제이션
+- log mel-filter bank: 로그 멜 필터 뱅크
+- look-up table: 룩업 테이블
+- loss function: 손실 함수
+- machine learning: 머신 러닝
+- mapping: 매핑
+- masked language modeling (MLM): 마스크드 언어 모델
+- malware: 악성코드
+- metric: 지표
+- mixed precision: 혼합 정밀도
+- modality: 모달리티
+- monolingual model: 단일 언어 모델
+- multi gpu: 다중 GPU
+- multilingual model: 다국어 모델
+- parsing: 파싱
+- perplexity (PPL): 펄플렉서티(Perplexity)
+- pipeline: 파이프라인
+- pixel values: 픽셀 값
+- pooling: 풀링
+- position IDs: 위치 ID
+- preprocessing: 전처리
+- prompt: 프롬프트
+- pythonic: 파이써닉
+- query: 쿼리
+- question answering: 질의 응답
+- raw audio waveform: 원시 오디오 파형
+- recurrent neural network (RNN): 순환 신경망
+- accelerator: 가속기
+- Accelerate: Accelerate
+- architecture: 아키텍처
+- arguments: 인수
+- attention mask: 어텐션 마스크
+- augmentation: 증강
+- autoencoding models: 오토인코딩 모델
+- autoregressive models: 자기회귀 모델
+- backward: 역방향
+- bounding box: 바운딩 박스
+- causal language modeling: 인과적 언어 모델링(causal language modeling)
+- channel: 채널
+- checkpoint: 체크포인트(checkpoint)
+- chunk: 묶음
+- computer vision: 컴퓨터 비전
+- convolution: 합성곱
+- crop: 자르기
+- custom: 사용자 정의
+- customize: 맞춤 설정하다
+- data collator: 데이터 콜레이터
+- dataset: 데이터 세트
+- decoder input IDs: 디코더 입력 ID
+- decoder models: 디코더 모델
+- deep learning (DL): 딥러닝
+- directory: 디렉터리
+- distributed training: 분산 학습
+- downstream: 다운스트림
+- encoder models: 인코더 모델
+- entity: 개체
+- epoch: 에폭
+- evaluation method: 평가 방법
+- feature extraction: 특성 추출
+- feature matrix: 특성 행렬(feature matrix)
+- fine-tunning: 미세 조정
+- finetuned models: 미세 조정 모델
+- hidden state: 은닉 상태
+- hyperparameter: 하이퍼파라미터
+- learning: 학습
+- load: 가져오다
+- method: 메소드
+- optimizer: 옵티마이저
+- pad (padding): 패드 (패딩)
+- parameter: 매개변수
+- pretrained model: 사전훈련된 모델
+- separator (* [SEP]를 부르는 이름): 분할 토큰
+- sequence: 시퀀스
+- silent error: 조용한 오류
+- token: 토큰
+- tokenizer: 토크나이저
+- training: 훈련
+- workflow: 워크플로우
+
+📌 Instructions:
+1. Whenever a source term from the glossary appears **in any form** (full match or partial match within a larger phrase), **replace it with the exact Korean translation** from the glossary, keeping the rest of the phrase in Korean.
+ - Example: “Attention Interface” → “어텐션 인터페이스”
+ - Example: “Architecture details” → “아키텍처 상세”
+2. Non-glossary words should be translated naturally, respecting context and technical nuance.
+
+Please revise the translated sentences accordingly using the terms provided in this glossary.
+"""
diff --git a/translator/retriever.py b/translator/retriever.py
index 0af9584..3cdb267 100644
--- a/translator/retriever.py
+++ b/translator/retriever.py
@@ -1,199 +1,199 @@
-import re
-import os
-from pathlib import Path
-
-import requests
-
-from .model import Languages, Summary, TranslationDoc
-from .project_config import get_project_config
-
-
-def get_github_repo_files(project: str = "transformers"):
- """
- Get github repo files
- """
- config = get_project_config(project)
-
- # Add GitHub token if available to avoid rate limiting (optional)
- headers = {}
- github_token = os.environ.get("GITHUB_TOKEN")
- if github_token:
- headers["Authorization"] = f"token {github_token}"
-
- response = requests.get(config.api_url, headers=headers)
-
- # Handle rate limit with helpful message
- if response.status_code == 403 and "rate limit" in response.text.lower():
- raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
-
- data = response.json()
- all_items = data.get("tree", [])
-
- file_paths = [
- item["path"]
- for item in all_items
- if item["type"] == "blob" and (item["path"].startswith("docs"))
- ]
- return file_paths
-
-
-def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
- """
- Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
- """
- config = get_project_config(project)
- issue_id = config.github_issues.get(lang)
-
- # For projects without GitHub issue tracking, still search for PRs
- if not issue_id:
- raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
-
- # Require all_files parameter
- if all_files is None:
- raise ValueError("Repository file list must be provided")
-
- headers = {
- "Accept": "application/vnd.github+json",
- }
-
- # Add GitHub token if available to avoid rate limiting (optional)
- github_token = os.environ.get("GITHUB_TOKEN")
- if github_token:
- headers["Authorization"] = f"token {github_token}"
-
- all_open_prs = []
- page = 1
- per_page = 100 # Maximum allowed by GitHub API
-
- while True:
- repo_path = config.repo_url.replace("https://github.com/", "")
- url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
- response = requests.get(url, headers=headers)
-
- if response.status_code == 403 and "rate limit" in response.text.lower():
- raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
- elif response.status_code != 200:
- raise Exception(f"GitHub API error: {response.status_code} {response.text}")
-
- page_prs = response.json()
- if not page_prs: # No more PRs
- break
-
- all_open_prs.extend(page_prs)
- page += 1
-
- # Break if we got less than per_page results (last page)
- if len(page_prs) < per_page:
- break
-
- filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
-
- # Pattern to match filenames after "Translated" keyword
- pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")
-
- def find_original_file_path(filename_from_title, all_files):
- """Find the exact file path from repo files by matching filename"""
- if not filename_from_title:
- return None
-
- # Remove .md extension for matching
- base_name = filename_from_title.replace('.md', '')
-
- # Look for exact matches in repo files
- for file_path in all_files:
- if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
- file_base = file_path.split("/")[-1].replace('.md', '')
- if file_base == base_name:
- return file_path
-
- # If no exact match, fallback to simple path
- return f"docs/source/en/{filename_from_title}"
-
- filenames = []
- pr_info_list = []
-
- for pr in filtered_prs:
- match = pattern.search(pr["title"])
- if match:
- # Use group 1 (with backticks) or group 2 (without backticks)
- filename = match.group(1) or match.group(2)
- # Add .md extension if not present
- if not filename.endswith('.md'):
- filename += '.md'
-
- # Find the correct file path by matching filename
- correct_path = None
- if filename:
- # Remove .md extension for matching
- base_name = filename.replace('.md', '')
-
- # Look for exact matches in repo files
- for file_path in all_files:
- if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
- file_base = file_path.split("/")[-1].replace('.md', '')
- if file_base == base_name:
- correct_path = file_path
- break
-
- # If no exact match, fallback to simple path
- if not correct_path:
- correct_path = f"docs/source/en/{filename}"
- if correct_path:
- filenames.append(correct_path)
- pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
- return filenames, pr_info_list
-
-
-def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
- """
- Retrieve missing docs
- """
-
- report = f"""
-| Item | Count | Percentage |
-|------|-------|------------|
-| 📂 HuggingFaces docs | {summary.files_analyzed} | - |
-| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
-"""
- print(report)
- first_missing_docs = list()
- for file in summary.first_missing_translation_files(table_size):
- first_missing_docs.append(file.original_file)
-
- print(first_missing_docs)
- return report, first_missing_docs
-
-
-def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
- """
- Generate a report for the translated docs
- """
- if docs_file is None:
- raise ValueError("Repository file list must be provided")
-
- base_docs_path = Path("docs/source")
- en_docs_path = Path("docs/source/en")
-
- lang = Languages[target_lang]
- summary = Summary(lang=lang.value)
-
- for file in docs_file:
- if file.endswith(".md"):
- try:
- file_relative_path = Path(file).relative_to(en_docs_path)
- except ValueError:
- continue
-
- translated_path = os.path.join(
- base_docs_path, lang.value, file_relative_path
- )
- translation_exists = translated_path in docs_file
-
- doc = TranslationDoc(
- translation_lang=lang.value,
- original_file=file,
- translation_file=translated_path,
- translation_exists=translation_exists,
- )
- summary.append_file(doc)
- return retrieve(summary, top_k)
+import re
+import os
+from pathlib import Path
+
+import requests
+
+from .model import Languages, Summary, TranslationDoc
+from .project_config import get_project_config
+
+
+def get_github_repo_files(project: str = "transformers"):
+ """
+ Get github repo files
+ """
+ config = get_project_config(project)
+
+ # Add GitHub token if available to avoid rate limiting (optional)
+ headers = {}
+ github_token = os.environ.get("GITHUB_TOKEN")
+ if github_token:
+ headers["Authorization"] = f"token {github_token}"
+
+ response = requests.get(config.api_url, headers=headers)
+
+ # Handle rate limit with helpful message
+ if response.status_code == 403 and "rate limit" in response.text.lower():
+ raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
+
+ data = response.json()
+ all_items = data.get("tree", [])
+
+ file_paths = [
+ item["path"]
+ for item in all_items
+ if item["type"] == "blob" and (item["path"].startswith("docs"))
+ ]
+ return file_paths
+
+
+def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
+ """
+ Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
+ """
+ config = get_project_config(project)
+ issue_id = config.github_issues.get(lang)
+
+ # For projects without GitHub issue tracking, still search for PRs
+ if not issue_id:
+ raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
+
+ # Require all_files parameter
+ if all_files is None:
+ raise ValueError("Repository file list must be provided")
+
+ headers = {
+ "Accept": "application/vnd.github+json",
+ }
+
+ # Add GitHub token if available to avoid rate limiting (optional)
+ github_token = os.environ.get("GITHUB_TOKEN")
+ if github_token:
+ headers["Authorization"] = f"token {github_token}"
+
+ all_open_prs = []
+ page = 1
+ per_page = 100 # Maximum allowed by GitHub API
+
+ while True:
+ repo_path = config.repo_url.replace("https://github.com/", "")
+ url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
+ response = requests.get(url, headers=headers)
+
+ if response.status_code == 403 and "rate limit" in response.text.lower():
+ raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
+ elif response.status_code != 200:
+ raise Exception(f"GitHub API error: {response.status_code} {response.text}")
+
+ page_prs = response.json()
+ if not page_prs: # No more PRs
+ break
+
+ all_open_prs.extend(page_prs)
+ page += 1
+
+ # Break if we got less than per_page results (last page)
+ if len(page_prs) < per_page:
+ break
+
+ filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
+
+ # Pattern to match filenames after "Translated" keyword
+ pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")
+
+ def find_original_file_path(filename_from_title, all_files):
+ """Find the exact file path from repo files by matching filename"""
+ if not filename_from_title:
+ return None
+
+ # Remove .md extension for matching
+ base_name = filename_from_title.replace('.md', '')
+
+ # Look for exact matches in repo files
+ for file_path in all_files:
+ if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
+ file_base = file_path.split("/")[-1].replace('.md', '')
+ if file_base == base_name:
+ return file_path
+
+ # If no exact match, fallback to simple path
+ return f"docs/source/en/{filename_from_title}"
+
+ filenames = []
+ pr_info_list = []
+
+ for pr in filtered_prs:
+ match = pattern.search(pr["title"])
+ if match:
+ # Use group 1 (with backticks) or group 2 (without backticks)
+ filename = match.group(1) or match.group(2)
+ # Add .md extension if not present
+ if not filename.endswith('.md'):
+ filename += '.md'
+
+ # Find the correct file path by matching filename
+ correct_path = None
+ if filename:
+ # Remove .md extension for matching
+ base_name = filename.replace('.md', '')
+
+ # Look for exact matches in repo files
+ for file_path in all_files:
+ if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
+ file_base = file_path.split("/")[-1].replace('.md', '')
+ if file_base == base_name:
+ correct_path = file_path
+ break
+
+ # If no exact match, fallback to simple path
+ if not correct_path:
+ correct_path = f"docs/source/en/{filename}"
+ if correct_path:
+ filenames.append(correct_path)
+ pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
+ return filenames, pr_info_list
+
+
+def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
+ """
+ Retrieve missing docs
+ """
+
+ report = f"""
+| Item | Count | Percentage |
+|------|-------|------------|
+| 📂 HuggingFaces docs | {summary.files_analyzed} | - |
+| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
+"""
+ print(report)
+ first_missing_docs = list()
+ for file in summary.first_missing_translation_files(table_size):
+ first_missing_docs.append(file.original_file)
+
+ print(first_missing_docs)
+ return report, first_missing_docs
+
+
+def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
+ """
+ Generate a report for the translated docs
+ """
+ if docs_file is None:
+ raise ValueError("Repository file list must be provided")
+
+ base_docs_path = Path("docs/source")
+ en_docs_path = Path("docs/source/en")
+
+ lang = Languages[target_lang]
+ summary = Summary(lang=lang.value)
+
+ for file in docs_file:
+ if file.endswith(".md"):
+ try:
+ file_relative_path = Path(file).relative_to(en_docs_path)
+ except ValueError:
+ continue
+
+ translated_path = os.path.join(
+ base_docs_path, lang.value, file_relative_path
+ )
+ translation_exists = translated_path in docs_file
+
+ doc = TranslationDoc(
+ translation_lang=lang.value,
+ original_file=file,
+ translation_file=translated_path,
+ translation_exists=translation_exists,
+ )
+ summary.append_file(doc)
+ return retrieve(summary, top_k)