Spaces:
Runtime error
Runtime error
Add MCP servers under external/ (docs-explorer & translation-reviewer)2
Browse files- .github/workflows/main.yml +82 -82
- .gitignore +3 -3
- README.md +307 -307
- agent/handler.py +639 -639
- agent/toctree_handler.py +419 -419
- agent/workflow.py +338 -338
- app.py +379 -379
- config.py +9 -9
- example.env +18 -18
- logger/github_logger.py +71 -71
- pr_generator/agent.py +596 -596
- pr_generator/searcher.py +238 -238
- requirements.txt +10 -10
- test/test_final_translate.md +127 -127
- test/test_prompt.py +71 -71
- test/test_translate.py +68 -68
- translation_result/docs/source/en/accelerator_selection.md +126 -126
- translator/content.py +214 -214
- translator/model.py +70 -70
- translator/project_config.py +47 -47
- translator/prompt_glossary.py +126 -126
- translator/retriever.py +199 -199
.github/workflows/main.yml
CHANGED
|
@@ -1,82 +1,82 @@
|
|
| 1 |
-
name: Deploy to HF Space
|
| 2 |
-
|
| 3 |
-
on:
|
| 4 |
-
push:
|
| 5 |
-
branches: [mcp/docs-search-review]
|
| 6 |
-
workflow_dispatch:
|
| 7 |
-
|
| 8 |
-
jobs:
|
| 9 |
-
deploy:
|
| 10 |
-
runs-on: ubuntu-latest
|
| 11 |
-
|
| 12 |
-
steps:
|
| 13 |
-
- name: Checkout repository
|
| 14 |
-
uses: actions/checkout@v3
|
| 15 |
-
with:
|
| 16 |
-
fetch-depth: 0
|
| 17 |
-
lfs: true
|
| 18 |
-
ref: mcp/docs-search-review # 추후 main 수정
|
| 19 |
-
|
| 20 |
-
- name: Setup LFS & migrate images
|
| 21 |
-
run: |
|
| 22 |
-
git config --global user.email "[email protected]"
|
| 23 |
-
git config --global user.name "GitHub Actions"
|
| 24 |
-
git lfs install
|
| 25 |
-
git lfs track "images/**"
|
| 26 |
-
echo "images/** filter=lfs diff=lfs merge=lfs -text" >> .gitattributes
|
| 27 |
-
git add .gitattributes
|
| 28 |
-
git commit -m "Add images to LFS tracking" || echo "No changes"
|
| 29 |
-
git add -A
|
| 30 |
-
git diff --cached --quiet || git commit -m "Pre-migrate: commit all changes" || echo "No changes"
|
| 31 |
-
git lfs migrate import --include="images/**" --include-ref=refs/heads/mcp/docs-search-review # 추후 main 수정
|
| 32 |
-
|
| 33 |
-
- name: Deploy to Hugging Face Space
|
| 34 |
-
env:
|
| 35 |
-
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 36 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 37 |
-
HF_SPACE_NAME: ${{ secrets.HF_SPACE_NAME }}
|
| 38 |
-
run: |
|
| 39 |
-
git remote add space https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME
|
| 40 |
-
git push --force space mcp/docs-search-review:main # 추후 main 수정
|
| 41 |
-
|
| 42 |
-
deploy_docs_explorer:
|
| 43 |
-
runs-on: ubuntu-latest
|
| 44 |
-
|
| 45 |
-
steps:
|
| 46 |
-
- name: Checkout repository
|
| 47 |
-
uses: actions/checkout@v3
|
| 48 |
-
with:
|
| 49 |
-
fetch-depth: 0
|
| 50 |
-
ref: mcp/docs-search-review # 추후 main 수정
|
| 51 |
-
|
| 52 |
-
- name: Push hf-translation-docs-explorer to HF Space
|
| 53 |
-
env:
|
| 54 |
-
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 55 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 56 |
-
HF_SPACE_NAME_DOCS_EXPLORER: ${{ secrets.HF_SPACE_NAME_DOCS_EXPLORER }}
|
| 57 |
-
run: |
|
| 58 |
-
git subtree split --prefix=external/mcp-servers/hf-translation-docs-explorer -b docs-explorer-branch
|
| 59 |
-
|
| 60 |
-
git remote add space-docs-explorer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_DOCS_EXPLORER
|
| 61 |
-
git push --force space-docs-explorer docs-explorer-branch:main
|
| 62 |
-
|
| 63 |
-
deploy_translation_reviewer:
|
| 64 |
-
runs-on: ubuntu-latest
|
| 65 |
-
|
| 66 |
-
steps:
|
| 67 |
-
- name: Checkout repository
|
| 68 |
-
uses: actions/checkout@v3
|
| 69 |
-
with:
|
| 70 |
-
fetch-depth: 0
|
| 71 |
-
ref: mcp/docs-search-review # 추후 main 수정
|
| 72 |
-
|
| 73 |
-
- name: Push hf-translation-reviewer to HF Space
|
| 74 |
-
env:
|
| 75 |
-
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 76 |
-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 77 |
-
HF_SPACE_NAME_TRANSLATION_REVIEWER: ${{ secrets.HF_SPACE_NAME_TRANSLATION_REVIEWER }}
|
| 78 |
-
run: |
|
| 79 |
-
git subtree split --prefix=external/mcp-servers/hf-translation-reviewer -b translation-reviewer-branch
|
| 80 |
-
|
| 81 |
-
git remote add space-translation-reviewer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_TRANSLATION_REVIEWER
|
| 82 |
-
git push --force space-translation-reviewer translation-reviewer-branch:main
|
|
|
|
| 1 |
+
name: Deploy to HF Space
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [mcp/docs-search-review]
|
| 6 |
+
workflow_dispatch:
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
deploy:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
|
| 12 |
+
steps:
|
| 13 |
+
- name: Checkout repository
|
| 14 |
+
uses: actions/checkout@v3
|
| 15 |
+
with:
|
| 16 |
+
fetch-depth: 0
|
| 17 |
+
lfs: true
|
| 18 |
+
ref: mcp/docs-search-review # 추후 main 수정
|
| 19 |
+
|
| 20 |
+
- name: Setup LFS & migrate images
|
| 21 |
+
run: |
|
| 22 |
+
git config --global user.email "[email protected]"
|
| 23 |
+
git config --global user.name "GitHub Actions"
|
| 24 |
+
git lfs install
|
| 25 |
+
git lfs track "images/**"
|
| 26 |
+
echo "images/** filter=lfs diff=lfs merge=lfs -text" >> .gitattributes
|
| 27 |
+
git add .gitattributes
|
| 28 |
+
git commit -m "Add images to LFS tracking" || echo "No changes"
|
| 29 |
+
git add -A
|
| 30 |
+
git diff --cached --quiet || git commit -m "Pre-migrate: commit all changes" || echo "No changes"
|
| 31 |
+
git lfs migrate import --include="images/**" --include-ref=refs/heads/mcp/docs-search-review # 추후 main 수정
|
| 32 |
+
|
| 33 |
+
- name: Deploy to Hugging Face Space
|
| 34 |
+
env:
|
| 35 |
+
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 36 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 37 |
+
HF_SPACE_NAME: ${{ secrets.HF_SPACE_NAME }}
|
| 38 |
+
run: |
|
| 39 |
+
git remote add space https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME
|
| 40 |
+
git push --force space mcp/docs-search-review:main # 추후 main 수정
|
| 41 |
+
|
| 42 |
+
deploy_docs_explorer:
|
| 43 |
+
runs-on: ubuntu-latest
|
| 44 |
+
|
| 45 |
+
steps:
|
| 46 |
+
- name: Checkout repository
|
| 47 |
+
uses: actions/checkout@v3
|
| 48 |
+
with:
|
| 49 |
+
fetch-depth: 0
|
| 50 |
+
ref: mcp/docs-search-review # 추후 main 수정
|
| 51 |
+
|
| 52 |
+
- name: Push hf-translation-docs-explorer to HF Space
|
| 53 |
+
env:
|
| 54 |
+
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 55 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 56 |
+
HF_SPACE_NAME_DOCS_EXPLORER: ${{ secrets.HF_SPACE_NAME_DOCS_EXPLORER }}
|
| 57 |
+
run: |
|
| 58 |
+
git subtree split --prefix=external/mcp-servers/hf-translation-docs-explorer -b docs-explorer-branch
|
| 59 |
+
|
| 60 |
+
git remote add space-docs-explorer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_DOCS_EXPLORER
|
| 61 |
+
git push --force space-docs-explorer docs-explorer-branch:main
|
| 62 |
+
|
| 63 |
+
deploy_translation_reviewer:
|
| 64 |
+
runs-on: ubuntu-latest
|
| 65 |
+
|
| 66 |
+
steps:
|
| 67 |
+
- name: Checkout repository
|
| 68 |
+
uses: actions/checkout@v3
|
| 69 |
+
with:
|
| 70 |
+
fetch-depth: 0
|
| 71 |
+
ref: mcp/docs-search-review # 추후 main 수정
|
| 72 |
+
|
| 73 |
+
- name: Push hf-translation-reviewer to HF Space
|
| 74 |
+
env:
|
| 75 |
+
HF_USERNAME: ${{ secrets.HF_USERNAME }}
|
| 76 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 77 |
+
HF_SPACE_NAME_TRANSLATION_REVIEWER: ${{ secrets.HF_SPACE_NAME_TRANSLATION_REVIEWER }}
|
| 78 |
+
run: |
|
| 79 |
+
git subtree split --prefix=external/mcp-servers/hf-translation-reviewer -b translation-reviewer-branch
|
| 80 |
+
|
| 81 |
+
git remote add space-translation-reviewer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_TRANSLATION_REVIEWER
|
| 82 |
+
git push --force space-translation-reviewer translation-reviewer-branch:main
|
.gitignore
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
.env
|
| 2 |
-
*/__pycache__/
|
| 3 |
-
pr_success.log
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
*/__pycache__/
|
| 3 |
+
pr_success.log
|
README.md
CHANGED
|
@@ -1,307 +1,307 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: i18n Agent - Contribute in Just 5 Minutes
|
| 3 |
-
emoji: 🤗
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: yellow
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.33.1
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
<div align="center">
|
| 14 |
-
|
| 15 |
-
# 🌐 [i18n-agent] Hugging Face i18n made easy
|
| 16 |
-
|
| 17 |
-
*AI-powered translation agent for Hugging Face Transformers documentation internationalization*
|
| 18 |
-
|
| 19 |
-
[](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent)
|
| 20 |
-
[](https://github.com/topics/agent-demo-track)
|
| 21 |
-
|
| 22 |
-

|
| 23 |
-
|
| 24 |
-
**Streamline Hugging Face Transformers documentation translation with Claude AI**
|
| 25 |
-
• **Auto-generate GitHub PRs** • **Multi-language support**
|
| 26 |
-
|
| 27 |
-
> 🎯 **Created to address [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179)** - Making documentation translation more accessible and automated for the global community.
|
| 28 |
-
|
| 29 |
-
[🚀 Try Live Demo](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent) • [📹 Watch Demo](#-demo-video) • [📖 Documentation](#-quick-start)
|
| 30 |
-
|
| 31 |
-
</div>
|
| 32 |
-
|
| 33 |
-
---
|
| 34 |
-
|
| 35 |
-
## ✨ What is this?
|
| 36 |
-
|
| 37 |
-
Transform the way you contribute to Hugging Face Transformers' global community! This AI agent automatically:
|
| 38 |
-
|
| 39 |
-
- 🔍 **Discovers** missing translations in [Transformers documentation](https://huggingface.co/docs/transformers/en/index)
|
| 40 |
-
- 🤖 **Translates** using Claude Sonnet 4 with technical precision
|
| 41 |
-
- 📝 **Creates** GitHub pull requests ready for review on [huggingface/transformers](https://github.com/huggingface/transformers)
|
| 42 |
-
- 💬 **Guides** you through the entire process
|
| 43 |
-
|
| 44 |
-
> **Perfect for**: Contributors addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179), documentation maintainers, and international communities wanting to make transformer models accessible worldwide.
|
| 45 |
-
|
| 46 |
-
## 🎯 Addressing the Community Need
|
| 47 |
-
|
| 48 |
-
This project was specifically created to solve [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179), which highlights the need for better internationalization tooling. Our agent tackles the core challenges mentioned in the issue:
|
| 49 |
-
|
| 50 |
-
- **🚧 Translation Bottlenecks**: Automates the manual translation process
|
| 51 |
-
- **📊 Consistency Issues**: Maintains uniform translation quality within each languages
|
| 52 |
-
- **⚡ Scalability Problems**: Handles batch translations efficiently
|
| 53 |
-
- **🤝 Contributor Barriers**: Simplifies the PR creation workflow for translators
|
| 54 |
-
|
| 55 |
-
## 🎥 Demo Video
|
| 56 |
-
|
| 57 |
-
[Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
|
| 58 |
-
|
| 59 |
-
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
| 60 |
-
|
| 61 |
-
## 🚀 Quick Start
|
| 62 |
-
|
| 63 |
-

|
| 64 |
-
|
| 65 |
-
### Option 1: One-Click Demo (Recommended)
|
| 66 |
-
[](https://huggingface.co/spaces/YOUR_USERNAME/i18n-agent)
|
| 67 |
-
|
| 68 |
-
✅ **No setup required!** Just click and start translating.
|
| 69 |
-
✅ **Pre-requisites!** Need `Anthropic API key` and `Github token`.
|
| 70 |
-
|
| 71 |
-
### Option 2: Run Locally
|
| 72 |
-
|
| 73 |
-
<details>
|
| 74 |
-
<summary>🛠️ Local Installation Guide</summary>
|
| 75 |
-
|
| 76 |
-
```bash
|
| 77 |
-
# Clone the repo
|
| 78 |
-
git clone https://github.com/Hugging-Face-KREW/i18n-agent.git
|
| 79 |
-
cd i18n-agent
|
| 80 |
-
|
| 81 |
-
# Install dependencies
|
| 82 |
-
make install
|
| 83 |
-
source .venv/bin/activate
|
| 84 |
-
|
| 85 |
-
# Set up your keys
|
| 86 |
-
cp .env.example .env
|
| 87 |
-
# Add your Anthropic API key and GitHub token
|
| 88 |
-
|
| 89 |
-
# Launch the app
|
| 90 |
-
python app.py
|
| 91 |
-
```
|
| 92 |
-
|
| 93 |
-
</details>
|
| 94 |
-
|
| 95 |
-
## 🎯 How It Works
|
| 96 |
-
|
| 97 |
-
This agent specifically targets the [Hugging Face Transformers documentation](https://huggingface.co/docs/transformers/en/index) and submits PRs addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179) in the [huggingface/transformers](https://github.com/huggingface/transformers) repository.
|
| 98 |
-
|
| 99 |
-
```mermaid
|
| 100 |
-
graph LR
|
| 101 |
-
A[🔍 Find Files] --> B[🤖 Translate] --> C[📝 Create PR]
|
| 102 |
-
|
| 103 |
-
A --> A1[Scan transformers docs]
|
| 104 |
-
A --> A2[Identify translation gaps]
|
| 105 |
-
|
| 106 |
-
B --> B1[Claude AI translation]
|
| 107 |
-
B --> B2[Preserve formatting]
|
| 108 |
-
|
| 109 |
-
C --> C1[Auto-branch creation]
|
| 110 |
-
C --> C2[Submit to huggingface/transformers]
|
| 111 |
-
```
|
| 112 |
-
|
| 113 |
-
### Step 1: 🔍 Discover Translation Targets
|
| 114 |
-
- Select your target language (Korean, and more languages will be supported)
|
| 115 |
-
- Set how many files to process
|
| 116 |
-
- Let AI identify the most impactful Transformers docs translations
|
| 117 |
-
|
| 118 |
-
### Step 2: 🤖 Smart Translation
|
| 119 |
-
- Enter your Anthropic API key
|
| 120 |
-
- Claude Sonnet 4 translates with context awareness
|
| 121 |
-
- Technical terms and code blocks preserved automatically
|
| 122 |
-
|
| 123 |
-
### Step 3: 📝 Automated PR Creation
|
| 124 |
-
- Configure GitHub credentials
|
| 125 |
-
- System creates properly formatted pull requests for [huggingface/transformers](https://github.com/huggingface/transformers)
|
| 126 |
-
- Optional: Use reference PRs for consistency
|
| 127 |
-
|
| 128 |
-
## 🌍 Supported Languages
|
| 129 |
-
|
| 130 |
-
<div align="center">
|
| 131 |
-
|
| 132 |
-
| Language | Code | Status |
|
| 133 |
-
|----------|------|--------|
|
| 134 |
-
| 🇰🇷 Korean | `ko` | ✅ Fully Supported |
|
| 135 |
-
|
| 136 |
-
*And more languages coming soon...*
|
| 137 |
-
|
| 138 |
-
</div>
|
| 139 |
-
|
| 140 |
-
## 🏗️ Architecture
|
| 141 |
-
|
| 142 |
-
<div align="center">
|
| 143 |
-
<a href="images/workflow.png" target="_blank">
|
| 144 |
-
<img src="images/workflow.png" alt="KREW x Hugging Face Logo" width="400">
|
| 145 |
-
</a>
|
| 146 |
-
</div>
|
| 147 |
-
|
| 148 |
-
<details>
|
| 149 |
-
|
| 150 |
-
<summary>📊 System Design Overview</summary>
|
| 151 |
-
|
| 152 |
-
**Frontend Layer**
|
| 153 |
-
- Gradio web interface with modern styling
|
| 154 |
-
- Real-time chat & quick controls with AI agent
|
| 155 |
-
- Progress tracking missing
|
| 156 |
-
|
| 157 |
-
**AI Processing Layer**
|
| 158 |
-
- File discovery with intelligent prioritization for Transformers docs
|
| 159 |
-
- Claude Sonnet 4 for context-aware translation
|
| 160 |
-
- LangChain integration for PR research
|
| 161 |
-
|
| 162 |
-
**Integration Layer**
|
| 163 |
-
- GitHub API for automated PR creation to [huggingface/transformers](https://github.com/huggingface/transformers)
|
| 164 |
-
- Branch management and commit structuring
|
| 165 |
-
- Template matching from reference PRs
|
| 166 |
-
|
| 167 |
-
</details>
|
| 168 |
-
|
| 169 |
-
## ⚙️ Configuration
|
| 170 |
-
|
| 171 |
-
### For Spaces Deployment
|
| 172 |
-
Prepare these secrets:
|
| 173 |
-
|
| 174 |
-
```bash
|
| 175 |
-
ANTHROPIC_API_KEY=your_claude_api_key
|
| 176 |
-
GITHUB_TOKEN=your_github_token
|
| 177 |
-
```
|
| 178 |
-
|
| 179 |
-
### For Local Development
|
| 180 |
-
Create `.env` file:
|
| 181 |
-
|
| 182 |
-
```bash
|
| 183 |
-
ANTHROPIC_API_KEY=<your api key>
|
| 184 |
-
|
| 185 |
-
# GitHub PR Agent Configuration
|
| 186 |
-
GITHUB_TOKEN=<your github token>
|
| 187 |
-
GITHUB_OWNER=<your github username>
|
| 188 |
-
GITHUB_REPO=<your repository name>
|
| 189 |
-
REFERENCE_PR_URL=<reference pr url for style analysis>
|
| 190 |
-
```
|
| 191 |
-
|
| 192 |
-
## 🤝 Contributing
|
| 193 |
-
|
| 194 |
-
<div align="center">
|
| 195 |
-
|
| 196 |
-
**Love this project? Here's how you can help:**
|
| 197 |
-
|
| 198 |
-
[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 199 |
-
[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 200 |
-
[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 201 |
-
|
| 202 |
-
</div>
|
| 203 |
-
|
| 204 |
-
### 👥 Contributors
|
| 205 |
-
|
| 206 |
-
🤗 [email protected] / @harheem
|
| 207 |
-
🤗 [email protected] / @Jwaminju
|
| 208 |
-
|
| 209 |
-
## 💡 Use Cases
|
| 210 |
-
|
| 211 |
-
> **🌟 Real-world scenarios where this agent shines:**
|
| 212 |
-
|
| 213 |
-
- **📚 Documentation Teams**: Batch translate Transformers documentation updates
|
| 214 |
-
- **🌍 Community Contributors**: Help make Transformers accessible in your language
|
| 215 |
-
- **🏢 Organizations**: Streamline i18n workflows for Transformers library
|
| 216 |
-
- **👨💻 Developers**: Contribute Transformers translations without manual GitHub workflow
|
| 217 |
-
- **🎯 Issue #20179 Contributors**: Directly address the internationalization challenges raised by the community
|
| 218 |
-
|
| 219 |
-
## 🛠️ Tech Stack
|
| 220 |
-
|
| 221 |
-
<div align="center">
|
| 222 |
-
|
| 223 |
-

|
| 224 |
-

|
| 225 |
-

|
| 226 |
-

|
| 227 |
-

|
| 228 |
-
|
| 229 |
-
</div>
|
| 230 |
-
|
| 231 |
-
## ❓ FAQ
|
| 232 |
-
|
| 233 |
-
<details>
|
| 234 |
-
<summary><strong>Q: How does this relate to Issue #20179?</strong></summary>
|
| 235 |
-
<br>
|
| 236 |
-
This agent directly addresses the pain points raised in <a href="https://github.com/huggingface/transformers/issues/20179">Issue #20179</a> by automating the translation workflow, reducing manual overhead, and making it easier for contributors to submit high-quality translations.
|
| 237 |
-
</details>
|
| 238 |
-
|
| 239 |
-
<details>
|
| 240 |
-
<summary><strong>Q: How accurate are the translations?</strong></summary>
|
| 241 |
-
<br>
|
| 242 |
-
The agent uses Claude Sonnet 4, which provides high-quality translations with technical context awareness. It preserves code blocks, maintains formatting, and follows established translation patterns.
|
| 243 |
-
</details>
|
| 244 |
-
|
| 245 |
-
<details>
|
| 246 |
-
<summary><strong>Q: What permissions do I need for GitHub integration?</strong></summary>
|
| 247 |
-
<br>
|
| 248 |
-
Your GitHub token needs repository read/write permissions and the ability to create branches and pull requests on the target repository.
|
| 249 |
-
</details>
|
| 250 |
-
|
| 251 |
-
<details>
|
| 252 |
-
<summary><strong>Q: Can I customize the translation style?</strong></summary>
|
| 253 |
-
<br>
|
| 254 |
-
Yes! You can provide reference PR URLs to match existing translation patterns and maintain consistency with community standards.
|
| 255 |
-
</details>
|
| 256 |
-
|
| 257 |
-
## 🐛 Troubleshooting
|
| 258 |
-
|
| 259 |
-
### Common Issues
|
| 260 |
-
|
| 261 |
-
<details>
|
| 262 |
-
<summary><strong>API Key Issues</strong></summary>
|
| 263 |
-
|
| 264 |
-
- Ensure your Anthropic API key is valid and has sufficient credits
|
| 265 |
-
- Check that your GitHub token has the necessary repository permissions
|
| 266 |
-
|
| 267 |
-
</details>
|
| 268 |
-
|
| 269 |
-
<details>
|
| 270 |
-
<summary><strong>Translation Quality</strong></summary>
|
| 271 |
-
|
| 272 |
-
- The system uses Claude Sonnet 4 for high-quality translations
|
| 273 |
-
- Formatting and markdown structure is maintained
|
| 274 |
-
- Please restart the translation again if you met format issue
|
| 275 |
-
|
| 276 |
-
</details>
|
| 277 |
-
|
| 278 |
-
<details>
|
| 279 |
-
<summary><strong>GitHub PR Creation</strong></summary>
|
| 280 |
-
|
| 281 |
-
- Verify repository permissions and branch protection rules
|
| 282 |
-
- Check that the reference PR URL is accessible and valid
|
| 283 |
-
|
| 284 |
-
</details>
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
## 🙏 Acknowledgments
|
| 288 |
-
|
| 289 |
-
Special thanks to the amazing communities that make this possible:
|
| 290 |
-
|
| 291 |
-
- **🤗 Hugging Face** - For building the Transformers library and comprehensive documentation
|
| 292 |
-
- **🎭 Anthropic** - For Claude's incredible language capabilities
|
| 293 |
-
- **👥 Hugging Face KREW Community** - For championing Korean AI translation
|
| 294 |
-
- **🎨 Gradio** - For making beautiful AI interfaces simple
|
| 295 |
-
- **🌍 Community Contributors** - For raising awareness through [Issue #20179](https://github.com/huggingface/transformers/issues/20179)
|
| 296 |
-
|
| 297 |
-
---
|
| 298 |
-
|
| 299 |
-
<div align="center">
|
| 300 |
-
|
| 301 |
-
**Made with ❤️ for global accessibility of Hugging Face Transformers documentation.**
|
| 302 |
-
|
| 303 |
-
**🎯 Solving [Issue #20179](https://github.com/huggingface/transformers/issues/20179) one translation at a time.**
|
| 304 |
-
|
| 305 |
-
[⭐ Star this repo](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [🐛 Report Bug](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [💡 Request Feature](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 306 |
-
|
| 307 |
-
</div>
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: i18n Agent - Contribute in Just 5 Minutes
|
| 3 |
+
emoji: 🤗
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.33.1
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
<div align="center">
|
| 14 |
+
|
| 15 |
+
# 🌐 [i18n-agent] Hugging Face i18n made easy
|
| 16 |
+
|
| 17 |
+
*AI-powered translation agent for Hugging Face Transformers documentation internationalization*
|
| 18 |
+
|
| 19 |
+
[](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent)
|
| 20 |
+
[](https://github.com/topics/agent-demo-track)
|
| 21 |
+
|
| 22 |
+

|
| 23 |
+
|
| 24 |
+
**Streamline Hugging Face Transformers documentation translation with Claude AI**
|
| 25 |
+
• **Auto-generate GitHub PRs** • **Multi-language support**
|
| 26 |
+
|
| 27 |
+
> 🎯 **Created to address [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179)** - Making documentation translation more accessible and automated for the global community.
|
| 28 |
+
|
| 29 |
+
[🚀 Try Live Demo](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent) • [📹 Watch Demo](#-demo-video) • [📖 Documentation](#-quick-start)
|
| 30 |
+
|
| 31 |
+
</div>
|
| 32 |
+
|
| 33 |
+
---
|
| 34 |
+
|
| 35 |
+
## ✨ What is this?
|
| 36 |
+
|
| 37 |
+
Transform the way you contribute to Hugging Face Transformers' global community! This AI agent automatically:
|
| 38 |
+
|
| 39 |
+
- 🔍 **Discovers** missing translations in [Transformers documentation](https://huggingface.co/docs/transformers/en/index)
|
| 40 |
+
- 🤖 **Translates** using Claude Sonnet 4 with technical precision
|
| 41 |
+
- 📝 **Creates** GitHub pull requests ready for review on [huggingface/transformers](https://github.com/huggingface/transformers)
|
| 42 |
+
- 💬 **Guides** you through the entire process
|
| 43 |
+
|
| 44 |
+
> **Perfect for**: Contributors addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179), documentation maintainers, and international communities wanting to make transformer models accessible worldwide.
|
| 45 |
+
|
| 46 |
+
## 🎯 Addressing the Community Need
|
| 47 |
+
|
| 48 |
+
This project was specifically created to solve [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179), which highlights the need for better internationalization tooling. Our agent tackles the core challenges mentioned in the issue:
|
| 49 |
+
|
| 50 |
+
- **🚧 Translation Bottlenecks**: Automates the manual translation process
|
| 51 |
+
- **📊 Consistency Issues**: Maintains uniform translation quality within each language
|
| 52 |
+
- **⚡ Scalability Problems**: Handles batch translations efficiently
|
| 53 |
+
- **🤝 Contributor Barriers**: Simplifies the PR creation workflow for translators
|
| 54 |
+
|
| 55 |
+
## 🎥 Demo Video
|
| 56 |
+
|
| 57 |
+
[Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
|
| 58 |
+
|
| 59 |
+
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
| 60 |
+
|
| 61 |
+
## 🚀 Quick Start
|
| 62 |
+
|
| 63 |
+

|
| 64 |
+
|
| 65 |
+
### Option 1: One-Click Demo (Recommended)
|
| 66 |
+
[](https://huggingface.co/spaces/YOUR_USERNAME/i18n-agent)
|
| 67 |
+
|
| 68 |
+
✅ **No setup required!** Just click and start translating.
|
| 69 |
+
✅ **Prerequisites:** You need an `Anthropic API key` and a `GitHub token`.
|
| 70 |
+
|
| 71 |
+
### Option 2: Run Locally
|
| 72 |
+
|
| 73 |
+
<details>
|
| 74 |
+
<summary>🛠️ Local Installation Guide</summary>
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
# Clone the repo
|
| 78 |
+
git clone https://github.com/Hugging-Face-KREW/i18n-agent.git
|
| 79 |
+
cd i18n-agent
|
| 80 |
+
|
| 81 |
+
# Install dependencies
|
| 82 |
+
make install
|
| 83 |
+
source .venv/bin/activate
|
| 84 |
+
|
| 85 |
+
# Set up your keys
|
| 86 |
+
cp example.env .env
|
| 87 |
+
# Add your Anthropic API key and GitHub token
|
| 88 |
+
|
| 89 |
+
# Launch the app
|
| 90 |
+
python app.py
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
</details>
|
| 94 |
+
|
| 95 |
+
## 🎯 How It Works
|
| 96 |
+
|
| 97 |
+
This agent specifically targets the [Hugging Face Transformers documentation](https://huggingface.co/docs/transformers/en/index) and submits PRs addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179) in the [huggingface/transformers](https://github.com/huggingface/transformers) repository.
|
| 98 |
+
|
| 99 |
+
```mermaid
|
| 100 |
+
graph LR
|
| 101 |
+
A[🔍 Find Files] --> B[🤖 Translate] --> C[📝 Create PR]
|
| 102 |
+
|
| 103 |
+
A --> A1[Scan transformers docs]
|
| 104 |
+
A --> A2[Identify translation gaps]
|
| 105 |
+
|
| 106 |
+
B --> B1[Claude AI translation]
|
| 107 |
+
B --> B2[Preserve formatting]
|
| 108 |
+
|
| 109 |
+
C --> C1[Auto-branch creation]
|
| 110 |
+
C --> C2[Submit to huggingface/transformers]
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
### Step 1: 🔍 Discover Translation Targets
|
| 114 |
+
- Select your target language (Korean, and more languages will be supported)
|
| 115 |
+
- Set how many files to process
|
| 116 |
+
- Let AI identify the most impactful Transformers docs translations
|
| 117 |
+
|
| 118 |
+
### Step 2: 🤖 Smart Translation
|
| 119 |
+
- Enter your Anthropic API key
|
| 120 |
+
- Claude Sonnet 4 translates with context awareness
|
| 121 |
+
- Technical terms and code blocks preserved automatically
|
| 122 |
+
|
| 123 |
+
### Step 3: 📝 Automated PR Creation
|
| 124 |
+
- Configure GitHub credentials
|
| 125 |
+
- System creates properly formatted pull requests for [huggingface/transformers](https://github.com/huggingface/transformers)
|
| 126 |
+
- Optional: Use reference PRs for consistency
|
| 127 |
+
|
| 128 |
+
## 🌍 Supported Languages
|
| 129 |
+
|
| 130 |
+
<div align="center">
|
| 131 |
+
|
| 132 |
+
| Language | Code | Status |
|
| 133 |
+
|----------|------|--------|
|
| 134 |
+
| 🇰🇷 Korean | `ko` | ✅ Fully Supported |
|
| 135 |
+
|
| 136 |
+
*And more languages coming soon...*
|
| 137 |
+
|
| 138 |
+
</div>
|
| 139 |
+
|
| 140 |
+
## 🏗️ Architecture
|
| 141 |
+
|
| 142 |
+
<div align="center">
|
| 143 |
+
<a href="images/workflow.png" target="_blank">
|
| 144 |
+
<img src="images/workflow.png" alt="i18n agent workflow diagram" width="400">
|
| 145 |
+
</a>
|
| 146 |
+
</div>
|
| 147 |
+
|
| 148 |
+
<details>
|
| 149 |
+
|
| 150 |
+
<summary>📊 System Design Overview</summary>
|
| 151 |
+
|
| 152 |
+
**Frontend Layer**
|
| 153 |
+
- Gradio web interface with modern styling
|
| 154 |
+
- Real-time chat & quick controls with AI agent
|
| 155 |
+
- Progress tracking missing
|
| 156 |
+
|
| 157 |
+
**AI Processing Layer**
|
| 158 |
+
- File discovery with intelligent prioritization for Transformers docs
|
| 159 |
+
- Claude Sonnet 4 for context-aware translation
|
| 160 |
+
- LangChain integration for PR research
|
| 161 |
+
|
| 162 |
+
**Integration Layer**
|
| 163 |
+
- GitHub API for automated PR creation to [huggingface/transformers](https://github.com/huggingface/transformers)
|
| 164 |
+
- Branch management and commit structuring
|
| 165 |
+
- Template matching from reference PRs
|
| 166 |
+
|
| 167 |
+
</details>
|
| 168 |
+
|
| 169 |
+
## ⚙️ Configuration
|
| 170 |
+
|
| 171 |
+
### For Spaces Deployment
|
| 172 |
+
Prepare these secrets:
|
| 173 |
+
|
| 174 |
+
```bash
|
| 175 |
+
ANTHROPIC_API_KEY=your_claude_api_key
|
| 176 |
+
GITHUB_TOKEN=your_github_token
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
### For Local Development
|
| 180 |
+
Create `.env` file:
|
| 181 |
+
|
| 182 |
+
```bash
|
| 183 |
+
ANTHROPIC_API_KEY=<your api key>
|
| 184 |
+
|
| 185 |
+
# GitHub PR Agent Configuration
|
| 186 |
+
GITHUB_TOKEN=<your github token>
|
| 187 |
+
GITHUB_OWNER=<your github username>
|
| 188 |
+
GITHUB_REPO=<your repository name>
|
| 189 |
+
REFERENCE_PR_URL=<reference pr url for style analysis>
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
## 🤝 Contributing
|
| 193 |
+
|
| 194 |
+
<div align="center">
|
| 195 |
+
|
| 196 |
+
**Love this project? Here's how you can help:**
|
| 197 |
+
|
| 198 |
+
[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 199 |
+
[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 200 |
+
[](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 201 |
+
|
| 202 |
+
</div>
|
| 203 |
+
|
| 204 |
+
### 👥 Contributors
|
| 205 |
+
|
| 206 |
+
🤗 [email protected] / @harheem
|
| 207 |
+
🤗 [email protected] / @Jwaminju
|
| 208 |
+
|
| 209 |
+
## 💡 Use Cases
|
| 210 |
+
|
| 211 |
+
> **🌟 Real-world scenarios where this agent shines:**
|
| 212 |
+
|
| 213 |
+
- **📚 Documentation Teams**: Batch translate Transformers documentation updates
|
| 214 |
+
- **🌍 Community Contributors**: Help make Transformers accessible in your language
|
| 215 |
+
- **🏢 Organizations**: Streamline i18n workflows for Transformers library
|
| 216 |
+
- **👨‍💻 Developers**: Contribute Transformers translations without manual GitHub workflow
|
| 217 |
+
- **🎯 Issue #20179 Contributors**: Directly address the internationalization challenges raised by the community
|
| 218 |
+
|
| 219 |
+
## 🛠️ Tech Stack
|
| 220 |
+
|
| 221 |
+
<div align="center">
|
| 222 |
+
|
| 223 |
+

|
| 224 |
+

|
| 225 |
+

|
| 226 |
+

|
| 227 |
+

|
| 228 |
+
|
| 229 |
+
</div>
|
| 230 |
+
|
| 231 |
+
## ❓ FAQ
|
| 232 |
+
|
| 233 |
+
<details>
|
| 234 |
+
<summary><strong>Q: How does this relate to Issue #20179?</strong></summary>
|
| 235 |
+
<br>
|
| 236 |
+
This agent directly addresses the pain points raised in <a href="https://github.com/huggingface/transformers/issues/20179">Issue #20179</a> by automating the translation workflow, reducing manual overhead, and making it easier for contributors to submit high-quality translations.
|
| 237 |
+
</details>
|
| 238 |
+
|
| 239 |
+
<details>
|
| 240 |
+
<summary><strong>Q: How accurate are the translations?</strong></summary>
|
| 241 |
+
<br>
|
| 242 |
+
The agent uses Claude Sonnet 4, which provides high-quality translations with technical context awareness. It preserves code blocks, maintains formatting, and follows established translation patterns.
|
| 243 |
+
</details>
|
| 244 |
+
|
| 245 |
+
<details>
|
| 246 |
+
<summary><strong>Q: What permissions do I need for GitHub integration?</strong></summary>
|
| 247 |
+
<br>
|
| 248 |
+
Your GitHub token needs repository read/write permissions and the ability to create branches and pull requests on the target repository.
|
| 249 |
+
</details>
|
| 250 |
+
|
| 251 |
+
<details>
|
| 252 |
+
<summary><strong>Q: Can I customize the translation style?</strong></summary>
|
| 253 |
+
<br>
|
| 254 |
+
Yes! You can provide reference PR URLs to match existing translation patterns and maintain consistency with community standards.
|
| 255 |
+
</details>
|
| 256 |
+
|
| 257 |
+
## 🐛 Troubleshooting
|
| 258 |
+
|
| 259 |
+
### Common Issues
|
| 260 |
+
|
| 261 |
+
<details>
|
| 262 |
+
<summary><strong>API Key Issues</strong></summary>
|
| 263 |
+
|
| 264 |
+
- Ensure your Anthropic API key is valid and has sufficient credits
|
| 265 |
+
- Check that your GitHub token has the necessary repository permissions
|
| 266 |
+
|
| 267 |
+
</details>
|
| 268 |
+
|
| 269 |
+
<details>
|
| 270 |
+
<summary><strong>Translation Quality</strong></summary>
|
| 271 |
+
|
| 272 |
+
- The system uses Claude Sonnet 4 for high-quality translations
|
| 273 |
+
- Formatting and markdown structure are maintained
|
| 274 |
+
- Please restart the translation if you encounter a formatting issue
|
| 275 |
+
|
| 276 |
+
</details>
|
| 277 |
+
|
| 278 |
+
<details>
|
| 279 |
+
<summary><strong>GitHub PR Creation</strong></summary>
|
| 280 |
+
|
| 281 |
+
- Verify repository permissions and branch protection rules
|
| 282 |
+
- Check that the reference PR URL is accessible and valid
|
| 283 |
+
|
| 284 |
+
</details>
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
## 🙏 Acknowledgments
|
| 288 |
+
|
| 289 |
+
Special thanks to the amazing communities that make this possible:
|
| 290 |
+
|
| 291 |
+
- **🤗 Hugging Face** - For building the Transformers library and comprehensive documentation
|
| 292 |
+
- **🎭 Anthropic** - For Claude's incredible language capabilities
|
| 293 |
+
- **👥 Hugging Face KREW Community** - For championing Korean AI translation
|
| 294 |
+
- **🎨 Gradio** - For making beautiful AI interfaces simple
|
| 295 |
+
- **🌍 Community Contributors** - For raising awareness through [Issue #20179](https://github.com/huggingface/transformers/issues/20179)
|
| 296 |
+
|
| 297 |
+
---
|
| 298 |
+
|
| 299 |
+
<div align="center">
|
| 300 |
+
|
| 301 |
+
**Made with ❤️ for global accessibility of Hugging Face Transformers documentation.**
|
| 302 |
+
|
| 303 |
+
**🎯 Solving [Issue #20179](https://github.com/huggingface/transformers/issues/20179) one translation at a time.**
|
| 304 |
+
|
| 305 |
+
[⭐ Star this repo](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [🐛 Report Bug](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [💡 Request Feature](https://github.com/Hugging-Face-KREW/i18n-agent.git)
|
| 306 |
+
|
| 307 |
+
</div>
|
agent/handler.py
CHANGED
|
@@ -1,639 +1,639 @@
|
|
| 1 |
-
"""Module for gradio chat-based translation agent interface."""
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
from pathlib import Path
|
| 6 |
-
|
| 7 |
-
import gradio as gr
|
| 8 |
-
|
| 9 |
-
from agent.workflow import (
|
| 10 |
-
report_translation_target_files,
|
| 11 |
-
translate_docs_interactive,
|
| 12 |
-
generate_github_pr,
|
| 13 |
-
)
|
| 14 |
-
from pr_generator.searcher import find_reference_pr_simple_stream
|
| 15 |
-
from translator.content import get_full_prompt, get_content, preprocess_content
|
| 16 |
-
from translator.project_config import get_available_projects, get_project_config
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
# State management
|
| 20 |
-
class ChatState:
|
| 21 |
-
def __init__(self):
|
| 22 |
-
self.step = "welcome" # welcome -> find_files -> translate -> create_github_pr
|
| 23 |
-
|
| 24 |
-
# Transient state (reset on restart)
|
| 25 |
-
self.selected_project = "transformers"
|
| 26 |
-
self.target_language = "ko"
|
| 27 |
-
self.k_files = 10
|
| 28 |
-
self.files_to_translate = []
|
| 29 |
-
self.additional_instruction = ""
|
| 30 |
-
self.current_file_content = {"translated": ""}
|
| 31 |
-
self.pr_result = None
|
| 32 |
-
|
| 33 |
-
# Persistent settings (preserved across restarts)
|
| 34 |
-
self.persistent_settings = {
|
| 35 |
-
"anthropic_api_key": "",
|
| 36 |
-
"aws_bearer_token_bedrock": "",
|
| 37 |
-
"github_config": {
|
| 38 |
-
"token": "",
|
| 39 |
-
"owner": "",
|
| 40 |
-
"repo_name": "",
|
| 41 |
-
"reference_pr_url": "",
|
| 42 |
-
}
|
| 43 |
-
}
|
| 44 |
-
|
| 45 |
-
def reset_transient_state(self):
|
| 46 |
-
"""Reset only the workflow state, keep persistent settings"""
|
| 47 |
-
self.step = "welcome"
|
| 48 |
-
self.selected_project = "transformers"
|
| 49 |
-
self.target_language = "ko"
|
| 50 |
-
self.k_files = 10
|
| 51 |
-
self.files_to_translate = []
|
| 52 |
-
self.additional_instruction = ""
|
| 53 |
-
self.current_file_content = {"translated": ""}
|
| 54 |
-
self.pr_result = None
|
| 55 |
-
|
| 56 |
-
@property
|
| 57 |
-
def github_config(self):
|
| 58 |
-
return self.persistent_settings["github_config"]
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
state = ChatState()
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def _extract_content_for_display(content: str) -> str:
|
| 65 |
-
"""Extract text from document for display."""
|
| 66 |
-
# Remove Copyright header
|
| 67 |
-
to_translate = re.sub(r"<!--.*?-->", "", content, count=1, flags=re.DOTALL)
|
| 68 |
-
to_translate = to_translate.strip()
|
| 69 |
-
## remove code blocks from text
|
| 70 |
-
to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
| 71 |
-
## remove markdown tables from text
|
| 72 |
-
to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
| 73 |
-
## remove empty lines from text
|
| 74 |
-
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
| 75 |
-
|
| 76 |
-
return to_translate
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def get_welcome_message():
|
| 80 |
-
"""Initial welcome message with project selection"""
|
| 81 |
-
return """**👋 Welcome to 🌐 Hugging Face i18n Translation Agent!**
|
| 82 |
-
|
| 83 |
-
I'll help you find files that need translation and translate them in a streamlined workflow.
|
| 84 |
-
|
| 85 |
-
**🎯 First, select which project you want to translate:**
|
| 86 |
-
|
| 87 |
-
Use the **`Quick Controls`** on the right to select a project, or **ask me `what`, `how`, or `help`** to get started.
|
| 88 |
-
"""
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
def process_file_search_handler(project: str, lang: str, k: int, history: list) -> tuple:
|
| 92 |
-
"""Process file search request and update Gradio UI components."""
|
| 93 |
-
global state
|
| 94 |
-
state.selected_project = project
|
| 95 |
-
state.target_language = lang
|
| 96 |
-
state.k_files = k
|
| 97 |
-
state.step = "find_files"
|
| 98 |
-
|
| 99 |
-
try:
|
| 100 |
-
status_report, files_list = report_translation_target_files(project, lang, k)
|
| 101 |
-
except Exception as e:
|
| 102 |
-
if "rate limit" in str(e).lower():
|
| 103 |
-
response = f"""❌ **GitHub API Rate Limit Exceeded**
|
| 104 |
-
|
| 105 |
-
{str(e)}
|
| 106 |
-
|
| 107 |
-
**💡 To fix this:**
|
| 108 |
-
1. Set GitHub Token in Configuration panel above
|
| 109 |
-
2. Click "💾 Save Configuration"
|
| 110 |
-
3. Try "Find Files" again"""
|
| 111 |
-
history.append(["File search request", response])
|
| 112 |
-
return history, "", update_status(), gr.Tabs(selected=0), gr.update(choices=[]), gr.update(visible=False)
|
| 113 |
-
else:
|
| 114 |
-
raise # Re-raise non-rate-limit errors
|
| 115 |
-
state.files_to_translate = (
|
| 116 |
-
[file[0] for file in files_list]
|
| 117 |
-
if files_list
|
| 118 |
-
else []
|
| 119 |
-
)
|
| 120 |
-
|
| 121 |
-
response = f"""**✅ File search completed!**
|
| 122 |
-
|
| 123 |
-
**Status Report:**
|
| 124 |
-
{status_report}
|
| 125 |
-
|
| 126 |
-
**📁 Found first {len(state.files_to_translate)} files to translate:**
|
| 127 |
-
"""
|
| 128 |
-
|
| 129 |
-
if state.files_to_translate:
|
| 130 |
-
config = get_project_config(state.selected_project)
|
| 131 |
-
for i, file in enumerate(state.files_to_translate, 1):
|
| 132 |
-
file_link = f"{config.repo_url}/blob/main/{file}"
|
| 133 |
-
response += f"\n{i}. [`{file}`]({file_link})"
|
| 134 |
-
|
| 135 |
-
# if len(state.files_to_translate) > 5:
|
| 136 |
-
# response += f"\n... and {len(state.files_to_translate) - 5} more files"
|
| 137 |
-
|
| 138 |
-
response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
|
| 139 |
-
else:
|
| 140 |
-
response += "\nNo files found that need translation."
|
| 141 |
-
|
| 142 |
-
# Add to history
|
| 143 |
-
history.append(["Please find files that need translation", response])
|
| 144 |
-
cleared_input = ""
|
| 145 |
-
|
| 146 |
-
# Also return the file list to use as the dropdown choices
|
| 147 |
-
return (
|
| 148 |
-
history,
|
| 149 |
-
cleared_input,
|
| 150 |
-
update_status(),
|
| 151 |
-
gr.Tabs(), # Don't change tab
|
| 152 |
-
update_dropdown_choices(state.files_to_translate),
|
| 153 |
-
)
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
def update_dropdown_choices(file_list):
|
| 157 |
-
return gr.update(choices=file_list, value=None)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
def confirm_and_go_translate_handler(history):
|
| 161 |
-
"""Confirm selection and go to translate tab"""
|
| 162 |
-
global state
|
| 163 |
-
|
| 164 |
-
response = f"✅ **Selection confirmed!**\n\n🎯 **Project:** {state.selected_project}\n🌍 **Language:** {state.target_language}\n\n**➡️ Go to Tab 2 to start translation.**"
|
| 165 |
-
history.append(["Confirm selection", response])
|
| 166 |
-
return history, "", update_status(), gr.Tabs(selected=1)
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
def confirm_translation_and_go_upload_handler(history):
|
| 170 |
-
"""Confirm translation and go to upload PR tab"""
|
| 171 |
-
global state
|
| 172 |
-
|
| 173 |
-
if not state.current_file_content.get("translated"):
|
| 174 |
-
response = "❌ No translation available. Please complete translation first."
|
| 175 |
-
history.append(["Upload PR request", response])
|
| 176 |
-
return history, "", update_status(), gr.Tabs()
|
| 177 |
-
|
| 178 |
-
response = f"✅ **Translation confirmed!**\n\n📄 **File:** `{state.files_to_translate[0] if state.files_to_translate else 'Unknown'}`\n\n**➡️ Go to Tab 3 to upload PR.**"
|
| 179 |
-
history.append(["Upload PR request", response])
|
| 180 |
-
return history, "", update_status(), gr.Tabs(selected=2)
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
def start_translation_process(force_retranslate=False):
|
| 184 |
-
"""Start the translation process for the first file"""
|
| 185 |
-
if not state.files_to_translate:
|
| 186 |
-
return "❌ No files available for translation.", ""
|
| 187 |
-
|
| 188 |
-
current_file = state.files_to_translate[0]
|
| 189 |
-
|
| 190 |
-
# Call translation function (simplified for demo)
|
| 191 |
-
try:
|
| 192 |
-
status, translated = translate_docs_interactive(
|
| 193 |
-
state.target_language, [[current_file]], state.additional_instruction, state.selected_project, force_retranslate
|
| 194 |
-
)
|
| 195 |
-
|
| 196 |
-
state.current_file_content = {"translated": translated}
|
| 197 |
-
path = (
|
| 198 |
-
Path(__file__).resolve().parent.parent
|
| 199 |
-
/ f"translation_result/{current_file}"
|
| 200 |
-
)
|
| 201 |
-
p = Path(path)
|
| 202 |
-
p.parent.mkdir(parents=True, exist_ok=True)
|
| 203 |
-
p.write_text(translated, encoding="utf-8")
|
| 204 |
-
|
| 205 |
-
config = get_project_config(state.selected_project)
|
| 206 |
-
original_file_link = f"{config.repo_url}/blob/main/{current_file}"
|
| 207 |
-
print("Compeleted translation:\n")
|
| 208 |
-
print(translated)
|
| 209 |
-
print("----------------------------")
|
| 210 |
-
|
| 211 |
-
# Different response format for existing vs new translation
|
| 212 |
-
if isinstance(status, str) and "Existing translation loaded" in status:
|
| 213 |
-
response = f"{status}\n**📄 Original Content Link:** {original_file_link}\n\n**🌐 Translated Content:**"
|
| 214 |
-
else:
|
| 215 |
-
response = (
|
| 216 |
-
f"""🔄 Translation for: `{current_file}`\n"""
|
| 217 |
-
f"**📄 Original Content Link:** {original_file_link}\n\n"
|
| 218 |
-
f"{status}\n\n"
|
| 219 |
-
"**🌐 Translated Content:**"
|
| 220 |
-
)
|
| 221 |
-
return response, translated
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
except Exception as e:
|
| 225 |
-
response = f"❌ Translation failed: {str(e)}"
|
| 226 |
-
response += "\n**➡️ Please try from the beginning.**"
|
| 227 |
-
return response, ""
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
def handle_general_message(message):
|
| 231 |
-
"""Handle general messages"""
|
| 232 |
-
message_lower = message.lower()
|
| 233 |
-
|
| 234 |
-
if any(word in message_lower for word in ["help", "what", "how"]):
|
| 235 |
-
return """**🤖 I'm your Hugging Face i18n Translation Agent!**
|
| 236 |
-
|
| 237 |
-
I can help you:
|
| 238 |
-
1. **🔍 Find files** that need translation
|
| 239 |
-
2. **🌐 Translate documents** using AI
|
| 240 |
-
3. **📋 Review translations** for quality
|
| 241 |
-
4. **🚀 Create GitHub PR** for translation
|
| 242 |
-
|
| 243 |
-
Currently available actions with quick controls:
|
| 244 |
-
- "find files" - Search for files needing translation
|
| 245 |
-
- "translate" - Start translation process
|
| 246 |
-
- "review" - Review current translation
|
| 247 |
-
- "github" - Create GitHub Pull Request
|
| 248 |
-
- "restart" - Start over"""
|
| 249 |
-
|
| 250 |
-
elif "restart" in message_lower:
|
| 251 |
-
global state
|
| 252 |
-
state = ChatState()
|
| 253 |
-
return get_welcome_message()
|
| 254 |
-
|
| 255 |
-
else:
|
| 256 |
-
return """I understand you want to work on translations!
|
| 257 |
-
|
| 258 |
-
**Two ways to get started:**
|
| 259 |
-
|
| 260 |
-
1. **🔍 Find Files first** - Use Tab 1 to discover files that need translation
|
| 261 |
-
2. **🚀 Direct Translation** - Go to Tab 2 and enter a file path directly (e.g., `docs/source/en/model_doc/bert.md`)
|
| 262 |
-
|
| 263 |
-
Make sure to configure your API keys in the Configuration panel above.
|
| 264 |
-
"""
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
# Main handler
|
| 268 |
-
def handle_user_message(message, history):
|
| 269 |
-
"""Handle user messages and provide appropriate responses"""
|
| 270 |
-
global state
|
| 271 |
-
|
| 272 |
-
if not message.strip():
|
| 273 |
-
return history, ""
|
| 274 |
-
|
| 275 |
-
elif state.step == "find_files" and any(
|
| 276 |
-
word in message.lower()
|
| 277 |
-
for word in ["yes", "proceed", "start", "translate", "translation"]
|
| 278 |
-
):
|
| 279 |
-
# User wants to start translation
|
| 280 |
-
if state.files_to_translate:
|
| 281 |
-
state.step = "translate"
|
| 282 |
-
response, translated = start_translation_process()
|
| 283 |
-
history.append([message, response])
|
| 284 |
-
history.append(["", translated])
|
| 285 |
-
return history, ""
|
| 286 |
-
else:
|
| 287 |
-
response = (
|
| 288 |
-
"❌ No files available for translation. Please search for files first."
|
| 289 |
-
)
|
| 290 |
-
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
| 291 |
-
else:
|
| 292 |
-
# General response
|
| 293 |
-
response = handle_general_message(message)
|
| 294 |
-
|
| 295 |
-
history.append([message, response])
|
| 296 |
-
return history, ""
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
def update_status():
|
| 300 |
-
if state.step == "welcome":
|
| 301 |
-
return f"""
|
| 302 |
-
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
|
| 303 |
-
<div><strong>🔄 Step:</strong> Welcome</div>
|
| 304 |
-
<div><strong>🎯 Project:</strong> {state.selected_project}</div>
|
| 305 |
-
<div><strong>📁 Files:</strong> 0</div>
|
| 306 |
-
<div><strong>🌍 Language:</strong> {state.target_language}</div>
|
| 307 |
-
</div>
|
| 308 |
-
"""
|
| 309 |
-
|
| 310 |
-
step_map = {
|
| 311 |
-
"welcome": "Welcome",
|
| 312 |
-
"find_files": "Finding Files",
|
| 313 |
-
"translate": "Translating",
|
| 314 |
-
"review": "Reviewing",
|
| 315 |
-
"create_github_pr": "Creating PR",
|
| 316 |
-
}
|
| 317 |
-
|
| 318 |
-
progress_map = {
|
| 319 |
-
"welcome": "Ready to start",
|
| 320 |
-
"find_files": "Files found",
|
| 321 |
-
"translate": f"{len(state.files_to_translate)} remaining",
|
| 322 |
-
"review": "Review complete",
|
| 323 |
-
"create_github_pr": "PR generation in progress",
|
| 324 |
-
}
|
| 325 |
-
|
| 326 |
-
# Check GitHub configuration status
|
| 327 |
-
github_status = "❌ Not configured"
|
| 328 |
-
if all(
|
| 329 |
-
[
|
| 330 |
-
state.github_config["token"],
|
| 331 |
-
state.github_config["owner"],
|
| 332 |
-
state.github_config["repo_name"],
|
| 333 |
-
]
|
| 334 |
-
):
|
| 335 |
-
github_status = (
|
| 336 |
-
f"✅ {state.github_config['owner']}/{state.github_config['repo_name']}"
|
| 337 |
-
)
|
| 338 |
-
|
| 339 |
-
status_html = f"""
|
| 340 |
-
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
|
| 341 |
-
<div><strong>🔄 Step:</strong> {step_map.get(state.step, state.step)}</div>
|
| 342 |
-
<div><strong>🎯 Project:</strong> {state.selected_project}</div>
|
| 343 |
-
<div><strong>📁 Files:</strong> {len(state.files_to_translate)}</div>
|
| 344 |
-
<div><strong>🌍 Language:</strong> {state.target_language}</div>
|
| 345 |
-
<div><strong>⏳ Progress:</strong> {progress_map.get(state.step, 'In progress')}</div>
|
| 346 |
-
<div><strong>🔧 GitHub:</strong> {github_status}</div>
|
| 347 |
-
</div>
|
| 348 |
-
"""
|
| 349 |
-
|
| 350 |
-
return status_html
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
# Event handlers
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
def sync_language_displays(lang):
|
| 357 |
-
return lang
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
def update_project_selection(project, history):
|
| 361 |
-
"""Update state when project is selected"""
|
| 362 |
-
global state
|
| 363 |
-
state.selected_project = project
|
| 364 |
-
response = f"Selection confirmed: 🎯 Project → **{project}**"
|
| 365 |
-
history.append(["Project selection", response])
|
| 366 |
-
return history, "", update_status()
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
def update_language_selection(lang, history):
|
| 370 |
-
"""Update state when language is selected"""
|
| 371 |
-
global state
|
| 372 |
-
state.target_language = lang
|
| 373 |
-
response = f"Selection confirmed: 🌍 Language → **{lang}**"
|
| 374 |
-
history.append(["Language selection", response])
|
| 375 |
-
return history, "", update_status(), lang
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
def update_persistent_config(api_provider, anthropic_key, aws_bearer_token_bedrock, github_token, github_owner, github_repo, reference_pr_url, history):
|
| 379 |
-
"""Update persistent configuration settings."""
|
| 380 |
-
global state
|
| 381 |
-
|
| 382 |
-
# Update API keys based on provider selection
|
| 383 |
-
if api_provider == "Anthropic":
|
| 384 |
-
state.persistent_settings["anthropic_api_key"] = anthropic_key
|
| 385 |
-
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
| 386 |
-
# Clear AWS Bedrock token if Anthropic is selected
|
| 387 |
-
state.persistent_settings["aws_bearer_token_bedrock"] = ""
|
| 388 |
-
os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
|
| 389 |
-
elif api_provider == "AWS Bedrock":
|
| 390 |
-
state.persistent_settings["aws_bearer_token_bedrock"] = aws_bearer_token_bedrock
|
| 391 |
-
os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
|
| 392 |
-
# Clear Anthropic key if AWS Bedrock is selected
|
| 393 |
-
state.persistent_settings["anthropic_api_key"] = ""
|
| 394 |
-
os.environ.pop("ANTHROPIC_API_KEY", None)
|
| 395 |
-
else:
|
| 396 |
-
# If no provider is selected or unknown, clear both
|
| 397 |
-
state.persistent_settings["anthropic_api_key"] = ""
|
| 398 |
-
os.environ.pop("ANTHROPIC_API_KEY", None)
|
| 399 |
-
state.persistent_settings["aws_bearer_token_bedrock"] = ""
|
| 400 |
-
os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
|
| 401 |
-
|
| 402 |
-
if github_token:
|
| 403 |
-
os.environ["GITHUB_TOKEN"] = github_token
|
| 404 |
-
|
| 405 |
-
# Get default reference PR URL from project config if not provided
|
| 406 |
-
if not reference_pr_url and state.selected_project:
|
| 407 |
-
try:
|
| 408 |
-
config = get_project_config(state.selected_project)
|
| 409 |
-
reference_pr_url = config.reference_pr_url
|
| 410 |
-
except:
|
| 411 |
-
pass
|
| 412 |
-
|
| 413 |
-
# Save GitHub configuration to persistent settings
|
| 414 |
-
state.persistent_settings["github_config"].update({
|
| 415 |
-
"token": github_token or "",
|
| 416 |
-
"owner": github_owner or "",
|
| 417 |
-
"repo_name": github_repo or "",
|
| 418 |
-
"reference_pr_url": reference_pr_url or "",
|
| 419 |
-
})
|
| 420 |
-
|
| 421 |
-
# Build response message based on what was configured
|
| 422 |
-
response = "✅ Configuration saved!"
|
| 423 |
-
if github_owner and github_repo:
|
| 424 |
-
response += f" GitHub: {github_owner}/{github_repo}"
|
| 425 |
-
|
| 426 |
-
if api_provider == "Anthropic" and anthropic_key:
|
| 427 |
-
response += " Anthropic API key updated."
|
| 428 |
-
elif api_provider == "AWS Bedrock" and aws_bearer_token_bedrock:
|
| 429 |
-
response += " AWS Bedrock Bearer Token updated."
|
| 430 |
-
|
| 431 |
-
history.append(["Configuration update", response])
|
| 432 |
-
return history, "", update_status()
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
def update_github_config(token, owner, repo, reference_pr_url):
|
| 436 |
-
"""Legacy function for backward compatibility."""
|
| 437 |
-
return update_persistent_config("", token, owner, repo, reference_pr_url)
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
def update_prompt_preview(language, file_path, additional_instruction):
|
| 441 |
-
"""Update prompt preview based on current settings"""
|
| 442 |
-
if not file_path.strip():
|
| 443 |
-
return "Select a file to see the prompt preview..."
|
| 444 |
-
|
| 445 |
-
try:
|
| 446 |
-
# Get language name
|
| 447 |
-
if language == "ko":
|
| 448 |
-
translation_lang = "Korean"
|
| 449 |
-
else:
|
| 450 |
-
translation_lang = language
|
| 451 |
-
|
| 452 |
-
# Get sample content (first 500 characters)
|
| 453 |
-
content = get_content(file_path, state.selected_project)
|
| 454 |
-
to_translate = preprocess_content(content)
|
| 455 |
-
|
| 456 |
-
# Truncate for preview
|
| 457 |
-
sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")
|
| 458 |
-
|
| 459 |
-
# Generate prompt
|
| 460 |
-
prompt = get_full_prompt(translation_lang, sample_content, additional_instruction)
|
| 461 |
-
|
| 462 |
-
return prompt
|
| 463 |
-
except Exception as e:
|
| 464 |
-
error_str = str(e)
|
| 465 |
-
if "Failed to retrieve content from the URL" in error_str:
|
| 466 |
-
return f"❌ **File not found:** `{file_path}`\n\n💡 **Please check:**\n1. Is this file in the **{state.selected_project}** project?\n2. Use \"🔍 Find Files to Translate\" to see available files\n3. Verify the file path is correct"
|
| 467 |
-
return f"Error generating prompt preview: {error_str}"
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
def send_message(message, history):
|
| 471 |
-
new_history, cleared_input = handle_user_message(message, history)
|
| 472 |
-
return new_history, cleared_input, update_status()
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
# Button handlers with tab switching
|
| 476 |
-
def start_translate_handler(history, file_to_translate, additional_instruction="", force_retranslate=False):
|
| 477 |
-
# Use persistent anthropic key
|
| 478 |
-
anthropic_key = state.persistent_settings["anthropic_api_key"]
|
| 479 |
-
aws_bearer_token_bedrock = state.persistent_settings["aws_bearer_token_bedrock"]
|
| 480 |
-
|
| 481 |
-
if not anthropic_key and not aws_bearer_token_bedrock:
|
| 482 |
-
response = "❌ Please set either Anthropic API key or AWS Bearer Token for Bedrock in Configuration panel first."
|
| 483 |
-
history.append(["Translation request", response])
|
| 484 |
-
return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
|
| 485 |
-
|
| 486 |
-
# Set the active API key to environment variable for translator.content.py
|
| 487 |
-
if anthropic_key:
|
| 488 |
-
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
| 489 |
-
os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None) # Ensure only one is active
|
| 490 |
-
elif aws_bearer_token_bedrock:
|
| 491 |
-
os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
|
| 492 |
-
os.environ.pop("ANTHROPIC_API_KEY", None) # Ensure only one is active
|
| 493 |
-
|
| 494 |
-
# Check if file path is provided
|
| 495 |
-
if not file_to_translate or not file_to_translate.strip():
|
| 496 |
-
response = "❌ Please select a file from the dropdown or enter a file path to translate."
|
| 497 |
-
history.append(["Translation request", response])
|
| 498 |
-
return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
|
| 499 |
-
|
| 500 |
-
state.additional_instruction = additional_instruction
|
| 501 |
-
state.files_to_translate = [file_to_translate]
|
| 502 |
-
state.step = "translate"
|
| 503 |
-
|
| 504 |
-
# Start translation directly
|
| 505 |
-
if force_retranslate:
|
| 506 |
-
history.append(["Translation request", "🔄 **Force retranslation started...**"])
|
| 507 |
-
response, translated = start_translation_process(force_retranslate)
|
| 508 |
-
history.append(["", response])
|
| 509 |
-
if translated:
|
| 510 |
-
history.append(["", translated])
|
| 511 |
-
|
| 512 |
-
# Update button text and show confirm button after translation
|
| 513 |
-
start_btn_text = "🔄 Retranslation" if state.current_file_content["translated"] else "🚀 Start Translation"
|
| 514 |
-
confirm_btn_visible = bool(state.current_file_content["translated"])
|
| 515 |
-
|
| 516 |
-
return history, "", update_status(), gr.Tabs(), gr.update(value=start_btn_text), gr.update(visible=confirm_btn_visible)
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
def approve_handler(history, owner, repo, reference_pr_url):
|
| 520 |
-
"""Handles the request to generate a GitHub PR."""
|
| 521 |
-
global state
|
| 522 |
-
state.step = "create_github_pr"
|
| 523 |
-
|
| 524 |
-
# Check all required GitHub configuration at once
|
| 525 |
-
github_config = state.persistent_settings["github_config"]
|
| 526 |
-
missing_config = []
|
| 527 |
-
|
| 528 |
-
if not github_config.get("token"):
|
| 529 |
-
missing_config.append("GitHub Token")
|
| 530 |
-
if not owner:
|
| 531 |
-
missing_config.append("GitHub Owner")
|
| 532 |
-
if not repo:
|
| 533 |
-
missing_config.append("Repository Name")
|
| 534 |
-
|
| 535 |
-
if missing_config:
|
| 536 |
-
config = get_project_config(state.selected_project)
|
| 537 |
-
repo_name = config.repo_url.split('/')[-1] # Extract repo name from URL
|
| 538 |
-
response = f"❌ Please set the following in Configuration panel first: {', '.join(missing_config)}\n\n💡 **Note:** GitHub Owner/Repository should be your fork of [`{repo_name}`]({config.repo_url}) (e.g., Owner: `your-username`, Repository: `{repo_name}`)"
|
| 539 |
-
history.append(["GitHub PR creation request", response])
|
| 540 |
-
return history, "", update_status()
|
| 541 |
-
|
| 542 |
-
# Update reference PR URL (can be set per PR)
|
| 543 |
-
if reference_pr_url:
|
| 544 |
-
state.persistent_settings["github_config"]["reference_pr_url"] = reference_pr_url
|
| 545 |
-
|
| 546 |
-
# Use persistent settings
|
| 547 |
-
github_config = state.persistent_settings["github_config"]
|
| 548 |
-
|
| 549 |
-
# Initialize response variable
|
| 550 |
-
response = ""
|
| 551 |
-
|
| 552 |
-
# If reference PR is not provided, use the agent to find one
|
| 553 |
-
if not github_config.get("reference_pr_url"):
|
| 554 |
-
response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
|
| 555 |
-
try:
|
| 556 |
-
# This part is simplified to avoid streaming logic in a non-generator function
|
| 557 |
-
stream_gen = find_reference_pr_simple_stream(
|
| 558 |
-
target_language=state.target_language,
|
| 559 |
-
context="documentation translation",
|
| 560 |
-
)
|
| 561 |
-
# We will just get the final result from the generator
|
| 562 |
-
final_result = None
|
| 563 |
-
try:
|
| 564 |
-
while True:
|
| 565 |
-
# We are not interested in the streamed messages here, just the final result.
|
| 566 |
-
next(stream_gen)
|
| 567 |
-
except StopIteration as e:
|
| 568 |
-
final_result = e.value
|
| 569 |
-
|
| 570 |
-
if final_result and final_result.get("status") == "success":
|
| 571 |
-
result_text = final_result.get("result", "")
|
| 572 |
-
match = re.search(r"https://github.com/[^\s]+", result_text)
|
| 573 |
-
if match:
|
| 574 |
-
found_url = match.group(0)
|
| 575 |
-
state.github_config["reference_pr_url"] = found_url
|
| 576 |
-
response += f"\n✅ **Agent found a reference PR:** {found_url}"
|
| 577 |
-
else:
|
| 578 |
-
raise ValueError(
|
| 579 |
-
"Could not extract a valid PR URL from agent's response."
|
| 580 |
-
)
|
| 581 |
-
else:
|
| 582 |
-
error_message = final_result.get("message") or final_result.get(
|
| 583 |
-
"result", "Unknown error"
|
| 584 |
-
)
|
| 585 |
-
raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")
|
| 586 |
-
except Exception as e:
|
| 587 |
-
response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
|
| 588 |
-
history.append(["Agent searching for PR", response])
|
| 589 |
-
return history, "", update_status()
|
| 590 |
-
|
| 591 |
-
# Proceed with PR generation
|
| 592 |
-
if state.files_to_translate and state.current_file_content.get("translated"):
|
| 593 |
-
current_file = state.files_to_translate[0]
|
| 594 |
-
translated_content = state.current_file_content["translated"]
|
| 595 |
-
response += "\n\n🚀 **Generating GitHub PR...**"
|
| 596 |
-
|
| 597 |
-
# Extract title from file for toctree mapping
|
| 598 |
-
file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
|
| 599 |
-
print(file_name)
|
| 600 |
-
|
| 601 |
-
pr_response = generate_github_pr(
|
| 602 |
-
target_language=state.target_language,
|
| 603 |
-
filepath=current_file,
|
| 604 |
-
translated_content=translated_content,
|
| 605 |
-
github_config=state.github_config,
|
| 606 |
-
en_title=file_name,
|
| 607 |
-
project=state.selected_project,
|
| 608 |
-
)
|
| 609 |
-
response += f"\n{pr_response}"
|
| 610 |
-
else:
|
| 611 |
-
response = "❌ No translated file available. Please complete the translation process first."
|
| 612 |
-
|
| 613 |
-
history.append(["GitHub PR creation request", response])
|
| 614 |
-
return history, "", update_status()
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
def restart_handler(history):
|
| 618 |
-
"""Resets the workflow state but preserves persistent settings."""
|
| 619 |
-
global state
|
| 620 |
-
# Backup persistent settings
|
| 621 |
-
backup_settings = state.persistent_settings.copy()
|
| 622 |
-
|
| 623 |
-
# Reset state
|
| 624 |
-
state = ChatState()
|
| 625 |
-
|
| 626 |
-
# Restore persistent settings
|
| 627 |
-
state.persistent_settings = backup_settings
|
| 628 |
-
|
| 629 |
-
# Restore environment variables
|
| 630 |
-
if backup_settings["anthropic_api_key"]:
|
| 631 |
-
os.environ["ANTHROPIC_API_KEY"] = backup_settings["anthropic_api_key"]
|
| 632 |
-
if backup_settings["aws_bearer_token_bedrock"]:
|
| 633 |
-
os.environ["AWS_BEARER_TOKEN_BEDROCK"] = backup_settings["aws_bearer_token_bedrock"]
|
| 634 |
-
if backup_settings["github_config"]["token"]:
|
| 635 |
-
os.environ["GITHUB_TOKEN"] = backup_settings["github_config"]["token"]
|
| 636 |
-
|
| 637 |
-
welcome_msg = get_welcome_message()
|
| 638 |
-
new_hist = [[None, welcome_msg]]
|
| 639 |
-
return new_hist, "", update_status(), gr.Tabs(selected=0)
|
|
|
|
| 1 |
+
"""Module for gradio chat-based translation agent interface."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import re
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
|
| 9 |
+
from agent.workflow import (
|
| 10 |
+
report_translation_target_files,
|
| 11 |
+
translate_docs_interactive,
|
| 12 |
+
generate_github_pr,
|
| 13 |
+
)
|
| 14 |
+
from pr_generator.searcher import find_reference_pr_simple_stream
|
| 15 |
+
from translator.content import get_full_prompt, get_content, preprocess_content
|
| 16 |
+
from translator.project_config import get_available_projects, get_project_config
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# State management
|
| 20 |
+
class ChatState:
|
| 21 |
+
def __init__(self):
|
| 22 |
+
self.step = "welcome" # welcome -> find_files -> translate -> create_github_pr
|
| 23 |
+
|
| 24 |
+
# Transient state (reset on restart)
|
| 25 |
+
self.selected_project = "transformers"
|
| 26 |
+
self.target_language = "ko"
|
| 27 |
+
self.k_files = 10
|
| 28 |
+
self.files_to_translate = []
|
| 29 |
+
self.additional_instruction = ""
|
| 30 |
+
self.current_file_content = {"translated": ""}
|
| 31 |
+
self.pr_result = None
|
| 32 |
+
|
| 33 |
+
# Persistent settings (preserved across restarts)
|
| 34 |
+
self.persistent_settings = {
|
| 35 |
+
"anthropic_api_key": "",
|
| 36 |
+
"aws_bearer_token_bedrock": "",
|
| 37 |
+
"github_config": {
|
| 38 |
+
"token": "",
|
| 39 |
+
"owner": "",
|
| 40 |
+
"repo_name": "",
|
| 41 |
+
"reference_pr_url": "",
|
| 42 |
+
}
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
def reset_transient_state(self):
|
| 46 |
+
"""Reset only the workflow state, keep persistent settings"""
|
| 47 |
+
self.step = "welcome"
|
| 48 |
+
self.selected_project = "transformers"
|
| 49 |
+
self.target_language = "ko"
|
| 50 |
+
self.k_files = 10
|
| 51 |
+
self.files_to_translate = []
|
| 52 |
+
self.additional_instruction = ""
|
| 53 |
+
self.current_file_content = {"translated": ""}
|
| 54 |
+
self.pr_result = None
|
| 55 |
+
|
| 56 |
+
@property
|
| 57 |
+
def github_config(self):
|
| 58 |
+
return self.persistent_settings["github_config"]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
state = ChatState()
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def _extract_content_for_display(content: str) -> str:
|
| 65 |
+
"""Extract text from document for display."""
|
| 66 |
+
# Remove Copyright header
|
| 67 |
+
to_translate = re.sub(r"<!--.*?-->", "", content, count=1, flags=re.DOTALL)
|
| 68 |
+
to_translate = to_translate.strip()
|
| 69 |
+
## remove code blocks from text
|
| 70 |
+
to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
| 71 |
+
## remove markdown tables from text
|
| 72 |
+
to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
| 73 |
+
## remove empty lines from text
|
| 74 |
+
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
| 75 |
+
|
| 76 |
+
return to_translate
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def get_welcome_message():
    """Return the Markdown greeting shown at the start of a chat session."""
    paragraphs = [
        "**👋 Welcome to 🌐 Hugging Face i18n Translation Agent!**",
        "I'll help you find files that need translation and translate them in a streamlined workflow.",
        "**🎯 First, select which project you want to translate:**",
        "Use the **`Quick Controls`** on the right to select a project, or **ask me `what`, `how`, or `help`** to get started.",
    ]
    # Paragraphs are separated by one blank line; the message ends with a newline.
    return "\n\n".join(paragraphs) + "\n"
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def process_file_search_handler(project: str, lang: str, k: int, history: list) -> tuple:
    """Run the "find files" step and return the values used to refresh the UI.

    Stores the chosen project, language and file count on the shared
    ``state``, asks ``report_translation_target_files`` for candidates and
    appends a summary of the outcome to ``history``.

    Returns:
        On success a 5-tuple ``(history, "", status_html, gr.Tabs(),
        dropdown_update)``. When the lookup fails with a message containing
        "rate limit", a 6-tuple that carries one extra
        ``gr.update(visible=False)``.

    Raises:
        Exception: any lookup failure that is not a rate-limit error.
    """
    global state
    state.selected_project, state.target_language, state.k_files = project, lang, k
    state.step = "find_files"

    try:
        status_report, files_list = report_translation_target_files(project, lang, k)
    except Exception as e:
        if "rate limit" not in str(e).lower():
            raise  # Re-raise non-rate-limit errors

        response = (
            "❌ **GitHub API Rate Limit Exceeded**\n"
            "\n"
            f"{e!s}\n"
            "\n"
            "**💡 To fix this:**\n"
            "1. Set GitHub Token in Configuration panel above\n"
            '2. Click "💾 Save Configuration"\n'
            '3. Try "Find Files" again'
        )
        history.append(["File search request", response])
        # NOTE(review): this path returns six values while the success path
        # below returns five - confirm against the Gradio `outputs` wiring.
        return (
            history,
            "",
            update_status(),
            gr.Tabs(selected=0),
            gr.update(choices=[]),
            gr.update(visible=False),
        )

    # Each search result is a sequence whose first element is the file path.
    state.files_to_translate = [entry[0] for entry in (files_list or [])]
    found = state.files_to_translate

    response = (
        "**✅ File search completed!**\n"
        "\n"
        "**Status Report:**\n"
        f"{status_report}\n"
        "\n"
        f"**📁 Found first {len(found)} files to translate:**\n"
    )

    if not found:
        response += "\nNo files found that need translation."
    else:
        config = get_project_config(state.selected_project)
        response += "".join(
            f"\n{position}. [`{path}`]({config.repo_url}/blob/main/{path})"
            for position, path in enumerate(found, 1)
        )
        response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"

    # Add to history
    history.append(["Please find files that need translation", response])

    # The last element supplies the file list used as the dropdown choices.
    return (
        history,
        "",
        update_status(),
        gr.Tabs(),  # Don't change tab
        update_dropdown_choices(state.files_to_translate),
    )
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def update_dropdown_choices(file_list):
    """Build a Gradio update with ``file_list`` as the choices and ``value`` set to None."""
    dropdown_update = gr.update(choices=file_list, value=None)
    return dropdown_update
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def confirm_and_go_translate_handler(history):
    """Acknowledge the current project/language choice and select tab index 1.

    Returns:
        ``(history, "", status_html, gr.Tabs(selected=1))``.
    """
    response = (
        "✅ **Selection confirmed!**\n\n"
        f"🎯 **Project:** {state.selected_project}\n"
        f"🌍 **Language:** {state.target_language}\n\n"
        "**➡️ Go to Tab 2 to start translation.**"
    )
    history.append(["Confirm selection", response])
    return history, "", update_status(), gr.Tabs(selected=1)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def confirm_translation_and_go_upload_handler(history):
    """Acknowledge the finished translation and select tab index 2.

    When no translated text is stored on ``state`` an error message is
    appended instead and the tab selection is left unchanged.

    Returns:
        ``(history, "", status_html, tabs_update)``.
    """
    has_translation = bool(state.current_file_content.get("translated"))

    if has_translation:
        file_label = state.files_to_translate[0] if state.files_to_translate else "Unknown"
        response = (
            "✅ **Translation confirmed!**\n\n"
            f"📄 **File:** `{file_label}`\n\n"
            "**➡️ Go to Tab 3 to upload PR.**"
        )
        tabs_update = gr.Tabs(selected=2)
    else:
        response = "❌ No translation available. Please complete translation first."
        tabs_update = gr.Tabs()

    history.append(["Upload PR request", response])
    return history, "", update_status(), tabs_update
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def start_translation_process(force_retranslate=False):
    """Translate the first queued file and save the result.

    Args:
        force_retranslate: Forwarded unchanged to ``translate_docs_interactive``.

    Returns:
        ``(response, translated)``: a Markdown status message and the
        translated text. ``translated`` is ``""`` when nothing is queued or
        when any step raised.
    """
    queue = state.files_to_translate
    if not queue:
        return "❌ No files available for translation.", ""

    current_file = queue[0]

    try:
        status, translated = translate_docs_interactive(
            state.target_language,
            [[current_file]],
            state.additional_instruction,
            state.selected_project,
            force_retranslate,
        )
        state.current_file_content = {"translated": translated}

        # Save a copy under <parent of this file's directory>/translation_result/.
        output_root = Path(__file__).resolve().parent.parent
        output_file = Path(output_root / f"translation_result/{current_file}")
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(translated, encoding="utf-8")

        config = get_project_config(state.selected_project)
        original_file_link = f"{config.repo_url}/blob/main/{current_file}"
        print("Compeleted translation:\n")
        print(translated)
        print("----------------------------")

        # A reused translation gets a shorter message than a fresh one.
        reused_existing = isinstance(status, str) and "Existing translation loaded" in status
        if reused_existing:
            response = f"{status}\n**📄 Original Content Link:** {original_file_link}\n\n**🌐 Translated Content:**"
        else:
            response = "".join(
                [
                    f"🔄 Translation for: `{current_file}`\n",
                    f"**📄 Original Content Link:** {original_file_link}\n\n",
                    f"{status}\n\n",
                    "**🌐 Translated Content:**",
                ]
            )
        return response, translated

    except Exception as e:
        failure = f"❌ Translation failed: {str(e)}" + "\n**➡️ Please try from the beginning.**"
        return failure, ""
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def handle_general_message(message):
    """Answer a free-form chat message that no workflow step claimed.

    Matching is a case-insensitive substring test, checked in this order:

    * "help", "what" or "how" -> capability overview
    * "restart"               -> workflow reset and welcome message
    * anything else           -> short getting-started hint

    Args:
        message: The raw user message.

    Returns:
        The Markdown reply text.
    """
    message_lower = message.lower()

    if any(word in message_lower for word in ["help", "what", "how"]):
        return """**🤖 I'm your Hugging Face i18n Translation Agent!**

I can help you:
1. **🔍 Find files** that need translation
2. **🌐 Translate documents** using AI
3. **📋 Review translations** for quality
4. **🚀 Create GitHub PR** for translation

Currently available actions with quick controls:
- "find files" - Search for files needing translation
- "translate" - Start translation process
- "review" - Review current translation
- "github" - Create GitHub Pull Request
- "restart" - Start over"""

    elif "restart" in message_lower:
        # Reset only the workflow fields. Replacing the whole state object
        # here would also discard the saved API keys and GitHub settings,
        # which restart_handler deliberately keeps.
        state.reset_transient_state()
        return get_welcome_message()

    else:
        return """I understand you want to work on translations!

**Two ways to get started:**

1. **🔍 Find Files first** - Use Tab 1 to discover files that need translation
2. **🚀 Direct Translation** - Go to Tab 2 and enter a file path directly (e.g., `docs/source/en/model_doc/bert.md`)

Make sure to configure your API keys in the Configuration panel above.
"""
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
# Main handler
|
| 268 |
+
def handle_user_message(message, history):
    """Route one chat message and append the reply to ``history``.

    Blank messages are ignored. During the "find_files" step a message
    containing one of the confirmation words starts translating the queued
    file; every other message is answered by ``handle_general_message``.

    Returns:
        ``(history, "")``; the empty string is the new input-box content.
    """
    global state

    if not message.strip():
        return history, ""

    confirmation_words = ["yes", "proceed", "start", "translate", "translation"]
    lowered = message.lower()
    wants_translation = state.step == "find_files" and any(
        word in lowered for word in confirmation_words
    )

    if not wants_translation:
        # General response
        response = handle_general_message(message)
    elif state.files_to_translate:
        # User wants to start translation
        state.step = "translate"
        response, translated = start_translation_process()
        history.extend([[message, response], ["", translated]])
        return history, ""
    else:
        response = "❌ No files available for translation. Please search for files first."

    history.append([message, response])
    return history, ""
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
def update_status():
    """Render the status panel HTML from the shared ``state``.

    During the "welcome" step a fixed four-cell panel is returned. For every
    other step the panel also shows a progress label and whether the GitHub
    token, owner and repository name are all set.
    """
    current_step = state.step

    if current_step == "welcome":
        return f"""
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
<div><strong>🔄 Step:</strong> Welcome</div>
<div><strong>🎯 Project:</strong> {state.selected_project}</div>
<div><strong>📁 Files:</strong> 0</div>
<div><strong>🌍 Language:</strong> {state.target_language}</div>
</div>
"""

    file_count = len(state.files_to_translate)

    # Display name for each known step; unknown steps are shown verbatim.
    step_labels = dict(
        welcome="Welcome",
        find_files="Finding Files",
        translate="Translating",
        review="Reviewing",
        create_github_pr="Creating PR",
    )
    # Progress text for each known step; unknown steps show "In progress".
    progress_labels = dict(
        welcome="Ready to start",
        find_files="Files found",
        translate=f"{file_count} remaining",
        review="Review complete",
        create_github_pr="PR generation in progress",
    )

    # Check GitHub configuration status
    gh = state.github_config
    required_values = [gh["token"], gh["owner"], gh["repo_name"]]
    if all(required_values):
        github_status = f"✅ {gh['owner']}/{gh['repo_name']}"
    else:
        github_status = "❌ Not configured"

    step_label = step_labels.get(current_step, current_step)
    progress_label = progress_labels.get(current_step, "In progress")

    return f"""
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
<div><strong>🔄 Step:</strong> {step_label}</div>
<div><strong>🎯 Project:</strong> {state.selected_project}</div>
<div><strong>📁 Files:</strong> {file_count}</div>
<div><strong>🌍 Language:</strong> {state.target_language}</div>
<div><strong>⏳ Progress:</strong> {progress_label}</div>
<div><strong>🔧 GitHub:</strong> {github_status}</div>
</div>
"""
|
| 351 |
+
|
| 352 |
+
|
| 353 |
+
# Event handlers
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
def sync_language_displays(lang):
    """Return ``lang`` unchanged."""
    selected_language = lang
    return selected_language
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
def update_project_selection(project, history):
    """Store the chosen project on ``state`` and confirm it in the chat.

    Returns:
        ``(history, "", status_html)``.
    """
    global state
    state.selected_project = project
    history.append(["Project selection", f"Selection confirmed: 🎯 Project → **{project}**"])
    status_html = update_status()
    return history, "", status_html
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
def update_language_selection(lang, history):
    """Store the chosen target language on ``state`` and confirm it in the chat.

    Returns:
        ``(history, "", status_html, lang)``.
    """
    global state
    state.target_language = lang
    history.append(["Language selection", f"Selection confirmed: 🌍 Language → **{lang}**"])
    status_html = update_status()
    return history, "", status_html, lang
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def update_persistent_config(api_provider, anthropic_key, aws_bearer_token_bedrock, github_token, github_owner, github_repo, reference_pr_url, history):
    """Save API credentials and GitHub settings, and mirror them to the environment.

    Exactly one LLM credential is kept active: the one matching
    ``api_provider`` is stored and exported, the other is blanked and its
    environment variable removed. Any other provider value clears both.

    Args:
        api_provider: ``"Anthropic"``, ``"AWS Bedrock"``, or anything else.
        anthropic_key: Value exported as ``ANTHROPIC_API_KEY``.
        aws_bearer_token_bedrock: Value exported as ``AWS_BEARER_TOKEN_BEDROCK``.
        github_token: Exported as ``GITHUB_TOKEN`` when non-empty.
        github_owner: Stored GitHub owner.
        github_repo: Stored GitHub repository name.
        reference_pr_url: Reference PR URL; when empty, the selected project's
            configured ``reference_pr_url`` is used if it can be loaded.
        history: Chat history to append the confirmation to.

    Returns:
        ``(history, "", status_html)``.
    """
    global state

    # provider name -> (settings key, environment variable, submitted secret)
    credentials = {
        "Anthropic": ("anthropic_api_key", "ANTHROPIC_API_KEY", anthropic_key),
        "AWS Bedrock": (
            "aws_bearer_token_bedrock",
            "AWS_BEARER_TOKEN_BEDROCK",
            aws_bearer_token_bedrock,
        ),
    }
    for provider, (setting_key, env_var, secret) in credentials.items():
        if provider == api_provider:
            state.persistent_settings[setting_key] = secret
            os.environ[env_var] = secret
        else:
            # Not the selected provider (or none selected): deactivate it.
            state.persistent_settings[setting_key] = ""
            os.environ.pop(env_var, None)

    # An empty token leaves any existing GITHUB_TOKEN in the environment as is.
    if github_token:
        os.environ["GITHUB_TOKEN"] = github_token

    # Get default reference PR URL from project config if not provided
    if not reference_pr_url and state.selected_project:
        try:
            reference_pr_url = get_project_config(state.selected_project).reference_pr_url
        except Exception:
            # Best effort: without a usable project config the URL stays
            # empty. Catching Exception rather than using a bare except
            # lets KeyboardInterrupt/SystemExit propagate.
            pass

    # Save GitHub configuration to persistent settings
    state.persistent_settings["github_config"].update({
        "token": github_token or "",
        "owner": github_owner or "",
        "repo_name": github_repo or "",
        "reference_pr_url": reference_pr_url or "",
    })

    # Build response message based on what was configured
    response = "✅ Configuration saved!"
    if github_owner and github_repo:
        response += f" GitHub: {github_owner}/{github_repo}"

    if api_provider == "Anthropic" and anthropic_key:
        response += " Anthropic API key updated."
    elif api_provider == "AWS Bedrock" and aws_bearer_token_bedrock:
        response += " AWS Bedrock Bearer Token updated."

    history.append(["Configuration update", response])
    return history, "", update_status()
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
def update_github_config(token, owner, repo, reference_pr_url):
    """Legacy entry point that updates only the GitHub settings.

    Delegates to ``update_persistent_config``. The currently stored LLM
    credential is passed back in with its matching provider name, so this
    call does not clear the API keys as a side effect.

    Args:
        token: GitHub token.
        owner: GitHub owner.
        repo: GitHub repository name.
        reference_pr_url: Reference PR URL (may be empty).

    Returns:
        The ``(history, "", status_html)`` tuple from
        ``update_persistent_config``, where ``history`` is a fresh list
        holding only the confirmation entry.
    """
    settings = state.persistent_settings
    anthropic_key = settings["anthropic_api_key"]
    bedrock_token = settings["aws_bearer_token_bedrock"]

    # Same precedence as start_translate_handler: Anthropic first, then Bedrock.
    if anthropic_key:
        provider = "Anthropic"
    elif bedrock_token:
        provider = "AWS Bedrock"
    else:
        provider = ""

    # update_persistent_config takes eight positional arguments; the previous
    # five-argument call raised TypeError on every invocation.
    return update_persistent_config(
        provider,
        anthropic_key,
        bedrock_token,
        token,
        owner,
        repo,
        reference_pr_url,
        [],
    )
|
| 438 |
+
|
| 439 |
+
|
| 440 |
+
def update_prompt_preview(language, file_path, additional_instruction):
    """Build the translation prompt preview for ``file_path``.

    Args:
        language: Target language code; ``"ko"`` is shown as "Korean", any
            other value is used as given.
        file_path: Path of the document to preview.
        additional_instruction: Extra instruction passed to the prompt builder.

    Returns:
        The prompt built from the first 500 characters of the preprocessed
        document, a placeholder when ``file_path`` is blank, or an error
        message when building the preview fails.
    """
    if not file_path.strip():
        return "Select a file to see the prompt preview..."

    preview_limit = 500

    try:
        # Get language name
        translation_lang = "Korean" if language == "ko" else language

        # Get sample content
        to_translate = preprocess_content(get_content(file_path, state.selected_project))

        # Truncate for preview, marking the cut with an ellipsis.
        sample_content = to_translate[:preview_limit]
        if len(to_translate) > preview_limit:
            sample_content += "..."

        # Generate prompt
        return get_full_prompt(translation_lang, sample_content, additional_instruction)
    except Exception as e:
        error_str = str(e)
        if "Failed to retrieve content from the URL" not in error_str:
            return f"Error generating prompt preview: {error_str}"
        return f"❌ **File not found:** `{file_path}`\n\n💡 **Please check:**\n1. Is this file in the **{state.selected_project}** project?\n2. Use \"🔍 Find Files to Translate\" to see available files\n3. Verify the file path is correct"
|
| 468 |
+
|
| 469 |
+
|
| 470 |
+
def send_message(message, history):
    """Handle a chat message and return the refreshed status with the result.

    Returns:
        ``(history, cleared_input, status_html)``.
    """
    result = handle_user_message(message, history)
    updated_history, input_value = result
    return updated_history, input_value, update_status()
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
# Button handlers with tab switching
|
| 476 |
+
def start_translate_handler(history, file_to_translate, additional_instruction="", force_retranslate=False):
    """Validate the configuration, then translate ``file_to_translate``.

    Args:
        history: Chat history to append progress messages to.
        file_to_translate: Path chosen in the dropdown or typed by the user.
        additional_instruction: Extra instruction stored on ``state``.
        force_retranslate: Forwarded to ``start_translation_process``.

    Returns:
        ``(history, "", status_html, gr.Tabs(), start_button_update,
        confirm_button_update)``.
    """
    settings = state.persistent_settings
    anthropic_key = settings["anthropic_api_key"]
    aws_bearer_token_bedrock = settings["aws_bearer_token_bedrock"]

    def reject(reason):
        # Report a validation failure without changing buttons or tabs.
        history.append(["Translation request", reason])
        return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()

    if not (anthropic_key or aws_bearer_token_bedrock):
        return reject("❌ Please set either Anthropic API key or AWS Bearer Token for Bedrock in Configuration panel first.")

    # Export the active credential for the translator and make sure the other
    # provider's variable is absent, so only one is active.
    if anthropic_key:
        active_var, active_value, inactive_var = "ANTHROPIC_API_KEY", anthropic_key, "AWS_BEARER_TOKEN_BEDROCK"
    else:
        active_var, active_value, inactive_var = "AWS_BEARER_TOKEN_BEDROCK", aws_bearer_token_bedrock, "ANTHROPIC_API_KEY"
    os.environ[active_var] = active_value
    os.environ.pop(inactive_var, None)

    # Check if file path is provided
    if not file_to_translate or not file_to_translate.strip():
        return reject("❌ Please select a file from the dropdown or enter a file path to translate.")

    state.additional_instruction = additional_instruction
    state.files_to_translate = [file_to_translate]
    state.step = "translate"

    # Start translation directly
    if force_retranslate:
        history.append(["Translation request", "🔄 **Force retranslation started...**"])
    response, translated = start_translation_process(force_retranslate)
    history.append(["", response])
    if translated:
        history.append(["", translated])

    # Update button text and show confirm button after translation
    has_translation = bool(state.current_file_content["translated"])
    start_btn_text = "🔄 Retranslation" if has_translation else "🚀 Start Translation"

    return (
        history,
        "",
        update_status(),
        gr.Tabs(),
        gr.update(value=start_btn_text),
        gr.update(visible=has_translation),
    )
|
| 517 |
+
|
| 518 |
+
|
| 519 |
+
def approve_handler(history, owner, repo, reference_pr_url):
    """Create a GitHub PR for the current translation and report it in the chat.

    Steps:
      1. Check that a GitHub token is saved and that ``owner``/``repo`` are given.
      2. Store ``reference_pr_url`` when provided; otherwise, if none is
         saved, ask ``find_reference_pr_simple_stream`` to find one.
      3. Call ``generate_github_pr`` for the first queued file.

    Args:
        history: Chat history to append the outcome to.
        owner: GitHub owner entered in the UI (only checked for presence).
        repo: Repository name entered in the UI (only checked for presence).
        reference_pr_url: Optional reference PR URL for this PR.

    Returns:
        ``(history, "", status_html)``.
    """
    global state
    state.step = "create_github_pr"

    # Check all required GitHub configuration at once
    github_config = state.persistent_settings["github_config"]
    missing_config = []

    if not github_config.get("token"):
        missing_config.append("GitHub Token")
    if not owner:
        missing_config.append("GitHub Owner")
    if not repo:
        missing_config.append("Repository Name")

    if missing_config:
        config = get_project_config(state.selected_project)
        repo_name = config.repo_url.split('/')[-1]  # Extract repo name from URL
        response = f"❌ Please set the following in Configuration panel first: {', '.join(missing_config)}\n\n💡 **Note:** GitHub Owner/Repository should be your fork of [`{repo_name}`]({config.repo_url}) (e.g., Owner: `your-username`, Repository: `{repo_name}`)"
        history.append(["GitHub PR creation request", response])
        return history, "", update_status()

    # Update reference PR URL (can be set per PR)
    if reference_pr_url:
        github_config["reference_pr_url"] = reference_pr_url

    response = ""

    # If reference PR is not provided, use the agent to find one
    if not github_config.get("reference_pr_url"):
        response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
        try:
            stream_gen = find_reference_pr_simple_stream(
                target_language=state.target_language,
                context="documentation translation",
            )
            # Drain the generator: the streamed messages are not needed here,
            # only the value carried by StopIteration.
            final_result = None
            try:
                while True:
                    next(stream_gen)
            except StopIteration as e:
                final_result = e.value

            # The generator may finish without a value. Treat that as a
            # failure with an unknown reason rather than calling .get on None.
            result = final_result or {}
            if result.get("status") != "success":
                error_message = result.get("message") or result.get("result", "Unknown error")
                raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")

            match = re.search(r"https://github\.com/\S+", result.get("result", ""))
            if not match:
                raise ValueError("Could not extract a valid PR URL from agent's response.")

            # Drop sentence/markup punctuation that \S+ picks up after the URL.
            found_url = match.group(0).rstrip(".,;:)]>\"'")
            state.github_config["reference_pr_url"] = found_url
            response += f"\n✅ **Agent found a reference PR:** {found_url}"
        except Exception as e:
            response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
            history.append(["Agent searching for PR", response])
            return history, "", update_status()

    # Proceed with PR generation
    if state.files_to_translate and state.current_file_content.get("translated"):
        current_file = state.files_to_translate[0]
        translated_content = state.current_file_content["translated"]
        response += "\n\n🚀 **Generating GitHub PR...**"

        # Extract title from file for toctree mapping
        file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
        print(file_name)

        pr_response = generate_github_pr(
            target_language=state.target_language,
            filepath=current_file,
            translated_content=translated_content,
            github_config=state.github_config,
            en_title=file_name,
            project=state.selected_project,
        )
        response += f"\n{pr_response}"
    else:
        response = "❌ No translated file available. Please complete the translation process first."

    history.append(["GitHub PR creation request", response])
    return history, "", update_status()
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
def restart_handler(history):
    """Start a new workflow while keeping the saved settings.

    A new ``ChatState`` replaces the module-level ``state``; the previous
    ``persistent_settings`` are carried over and the non-empty credentials
    are exported to the environment again.

    Returns:
        ``(new_history, "", status_html, gr.Tabs(selected=0))`` where
        ``new_history`` contains only the welcome message.
    """
    global state

    # Shallow copy: the nested github_config dict is shared with the old state.
    kept_settings = state.persistent_settings.copy()

    state = ChatState()
    state.persistent_settings = kept_settings

    # Restore environment variables for every credential that has a value.
    env_values = (
        ("ANTHROPIC_API_KEY", kept_settings["anthropic_api_key"]),
        ("AWS_BEARER_TOKEN_BEDROCK", kept_settings["aws_bearer_token_bedrock"]),
        ("GITHUB_TOKEN", kept_settings["github_config"]["token"]),
    )
    for env_var, value in env_values:
        if value:
            os.environ[env_var] = value

    fresh_history = [[None, get_welcome_message()]]
    return fresh_history, "", update_status(), gr.Tabs(selected=0)
|
agent/toctree_handler.py
CHANGED
|
@@ -1,419 +1,419 @@
|
|
| 1 |
-
import yaml
|
| 2 |
-
import requests
|
| 3 |
-
from typing import Dict, List, Any
|
| 4 |
-
import os
|
| 5 |
-
|
| 6 |
-
class TocTreeHandler:
|
| 7 |
-
def __init__(self, project: str = "transformers"):
    """Load the project's configuration and derive its toctree URLs.

    Args:
        project: Project key passed to ``get_project_config``.
    """
    from translator.project_config import get_project_config

    self.project = project
    self.project_config = get_project_config(project)

    # Extract repository path from config, then build project-specific URLs
    # for the English and Korean toctree files on the main branch.
    repo_path = self.project_config.repo_url.replace("https://github.com/", "")
    docs_root = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source"
    self.en_toctree_url = f"{docs_root}/en/_toctree.yml"
    self.ko_toctree_url = f"{docs_root}/ko/_toctree.yml"
    self.local_docs_path = "docs/source/ko"
|
| 19 |
-
|
| 20 |
-
def fetch_toctree(self, url: str) -> Dict[str, Any]:
    """Download ``url`` and parse the response body as YAML.

    Args:
        url: Address of the toctree YAML file.

    Returns:
        The parsed YAML document.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    # Without a timeout requests waits indefinitely on an unresponsive
    # server, which would hang the whole PR workflow.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return yaml.safe_load(response.text)
|
| 25 |
-
|
| 26 |
-
def get_en_toctree(self) -> Dict[str, Any]:
    """Fetch and parse the toctree at ``self.en_toctree_url``."""
    english_toctree = self.fetch_toctree(self.en_toctree_url)
    return english_toctree
|
| 29 |
-
|
| 30 |
-
def get_ko_toctree(self) -> Dict[str, Any]:
    """Fetch and parse the toctree at ``self.ko_toctree_url``."""
    korean_toctree = self.fetch_toctree(self.ko_toctree_url)
    return korean_toctree
|
| 33 |
-
|
| 34 |
-
def extract_title_mappings(self, en_data: List[Dict], ko_data: List[Dict]) -> Dict[str, str]:
    """Map English titles to Korean titles by walking both toctrees in parallel.

    Entries are paired by position at every nesting level. A pair
    contributes a mapping only when both entries have the same ``local``
    value and both titles are non-empty.

    Args:
        en_data: English toctree entries.
        ko_data: Korean toctree entries.

    Returns:
        ``{english_title: korean_title}``.
    """
    title_map: Dict[str, str] = {}

    def visit(en_node: Dict, ko_node: Dict) -> None:
        same_page = (
            'local' in en_node
            and 'local' in ko_node
            and en_node['local'] == ko_node['local']
        )
        if same_page:
            english = en_node.get('title', '')
            korean = ko_node.get('title', '')
            if english and korean:
                title_map[english] = korean

        if 'sections' in en_node and 'sections' in ko_node:
            # zip stops at the shorter list, so unpaired entries are ignored.
            for en_child, ko_child in zip(en_node['sections'], ko_node['sections']):
                visit(en_child, ko_child)

    for en_top, ko_top in zip(en_data, ko_data):
        visit(en_top, ko_top)

    return title_map
|
| 59 |
-
|
| 60 |
-
def translate_title(self, en_title: str) -> str:
    """Translate an English documentation title to Korean with the LLM.

    Args:
        en_title: Title to translate.

    Returns:
        The stripped translation, or ``en_title`` unchanged when importing
        or calling the translator raises.
    """
    try:
        from translator.content import llm_translate

        prompt = (
            "Translate the following English documentation title to Korean. Return only the translated title, nothing else.\n"
            "\n"
            f"English title: {en_title}\n"
            "\n"
            "Korean title:"
        )

        # llm_translate returns a pair; only the second element (the text) is used.
        _, translated_title = llm_translate(prompt)
        return translated_title.strip()
    except Exception as e:
        print(f"Error translating title '{en_title}': {e}")
        return en_title
|
| 76 |
-
|
| 77 |
-
def create_local_toctree(self, en_title: str, local_file_path: str) -> Dict[str, str]:
    """Build a toctree entry ``{'local': ..., 'title': ...}`` for a translated page.

    The Korean title is taken from the existing English/Korean toctree
    mappings when one exists, and produced by ``translate_title`` otherwise.
    If anything raises, the English title is used.

    Args:
        en_title: English title of the page.
        local_file_path: Value for the entry's ``local`` key.
    """
    entry_title = en_title
    try:
        # First try to get Korean title from existing mappings
        known_titles = self.extract_title_mappings(
            self.get_en_toctree(), self.get_ko_toctree()
        )
        # If no existing mapping, translate the title
        entry_title = known_titles.get(en_title) or self.translate_title(en_title)
    except Exception as e:
        print(f"Error creating local toctree: {e}")
        entry_title = en_title

    return {'local': local_file_path, 'title': entry_title}
|
| 101 |
-
|
| 102 |
-
def find_and_update_translation_entry(self, ko_toctree_data, target_local: str, english_title: str, korean_title: str):
    """Replace the "(번역중) <english_title>" placeholder entry in place.

    The toctree is searched depth-first in document order. The first dict
    whose ``title`` equals the placeholder gets ``local`` set to
    ``target_local`` and ``title`` set to ``korean_title``.

    Returns:
        True when an entry was updated; False when none matched or when
        ``ko_toctree_data`` is not a list.
    """
    if not isinstance(ko_toctree_data, list):
        return False

    placeholder_title = f"(번역중) {english_title}"

    # Explicit stack; children are pushed reversed so they pop in document order.
    pending = list(reversed(ko_toctree_data))
    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            continue

        # Check if title matches the pattern
        if node.get('title') == placeholder_title:
            # Update local path and title
            node['local'] = target_local
            node['title'] = korean_title
            return True

        if 'sections' in node:
            pending.extend(reversed(node['sections']))

    return False
|
| 128 |
-
|
| 129 |
-
def create_updated_toctree_with_replacement(self, ko_toctree: list, target_local: str) -> list:
    """Return a Korean toctree in which the placeholder for ``target_local`` is filled in.

    The English title for ``target_local`` is looked up in the English
    toctree and translated, and the matching "(번역중) <title>" entry is
    updated in a deep copy of ``ko_toctree``.

    Returns:
        * the updated copy when the placeholder entry was found;
        * the result of ``add_new_toctree_entry`` when ``target_local`` is
          not in the English toctree;
        * ``ko_toctree`` itself when no placeholder matched or an error occurred.
    """
    try:
        import copy

        # Step 1: Get English toctree and find the English title for target_local
        english_title = self.find_title_for_local(self.get_en_toctree(), target_local)

        if not english_title:
            print(f"⚠️ Toctree entry not found: '{target_local}' not in English toctree")
            print("🔍 Attempting to find appropriate section for new entry...")
            # Try to add new entry in appropriate location
            return self.add_new_toctree_entry(ko_toctree, target_local)

        print(f"Found English title: {english_title} for local: {target_local}")

        # Step 2: Translate the English title to Korean
        korean_title = self.translate_title(english_title)
        print(f"Translated Korean title: {korean_title}")

        # Step 3: Work on a deep copy so the caller's toctree is not modified
        working_copy = copy.deepcopy(ko_toctree)

        # Step 4: Find and update the "(번역중) <English title>" entry
        was_updated = self.find_and_update_translation_entry(
            working_copy, target_local, english_title, korean_title
        )

        if not was_updated:
            print(f"⚠️ Toctree update skipped: '(번역중) {english_title}' entry not found")
            print("📋 This may be a new file not yet added to Korean toctree")
            return ko_toctree

        print(f"✅ Successfully updated translation entry: local={target_local}, title={korean_title}")
        return working_copy

    except Exception as e:
        print(f"Error creating updated toctree: {e}")
        return ko_toctree
|
| 168 |
-
|
| 169 |
-
def find_title_for_local(self, toctree_data, target_local: str):
|
| 170 |
-
"""Find title for given local path in toctree"""
|
| 171 |
-
def search_item(item):
|
| 172 |
-
if isinstance(item, dict):
|
| 173 |
-
if item.get('local') == target_local:
|
| 174 |
-
return item.get('title', '')
|
| 175 |
-
|
| 176 |
-
if 'sections' in item:
|
| 177 |
-
for section in item['sections']:
|
| 178 |
-
result = search_item(section)
|
| 179 |
-
if result:
|
| 180 |
-
return result
|
| 181 |
-
return None
|
| 182 |
-
|
| 183 |
-
if isinstance(toctree_data, list):
|
| 184 |
-
for item in toctree_data:
|
| 185 |
-
result = search_item(item)
|
| 186 |
-
if result:
|
| 187 |
-
return result
|
| 188 |
-
return None
|
| 189 |
-
|
| 190 |
-
def process_pr_commit(self, filepath: str):
|
| 191 |
-
"""Process PR commit by updating Korean toctree with translated entry"""
|
| 192 |
-
# Get filepath without prefix
|
| 193 |
-
filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
|
| 194 |
-
|
| 195 |
-
# Get Korean toctree
|
| 196 |
-
ko_toctree = self.get_ko_toctree()
|
| 197 |
-
|
| 198 |
-
# Use diff-merge algorithm to add new entry
|
| 199 |
-
updated_ko_toctree = self.add_new_toctree_entry(ko_toctree, filepath_without_prefix)
|
| 200 |
-
|
| 201 |
-
if not updated_ko_toctree:
|
| 202 |
-
print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
|
| 203 |
-
return
|
| 204 |
-
|
| 205 |
-
print(f"Successfully updated Korean toctree")
|
| 206 |
-
|
| 207 |
-
# Store the updated toctree for commit
|
| 208 |
-
self.updated_ko_toctree = updated_ko_toctree
|
| 209 |
-
|
| 210 |
-
def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
|
| 211 |
-
"""Commit and push toctree updates as a separate commit"""
|
| 212 |
-
try:
|
| 213 |
-
# Use the updated toctree created by LLM
|
| 214 |
-
if not hasattr(self, 'updated_ko_toctree') or not self.updated_ko_toctree:
|
| 215 |
-
print("No updated Korean toctree available")
|
| 216 |
-
return {"status": "error", "message": "No updated toctree to commit"}
|
| 217 |
-
|
| 218 |
-
ko_data = self.updated_ko_toctree
|
| 219 |
-
|
| 220 |
-
# Convert to YAML string
|
| 221 |
-
toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
| 222 |
-
|
| 223 |
-
# Create toctree commit message
|
| 224 |
-
commit_message = "docs: update Korean documentation table of contents"
|
| 225 |
-
|
| 226 |
-
# Commit toctree file
|
| 227 |
-
file_result = pr_agent.create_or_update_file(
|
| 228 |
-
owner=owner,
|
| 229 |
-
repo_name=repo_name,
|
| 230 |
-
path="docs/source/ko/_toctree.yml",
|
| 231 |
-
message=commit_message,
|
| 232 |
-
content=toctree_content,
|
| 233 |
-
branch_name=branch_name
|
| 234 |
-
)
|
| 235 |
-
|
| 236 |
-
if file_result.startswith("SUCCESS"):
|
| 237 |
-
return {
|
| 238 |
-
"status": "success",
|
| 239 |
-
"message": f"Toctree committed successfully: {file_result}",
|
| 240 |
-
"commit_message": commit_message
|
| 241 |
-
}
|
| 242 |
-
else:
|
| 243 |
-
return {
|
| 244 |
-
"status": "error",
|
| 245 |
-
"message": f"Toctree commit failed: {file_result}"
|
| 246 |
-
}
|
| 247 |
-
|
| 248 |
-
except Exception as e:
|
| 249 |
-
return {
|
| 250 |
-
"status": "error",
|
| 251 |
-
"message": f"Error committing toctree: {str(e)}"
|
| 252 |
-
}
|
| 253 |
-
|
| 254 |
-
def update_toctree_after_translation(
|
| 255 |
-
self,
|
| 256 |
-
translation_result: dict,
|
| 257 |
-
filepath: str,
|
| 258 |
-
pr_agent,
|
| 259 |
-
github_config: dict,
|
| 260 |
-
project: str = "transformers"
|
| 261 |
-
) -> dict:
|
| 262 |
-
"""Update toctree after successful translation PR.
|
| 263 |
-
|
| 264 |
-
Args:
|
| 265 |
-
translation_result: Result from translation PR workflow
|
| 266 |
-
filepath: Original file path
|
| 267 |
-
pr_agent: GitHub PR agent instance
|
| 268 |
-
github_config: GitHub configuration dictionary
|
| 269 |
-
|
| 270 |
-
Returns:
|
| 271 |
-
Dictionary with toctree update result
|
| 272 |
-
"""
|
| 273 |
-
if translation_result["status"] == "error":
|
| 274 |
-
return None
|
| 275 |
-
|
| 276 |
-
try:
|
| 277 |
-
# Process toctree update with LLM
|
| 278 |
-
self.process_pr_commit(filepath)
|
| 279 |
-
# Commit toctree as separate commit
|
| 280 |
-
if self.updated_ko_toctree:
|
| 281 |
-
return self.commit_and_push_toctree(
|
| 282 |
-
pr_agent=pr_agent,
|
| 283 |
-
owner=github_config["owner"],
|
| 284 |
-
repo_name=github_config["repo_name"],
|
| 285 |
-
branch_name=translation_result["branch"]
|
| 286 |
-
)
|
| 287 |
-
|
| 288 |
-
except Exception as e:
|
| 289 |
-
return {
|
| 290 |
-
"status": "error",
|
| 291 |
-
"message": f"Error updating toctree: {str(e)}"
|
| 292 |
-
}
|
| 293 |
-
|
| 294 |
-
def add_new_toctree_entry(self, ko_toctree: list, target_local: str) -> list:
|
| 295 |
-
"""Add new toctree entry using diff-merge algorithm"""
|
| 296 |
-
try:
|
| 297 |
-
import copy
|
| 298 |
-
updated_toctree = copy.deepcopy(ko_toctree)
|
| 299 |
-
|
| 300 |
-
# Generate new entry
|
| 301 |
-
filename = target_local.split('/')[-1].replace('_', ' ').title()
|
| 302 |
-
korean_title = self.translate_title(filename)
|
| 303 |
-
new_entry = {
|
| 304 |
-
'local': target_local,
|
| 305 |
-
'title': korean_title
|
| 306 |
-
}
|
| 307 |
-
|
| 308 |
-
# Get English toctree for structure reference
|
| 309 |
-
en_toctree = self.get_en_toctree()
|
| 310 |
-
|
| 311 |
-
# Use diff-merge algorithm
|
| 312 |
-
if self.merge_toctree_sections(en_toctree, updated_toctree, target_local, new_entry):
|
| 313 |
-
return updated_toctree
|
| 314 |
-
else:
|
| 315 |
-
# Fallback: add to root level
|
| 316 |
-
updated_toctree.append(new_entry)
|
| 317 |
-
print(f"✅ Added new entry at root level: {target_local} -> {korean_title}")
|
| 318 |
-
return updated_toctree
|
| 319 |
-
|
| 320 |
-
except Exception as e:
|
| 321 |
-
print(f"❌ Error adding new toctree entry: {e}")
|
| 322 |
-
return ko_toctree
|
| 323 |
-
|
| 324 |
-
def merge_toctree_sections(self, en_toctree: list, ko_toctree: list, target_local: str, new_entry: dict) -> bool:
|
| 325 |
-
"""Merge English toctree structure into Korean toctree for target_local"""
|
| 326 |
-
for en_section in en_toctree:
|
| 327 |
-
en_title = en_section.get('title')
|
| 328 |
-
|
| 329 |
-
# Check if this English section contains our target
|
| 330 |
-
if self.contains_target(en_section, target_local):
|
| 331 |
-
# Find matching Korean section
|
| 332 |
-
ko_section = self.find_matching_section(ko_toctree, en_title)
|
| 333 |
-
|
| 334 |
-
if ko_section:
|
| 335 |
-
# Section exists - merge subsections
|
| 336 |
-
return self.merge_subsections(en_section, ko_section, target_local, new_entry)
|
| 337 |
-
else:
|
| 338 |
-
# Section doesn't exist - create new section
|
| 339 |
-
new_ko_section = self.create_section_with_order(en_section, target_local, new_entry)
|
| 340 |
-
ko_toctree.append(new_ko_section)
|
| 341 |
-
print(f"✅ Created new section '{new_ko_section.get('title')}' with ordered structure")
|
| 342 |
-
return True
|
| 343 |
-
return False
|
| 344 |
-
|
| 345 |
-
def contains_target(self, section: dict, target_local: str) -> bool:
|
| 346 |
-
"""Check if section contains target_local recursively"""
|
| 347 |
-
if 'sections' in section:
|
| 348 |
-
for subsection in section['sections']:
|
| 349 |
-
if subsection.get('local') == target_local:
|
| 350 |
-
return True
|
| 351 |
-
if self.contains_target(subsection, target_local):
|
| 352 |
-
return True
|
| 353 |
-
return False
|
| 354 |
-
|
| 355 |
-
def find_matching_section(self, ko_toctree: list, en_title: str) -> dict:
|
| 356 |
-
"""Find Korean section that matches English title"""
|
| 357 |
-
# Try exact match first
|
| 358 |
-
for item in ko_toctree:
|
| 359 |
-
if item.get('title') == en_title:
|
| 360 |
-
return item
|
| 361 |
-
|
| 362 |
-
# Try translated title match
|
| 363 |
-
try:
|
| 364 |
-
translated_title = self.translate_title(en_title)
|
| 365 |
-
for item in ko_toctree:
|
| 366 |
-
if item.get('title') == translated_title:
|
| 367 |
-
return item
|
| 368 |
-
except:
|
| 369 |
-
pass
|
| 370 |
-
|
| 371 |
-
return None
|
| 372 |
-
|
| 373 |
-
def merge_subsections(self, en_section: dict, ko_section: dict, target_local: str, new_entry: dict) -> bool:
|
| 374 |
-
"""Merge subsections while maintaining order"""
|
| 375 |
-
if 'sections' not in en_section:
|
| 376 |
-
return False
|
| 377 |
-
|
| 378 |
-
# Find target index in English sections
|
| 379 |
-
target_index = -1
|
| 380 |
-
for i, en_subsection in enumerate(en_section['sections']):
|
| 381 |
-
if en_subsection.get('local') == target_local:
|
| 382 |
-
target_index = i
|
| 383 |
-
break
|
| 384 |
-
|
| 385 |
-
if target_index == -1:
|
| 386 |
-
return False
|
| 387 |
-
|
| 388 |
-
# Ensure Korean section has sections array
|
| 389 |
-
if 'sections' not in ko_section:
|
| 390 |
-
ko_section['sections'] = []
|
| 391 |
-
|
| 392 |
-
# Insert at correct position
|
| 393 |
-
self.insert_at_correct_position(ko_section, target_index, new_entry)
|
| 394 |
-
return True
|
| 395 |
-
|
| 396 |
-
def insert_at_correct_position(self, ko_section: dict, target_index: int, new_entry: dict):
|
| 397 |
-
"""Insert entry at correct position, expanding array if needed"""
|
| 398 |
-
sections = ko_section['sections']
|
| 399 |
-
|
| 400 |
-
# Expand sections array if needed
|
| 401 |
-
while len(sections) <= target_index:
|
| 402 |
-
sections.append(None) # Placeholder
|
| 403 |
-
|
| 404 |
-
# Insert new entry
|
| 405 |
-
sections[target_index] = new_entry
|
| 406 |
-
|
| 407 |
-
# Clean up None placeholders at the end
|
| 408 |
-
while sections and sections[-1] is None:
|
| 409 |
-
sections.pop()
|
| 410 |
-
|
| 411 |
-
def create_section_with_order(self, en_section: dict, target_local: str, new_entry: dict) -> dict:
|
| 412 |
-
"""Create new Korean section with only the translated entry"""
|
| 413 |
-
new_ko_section = {
|
| 414 |
-
'title': self.translate_title(en_section.get('title')),
|
| 415 |
-
'isExpanded': en_section.get('isExpanded', False),
|
| 416 |
-
'sections': [new_entry] # Only add the translated entry
|
| 417 |
-
}
|
| 418 |
-
|
| 419 |
-
return new_ko_section
|
|
|
|
| 1 |
+
import yaml
|
| 2 |
+
import requests
|
| 3 |
+
from typing import Dict, List, Any
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
class TocTreeHandler:
|
| 7 |
+
def __init__(self, project: str = "transformers"):
|
| 8 |
+
from translator.project_config import get_project_config
|
| 9 |
+
self.project = project
|
| 10 |
+
self.project_config = get_project_config(project)
|
| 11 |
+
|
| 12 |
+
# Extract repository path from config
|
| 13 |
+
repo_path = self.project_config.repo_url.replace("https://github.com/", "")
|
| 14 |
+
|
| 15 |
+
# Build project-specific URLs
|
| 16 |
+
self.en_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/en/_toctree.yml"
|
| 17 |
+
self.ko_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/ko/_toctree.yml"
|
| 18 |
+
self.local_docs_path = "docs/source/ko"
|
| 19 |
+
|
| 20 |
+
def fetch_toctree(self, url: str) -> Dict[str, Any]:
|
| 21 |
+
"""Fetch and parse YAML from URL"""
|
| 22 |
+
response = requests.get(url)
|
| 23 |
+
response.raise_for_status()
|
| 24 |
+
return yaml.safe_load(response.text)
|
| 25 |
+
|
| 26 |
+
def get_en_toctree(self) -> Dict[str, Any]:
|
| 27 |
+
"""Get English toctree structure"""
|
| 28 |
+
return self.fetch_toctree(self.en_toctree_url)
|
| 29 |
+
|
| 30 |
+
def get_ko_toctree(self) -> Dict[str, Any]:
|
| 31 |
+
"""Get Korean toctree structure"""
|
| 32 |
+
return self.fetch_toctree(self.ko_toctree_url)
|
| 33 |
+
|
| 34 |
+
def extract_title_mappings(self, en_data: List[Dict], ko_data: List[Dict]) -> Dict[str, str]:
|
| 35 |
+
"""Extract title mappings between English and Korean"""
|
| 36 |
+
mappings = {}
|
| 37 |
+
|
| 38 |
+
def process_section(en_section: Dict, ko_section: Dict):
|
| 39 |
+
if 'local' in en_section and 'local' in ko_section:
|
| 40 |
+
if en_section['local'] == ko_section['local']:
|
| 41 |
+
en_title = en_section.get('title', '')
|
| 42 |
+
ko_title = ko_section.get('title', '')
|
| 43 |
+
if en_title and ko_title:
|
| 44 |
+
mappings[en_title] = ko_title
|
| 45 |
+
|
| 46 |
+
if 'sections' in en_section and 'sections' in ko_section:
|
| 47 |
+
en_sections = en_section['sections']
|
| 48 |
+
ko_sections = ko_section['sections']
|
| 49 |
+
|
| 50 |
+
for i, en_sub in enumerate(en_sections):
|
| 51 |
+
if i < len(ko_sections):
|
| 52 |
+
process_section(en_sub, ko_sections[i])
|
| 53 |
+
|
| 54 |
+
for i, en_item in enumerate(en_data):
|
| 55 |
+
if i < len(ko_data):
|
| 56 |
+
process_section(en_item, ko_data[i])
|
| 57 |
+
|
| 58 |
+
return mappings
|
| 59 |
+
|
| 60 |
+
def translate_title(self, en_title: str) -> str:
|
| 61 |
+
"""Translate English title to Korean using LLM"""
|
| 62 |
+
try:
|
| 63 |
+
from translator.content import llm_translate
|
| 64 |
+
|
| 65 |
+
prompt = f"""Translate the following English documentation title to Korean. Return only the translated title, nothing else.
|
| 66 |
+
|
| 67 |
+
English title: {en_title}
|
| 68 |
+
|
| 69 |
+
Korean title:"""
|
| 70 |
+
|
| 71 |
+
callback_result, translated_title = llm_translate(prompt)
|
| 72 |
+
return translated_title.strip()
|
| 73 |
+
except Exception as e:
|
| 74 |
+
print(f"Error translating title '{en_title}': {e}")
|
| 75 |
+
return en_title
|
| 76 |
+
|
| 77 |
+
def create_local_toctree(self, en_title: str, local_file_path: str) -> Dict[str, str]:
|
| 78 |
+
"""Create local toctree entry with Korean title and local path"""
|
| 79 |
+
try:
|
| 80 |
+
# First try to get Korean title from existing mappings
|
| 81 |
+
en_data = self.get_en_toctree()
|
| 82 |
+
ko_data = self.get_ko_toctree()
|
| 83 |
+
|
| 84 |
+
title_mappings = self.extract_title_mappings(en_data, ko_data)
|
| 85 |
+
ko_title = title_mappings.get(en_title)
|
| 86 |
+
|
| 87 |
+
# If no existing mapping, translate the title
|
| 88 |
+
if not ko_title:
|
| 89 |
+
ko_title = self.translate_title(en_title)
|
| 90 |
+
|
| 91 |
+
return {
|
| 92 |
+
'local': local_file_path,
|
| 93 |
+
'title': ko_title
|
| 94 |
+
}
|
| 95 |
+
except Exception as e:
|
| 96 |
+
print(f"Error creating local toctree: {e}")
|
| 97 |
+
return {
|
| 98 |
+
'local': local_file_path,
|
| 99 |
+
'title': en_title
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
def find_and_update_translation_entry(self, ko_toctree_data, target_local: str, english_title: str, korean_title: str):
|
| 103 |
+
"""Find entry with '(번역중) 영어제목' and update it"""
|
| 104 |
+
target_title_pattern = f"(번역중) {english_title}"
|
| 105 |
+
|
| 106 |
+
def process_item(item):
|
| 107 |
+
if isinstance(item, dict):
|
| 108 |
+
# Check if title matches the pattern
|
| 109 |
+
if item.get('title') == target_title_pattern:
|
| 110 |
+
# Update local path and title
|
| 111 |
+
item['local'] = target_local
|
| 112 |
+
item['title'] = korean_title
|
| 113 |
+
return True
|
| 114 |
+
|
| 115 |
+
# Process sections recursively
|
| 116 |
+
if 'sections' in item:
|
| 117 |
+
for section in item['sections']:
|
| 118 |
+
if process_item(section):
|
| 119 |
+
return True
|
| 120 |
+
return False
|
| 121 |
+
|
| 122 |
+
# Process the toctree data
|
| 123 |
+
if isinstance(ko_toctree_data, list):
|
| 124 |
+
for item in ko_toctree_data:
|
| 125 |
+
if process_item(item):
|
| 126 |
+
return True
|
| 127 |
+
return False
|
| 128 |
+
|
| 129 |
+
def create_updated_toctree_with_replacement(self, ko_toctree: list, target_local: str) -> list:
|
| 130 |
+
"""Update Korean toctree by finding and updating translation entry"""
|
| 131 |
+
try:
|
| 132 |
+
# Step 1: Get English toctree and find the English title for target_local
|
| 133 |
+
en_toctree = self.get_en_toctree()
|
| 134 |
+
english_title = self.find_title_for_local(en_toctree, target_local)
|
| 135 |
+
|
| 136 |
+
if not english_title:
|
| 137 |
+
print(f"⚠️ Toctree entry not found: '{target_local}' not in English toctree")
|
| 138 |
+
print(f"🔍 Attempting to find appropriate section for new entry...")
|
| 139 |
+
# Try to add new entry in appropriate location
|
| 140 |
+
return self.add_new_toctree_entry(ko_toctree, target_local)
|
| 141 |
+
|
| 142 |
+
print(f"Found English title: {english_title} for local: {target_local}")
|
| 143 |
+
|
| 144 |
+
# Step 2: Translate the English title to Korean
|
| 145 |
+
korean_title = self.translate_title(english_title)
|
| 146 |
+
print(f"Translated Korean title: {korean_title}")
|
| 147 |
+
|
| 148 |
+
# Step 3: Make a deep copy to avoid modifying original
|
| 149 |
+
import copy
|
| 150 |
+
updated_toctree = copy.deepcopy(ko_toctree)
|
| 151 |
+
|
| 152 |
+
# Step 4: Find and update the "(번역중) 영어제목" entry
|
| 153 |
+
updated = self.find_and_update_translation_entry(
|
| 154 |
+
updated_toctree, target_local, english_title, korean_title
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
if updated:
|
| 158 |
+
print(f"✅ Successfully updated translation entry: local={target_local}, title={korean_title}")
|
| 159 |
+
return updated_toctree
|
| 160 |
+
else:
|
| 161 |
+
print(f"⚠️ Toctree update skipped: '(번역중) {english_title}' entry not found")
|
| 162 |
+
print(f"📋 This may be a new file not yet added to Korean toctree")
|
| 163 |
+
return ko_toctree
|
| 164 |
+
|
| 165 |
+
except Exception as e:
|
| 166 |
+
print(f"Error creating updated toctree: {e}")
|
| 167 |
+
return ko_toctree
|
| 168 |
+
|
| 169 |
+
def find_title_for_local(self, toctree_data, target_local: str):
|
| 170 |
+
"""Find title for given local path in toctree"""
|
| 171 |
+
def search_item(item):
|
| 172 |
+
if isinstance(item, dict):
|
| 173 |
+
if item.get('local') == target_local:
|
| 174 |
+
return item.get('title', '')
|
| 175 |
+
|
| 176 |
+
if 'sections' in item:
|
| 177 |
+
for section in item['sections']:
|
| 178 |
+
result = search_item(section)
|
| 179 |
+
if result:
|
| 180 |
+
return result
|
| 181 |
+
return None
|
| 182 |
+
|
| 183 |
+
if isinstance(toctree_data, list):
|
| 184 |
+
for item in toctree_data:
|
| 185 |
+
result = search_item(item)
|
| 186 |
+
if result:
|
| 187 |
+
return result
|
| 188 |
+
return None
|
| 189 |
+
|
| 190 |
+
def process_pr_commit(self, filepath: str):
|
| 191 |
+
"""Process PR commit by updating Korean toctree with translated entry"""
|
| 192 |
+
# Get filepath without prefix
|
| 193 |
+
filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
|
| 194 |
+
|
| 195 |
+
# Get Korean toctree
|
| 196 |
+
ko_toctree = self.get_ko_toctree()
|
| 197 |
+
|
| 198 |
+
# Use diff-merge algorithm to add new entry
|
| 199 |
+
updated_ko_toctree = self.add_new_toctree_entry(ko_toctree, filepath_without_prefix)
|
| 200 |
+
|
| 201 |
+
if not updated_ko_toctree:
|
| 202 |
+
print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
|
| 203 |
+
return
|
| 204 |
+
|
| 205 |
+
print(f"Successfully updated Korean toctree")
|
| 206 |
+
|
| 207 |
+
# Store the updated toctree for commit
|
| 208 |
+
self.updated_ko_toctree = updated_ko_toctree
|
| 209 |
+
|
| 210 |
+
def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
|
| 211 |
+
"""Commit and push toctree updates as a separate commit"""
|
| 212 |
+
try:
|
| 213 |
+
# Use the updated toctree created by LLM
|
| 214 |
+
if not hasattr(self, 'updated_ko_toctree') or not self.updated_ko_toctree:
|
| 215 |
+
print("No updated Korean toctree available")
|
| 216 |
+
return {"status": "error", "message": "No updated toctree to commit"}
|
| 217 |
+
|
| 218 |
+
ko_data = self.updated_ko_toctree
|
| 219 |
+
|
| 220 |
+
# Convert to YAML string
|
| 221 |
+
toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
| 222 |
+
|
| 223 |
+
# Create toctree commit message
|
| 224 |
+
commit_message = "docs: update Korean documentation table of contents"
|
| 225 |
+
|
| 226 |
+
# Commit toctree file
|
| 227 |
+
file_result = pr_agent.create_or_update_file(
|
| 228 |
+
owner=owner,
|
| 229 |
+
repo_name=repo_name,
|
| 230 |
+
path="docs/source/ko/_toctree.yml",
|
| 231 |
+
message=commit_message,
|
| 232 |
+
content=toctree_content,
|
| 233 |
+
branch_name=branch_name
|
| 234 |
+
)
|
| 235 |
+
|
| 236 |
+
if file_result.startswith("SUCCESS"):
|
| 237 |
+
return {
|
| 238 |
+
"status": "success",
|
| 239 |
+
"message": f"Toctree committed successfully: {file_result}",
|
| 240 |
+
"commit_message": commit_message
|
| 241 |
+
}
|
| 242 |
+
else:
|
| 243 |
+
return {
|
| 244 |
+
"status": "error",
|
| 245 |
+
"message": f"Toctree commit failed: {file_result}"
|
| 246 |
+
}
|
| 247 |
+
|
| 248 |
+
except Exception as e:
|
| 249 |
+
return {
|
| 250 |
+
"status": "error",
|
| 251 |
+
"message": f"Error committing toctree: {str(e)}"
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
def update_toctree_after_translation(
|
| 255 |
+
self,
|
| 256 |
+
translation_result: dict,
|
| 257 |
+
filepath: str,
|
| 258 |
+
pr_agent,
|
| 259 |
+
github_config: dict,
|
| 260 |
+
project: str = "transformers"
|
| 261 |
+
) -> dict:
|
| 262 |
+
"""Update toctree after successful translation PR.
|
| 263 |
+
|
| 264 |
+
Args:
|
| 265 |
+
translation_result: Result from translation PR workflow
|
| 266 |
+
filepath: Original file path
|
| 267 |
+
pr_agent: GitHub PR agent instance
|
| 268 |
+
github_config: GitHub configuration dictionary
|
| 269 |
+
|
| 270 |
+
Returns:
|
| 271 |
+
Dictionary with toctree update result
|
| 272 |
+
"""
|
| 273 |
+
if translation_result["status"] == "error":
|
| 274 |
+
return None
|
| 275 |
+
|
| 276 |
+
try:
|
| 277 |
+
# Process toctree update with LLM
|
| 278 |
+
self.process_pr_commit(filepath)
|
| 279 |
+
# Commit toctree as separate commit
|
| 280 |
+
if self.updated_ko_toctree:
|
| 281 |
+
return self.commit_and_push_toctree(
|
| 282 |
+
pr_agent=pr_agent,
|
| 283 |
+
owner=github_config["owner"],
|
| 284 |
+
repo_name=github_config["repo_name"],
|
| 285 |
+
branch_name=translation_result["branch"]
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
except Exception as e:
|
| 289 |
+
return {
|
| 290 |
+
"status": "error",
|
| 291 |
+
"message": f"Error updating toctree: {str(e)}"
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
def add_new_toctree_entry(self, ko_toctree: list, target_local: str) -> list:
|
| 295 |
+
"""Add new toctree entry using diff-merge algorithm"""
|
| 296 |
+
try:
|
| 297 |
+
import copy
|
| 298 |
+
updated_toctree = copy.deepcopy(ko_toctree)
|
| 299 |
+
|
| 300 |
+
# Generate new entry
|
| 301 |
+
filename = target_local.split('/')[-1].replace('_', ' ').title()
|
| 302 |
+
korean_title = self.translate_title(filename)
|
| 303 |
+
new_entry = {
|
| 304 |
+
'local': target_local,
|
| 305 |
+
'title': korean_title
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
# Get English toctree for structure reference
|
| 309 |
+
en_toctree = self.get_en_toctree()
|
| 310 |
+
|
| 311 |
+
# Use diff-merge algorithm
|
| 312 |
+
if self.merge_toctree_sections(en_toctree, updated_toctree, target_local, new_entry):
|
| 313 |
+
return updated_toctree
|
| 314 |
+
else:
|
| 315 |
+
# Fallback: add to root level
|
| 316 |
+
updated_toctree.append(new_entry)
|
| 317 |
+
print(f"✅ Added new entry at root level: {target_local} -> {korean_title}")
|
| 318 |
+
return updated_toctree
|
| 319 |
+
|
| 320 |
+
except Exception as e:
|
| 321 |
+
print(f"❌ Error adding new toctree entry: {e}")
|
| 322 |
+
return ko_toctree
|
| 323 |
+
|
| 324 |
+
def merge_toctree_sections(self, en_toctree: list, ko_toctree: list, target_local: str, new_entry: dict) -> bool:
|
| 325 |
+
"""Merge English toctree structure into Korean toctree for target_local"""
|
| 326 |
+
for en_section in en_toctree:
|
| 327 |
+
en_title = en_section.get('title')
|
| 328 |
+
|
| 329 |
+
# Check if this English section contains our target
|
| 330 |
+
if self.contains_target(en_section, target_local):
|
| 331 |
+
# Find matching Korean section
|
| 332 |
+
ko_section = self.find_matching_section(ko_toctree, en_title)
|
| 333 |
+
|
| 334 |
+
if ko_section:
|
| 335 |
+
# Section exists - merge subsections
|
| 336 |
+
return self.merge_subsections(en_section, ko_section, target_local, new_entry)
|
| 337 |
+
else:
|
| 338 |
+
# Section doesn't exist - create new section
|
| 339 |
+
new_ko_section = self.create_section_with_order(en_section, target_local, new_entry)
|
| 340 |
+
ko_toctree.append(new_ko_section)
|
| 341 |
+
print(f"✅ Created new section '{new_ko_section.get('title')}' with ordered structure")
|
| 342 |
+
return True
|
| 343 |
+
return False
|
| 344 |
+
|
| 345 |
+
def contains_target(self, section: dict, target_local: str) -> bool:
|
| 346 |
+
"""Check if section contains target_local recursively"""
|
| 347 |
+
if 'sections' in section:
|
| 348 |
+
for subsection in section['sections']:
|
| 349 |
+
if subsection.get('local') == target_local:
|
| 350 |
+
return True
|
| 351 |
+
if self.contains_target(subsection, target_local):
|
| 352 |
+
return True
|
| 353 |
+
return False
|
| 354 |
+
|
| 355 |
+
def find_matching_section(self, ko_toctree: list, en_title: str) -> dict:
|
| 356 |
+
"""Find Korean section that matches English title"""
|
| 357 |
+
# Try exact match first
|
| 358 |
+
for item in ko_toctree:
|
| 359 |
+
if item.get('title') == en_title:
|
| 360 |
+
return item
|
| 361 |
+
|
| 362 |
+
# Try translated title match
|
| 363 |
+
try:
|
| 364 |
+
translated_title = self.translate_title(en_title)
|
| 365 |
+
for item in ko_toctree:
|
| 366 |
+
if item.get('title') == translated_title:
|
| 367 |
+
return item
|
| 368 |
+
except:
|
| 369 |
+
pass
|
| 370 |
+
|
| 371 |
+
return None
|
| 372 |
+
|
| 373 |
+
def merge_subsections(self, en_section: dict, ko_section: dict, target_local: str, new_entry: dict) -> bool:
|
| 374 |
+
"""Merge subsections while maintaining order"""
|
| 375 |
+
if 'sections' not in en_section:
|
| 376 |
+
return False
|
| 377 |
+
|
| 378 |
+
# Find target index in English sections
|
| 379 |
+
target_index = -1
|
| 380 |
+
for i, en_subsection in enumerate(en_section['sections']):
|
| 381 |
+
if en_subsection.get('local') == target_local:
|
| 382 |
+
target_index = i
|
| 383 |
+
break
|
| 384 |
+
|
| 385 |
+
if target_index == -1:
|
| 386 |
+
return False
|
| 387 |
+
|
| 388 |
+
# Ensure Korean section has sections array
|
| 389 |
+
if 'sections' not in ko_section:
|
| 390 |
+
ko_section['sections'] = []
|
| 391 |
+
|
| 392 |
+
# Insert at correct position
|
| 393 |
+
self.insert_at_correct_position(ko_section, target_index, new_entry)
|
| 394 |
+
return True
|
| 395 |
+
|
| 396 |
+
def insert_at_correct_position(self, ko_section: dict, target_index: int, new_entry: dict):
|
| 397 |
+
"""Insert entry at correct position, expanding array if needed"""
|
| 398 |
+
sections = ko_section['sections']
|
| 399 |
+
|
| 400 |
+
# Expand sections array if needed
|
| 401 |
+
while len(sections) <= target_index:
|
| 402 |
+
sections.append(None) # Placeholder
|
| 403 |
+
|
| 404 |
+
# Insert new entry
|
| 405 |
+
sections[target_index] = new_entry
|
| 406 |
+
|
| 407 |
+
# Clean up None placeholders at the end
|
| 408 |
+
while sections and sections[-1] is None:
|
| 409 |
+
sections.pop()
|
| 410 |
+
|
| 411 |
+
def create_section_with_order(self, en_section: dict, target_local: str, new_entry: dict) -> dict:
|
| 412 |
+
"""Create new Korean section with only the translated entry"""
|
| 413 |
+
new_ko_section = {
|
| 414 |
+
'title': self.translate_title(en_section.get('title')),
|
| 415 |
+
'isExpanded': en_section.get('isExpanded', False),
|
| 416 |
+
'sections': [new_entry] # Only add the translated entry
|
| 417 |
+
}
|
| 418 |
+
|
| 419 |
+
return new_ko_section
|
agent/workflow.py
CHANGED
|
@@ -1,338 +1,338 @@
|
|
| 1 |
-
"""Module for gradio interfaces."""
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
import gradio as gr
|
| 6 |
-
|
| 7 |
-
from translator.content import (
|
| 8 |
-
fill_scaffold,
|
| 9 |
-
get_content,
|
| 10 |
-
get_full_prompt,
|
| 11 |
-
llm_translate,
|
| 12 |
-
preprocess_content,
|
| 13 |
-
)
|
| 14 |
-
from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
|
| 15 |
-
# GitHub PR Agent import
|
| 16 |
-
try:
|
| 17 |
-
from pr_generator.agent import GitHubPRAgent
|
| 18 |
-
|
| 19 |
-
GITHUB_PR_AVAILABLE = True
|
| 20 |
-
except ImportError as e:
|
| 21 |
-
print(f"⚠️ GitHub PR Agent is not available: {e}")
|
| 22 |
-
GITHUB_PR_AVAILABLE = False
|
| 23 |
-
|
| 24 |
-
import json
|
| 25 |
-
from logger.github_logger import GitHubLogger
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def report_translation_target_files(
|
| 29 |
-
project: str, translate_lang: str, top_k: int = 1
|
| 30 |
-
) -> tuple[str, list[list[str]]]:
|
| 31 |
-
"""Return the top-k files that need translation, excluding files already in progress.
|
| 32 |
-
|
| 33 |
-
Args:
|
| 34 |
-
project: Project to translate (e.g., "transformers", "smolagents")
|
| 35 |
-
translate_lang: Target language to translate
|
| 36 |
-
top_k: Number of top-first files to return for translation. (Default 1)
|
| 37 |
-
"""
|
| 38 |
-
# Get repo files once to avoid duplicate API calls
|
| 39 |
-
all_repo_files = get_github_repo_files(project)
|
| 40 |
-
|
| 41 |
-
# Get all available files for translation using the file list
|
| 42 |
-
all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2, all_repo_files) # Get more to account for filtering
|
| 43 |
-
|
| 44 |
-
# Get files in progress using the same file list
|
| 45 |
-
docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang, all_repo_files)
|
| 46 |
-
|
| 47 |
-
# Filter out files that are already in progress
|
| 48 |
-
available_files = [f for f in all_filepath_list if f not in docs_in_progress]
|
| 49 |
-
|
| 50 |
-
# Take only the requested number
|
| 51 |
-
filepath_list = available_files[:top_k]
|
| 52 |
-
|
| 53 |
-
# Build combined status report
|
| 54 |
-
status_report = all_status_report
|
| 55 |
-
|
| 56 |
-
if docs_in_progress:
|
| 57 |
-
status_report += f"\n\n🤖 Found {len(docs_in_progress)} files in progress for translation:"
|
| 58 |
-
for i, file in enumerate(docs_in_progress):
|
| 59 |
-
status_report += f"\n{i+1}. [`{file}`]({pr_info_list[i]})"
|
| 60 |
-
status_report += f"\n\n📋 Showing {len(filepath_list)} available files (excluding in-progress):"
|
| 61 |
-
|
| 62 |
-
return status_report, [[file] for file in filepath_list]
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def translate_docs(lang: str, file_path: str, additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False) -> tuple[str, str]:
    """Translate one documentation file, reusing a saved translation when available.

    Args:
        lang: Target language code (e.g., "ko").
        file_path: Path of the source document; also used as the relative path
            under ``translation_result/`` when looking for a saved translation.
        additional_instruction: Extra instruction text forwarded to the prompt builder.
        project: Project key forwarded to ``get_content``.
        force_retranslate: When True, ignore any saved translation and call the LLM.

    Returns:
        ``(callback_result, translated_doc)``. When a non-empty saved
        translation is reused, ``callback_result`` is a status message that
        contains "Existing translation loaded"; otherwise it is whatever
        ``llm_translate`` returns as its first element.
    """
    # Check if translation already exists (unless force retranslate is enabled)
    translation_file_path = (
        Path(__file__).resolve().parent.parent
        / f"translation_result/{file_path}"
    )

    if not force_retranslate and translation_file_path.exists():
        print(f"📄 Found existing translation: {translation_file_path}")
        with open(translation_file_path, "r", encoding="utf-8") as f:
            existing_content = f.read()
        # An empty or whitespace-only file is treated as "no translation yet".
        if existing_content.strip():
            existing_msg = f"♻️ **Existing translation loaded** (no tokens used)\n📁 **File:** `{file_path}`\n📅 **Loaded from:** `{translation_file_path}`\n💡 **To retranslate:** Check 'Force Retranslate' option."
            return existing_msg, existing_content

    # step 1. Get content from file path
    content = get_content(file_path, project)
    to_translate = preprocess_content(content)

    # step 2. Prepare prompt with docs content
    # Only "ko" has a display name here. Any other value is passed through
    # unchanged so the call no longer fails with UnboundLocalError.
    language_names = {"ko": "Korean"}
    translation_lang = language_names.get(lang, lang)
    to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)

    print("to_translate_with_prompt:\n", to_translate_with_prompt)

    # step 3. Translate with LLM
    # TODO: hand this step over to the MCP client
    callback_result, translated_content = llm_translate(to_translate_with_prompt)
    print("translated_content:\n")
    print(translated_content)

    # Unwrap the answer when the model returned it inside a ```md fence.
    fence_open, fence_close = "```md\n", "```"
    if translated_content.startswith(fence_open) and translated_content.endswith(fence_close):
        print("Satisfied translated_content.startswith ``` md")
        translated_content = translated_content[len(fence_open):-len(fence_close)].strip()

    # step 4. Add scaffold to translation result
    translated_doc = fill_scaffold(content, to_translate, translated_content)
    print("translated_doc:\n")
    print(translated_doc)
    return callback_result, translated_doc
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
def translate_docs_interactive(
    translate_lang: str, selected_files: list[list[str]], additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False
) -> tuple[str, str]:
    """Translate the first selected file and build a status message for the UI.

    Only the first usable row of ``selected_files`` is translated per call.

    Args:
        translate_lang: Target language to translate
        selected_files: Rows in dataframe format; the first cell of each row is a file path
        additional_instruction: Extra instruction text forwarded to ``translate_docs``
        project: Project key forwarded to ``translate_docs``
        force_retranslate: When True, ignore any saved translation

    Returns:
        ``(status, translated_content)``. If no file path is selected, an error
        status and an empty string are returned and nothing is translated.
    """
    # Extract file paths from the dataframe format, skipping empty rows/cells
    rows = selected_files if selected_files is not None else []
    file_paths = [row[0] for row in rows if row and row[0]]

    if not file_paths:
        # Report the empty selection instead of failing with IndexError below.
        return "❌ No file selected for translation. Please select a file first.", ""

    # Start with the first file
    current_file = file_paths[0]

    callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction, project, force_retranslate)

    # Check if existing translation was loaded
    if isinstance(callback_result, str) and "Existing translation loaded" in callback_result:
        status = callback_result  # Use the existing translation message
    else:
        if force_retranslate:
            status = f"🔄 **Force Retranslation completed**: `{current_file}` → `{translate_lang}`\n\n"
        else:
            status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
        status += f"💰 Used token and cost: \n```\n{callback_result}\n```"

    print(callback_result)
    print(status)

    return status, translated_content
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
def generate_github_pr(
    target_language: str,
    filepath: str,
    translated_content: str = None,
    github_config: dict = None,
    en_title: str = None,
    project: str = "transformers",
) -> str:
    """Generate a GitHub PR for translated documentation.

    Args:
        target_language: Target language for translation (e.g., "ko")
        filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
        translated_content: Translated content (if None, read from file)
        github_config: GitHub configuration dictionary; "token", "owner",
            "repo_name" and "reference_pr_url" are required, "base_branch"
            is optional and defaults to "main"
        en_title: English title for toctree mapping; the toctree update is
            only attempted when this is set
        project: Project key used to look up the base repository

    Returns:
        PR creation result message. Errors raised inside the workflow are
        caught and reported in the returned text.
    """
    if not GITHUB_PR_AVAILABLE:
        return "❌ GitHub PR Agent is not available. Please install required libraries."

    if not github_config:
        return "❌ GitHub configuration not provided. Please set up GitHub token, owner, and repository in Configuration panel."

    # Validate required configuration
    required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
    missing_fields = [
        field for field in required_fields if not github_config.get(field)
    ]

    if missing_fields:
        return f"❌ Missing required GitHub configuration: {', '.join(missing_fields)}\n\n💡 Go to Configuration panel and set:\n" + "\n".join([f" • {field}" for field in missing_fields])

    # Set token in environment for the agent.
    os.environ["GITHUB_TOKEN"] = github_config["token"]

    # Placeholders shown in the error report when the failure happens before
    # the base repository has been resolved.
    base_owner, base_repo = "BASE", "REPO"

    try:
        # Read translated content from file if not provided
        if translated_content is None:
            translation_file_path = (
                Path(__file__).resolve().parent.parent
                / f"translation_result/{filepath}"
            )
            if not translation_file_path.exists():
                return f"❌ Translation file not found: {translation_file_path}\n\n💡 Please complete translation first in Tab 2 for file: {filepath}"

            with open(translation_file_path, "r", encoding="utf-8") as f:
                translated_content = f.read()

        if not translated_content or not translated_content.strip():
            return f"❌ Translated content is empty for file: {filepath}\n\n💡 Please complete translation first in Tab 2."

        # Execute GitHub PR Agent
        # Get base repository from project config
        from translator.project_config import get_project_config
        project_config = get_project_config(project)
        # Tolerate a trailing slash, a ".git" suffix, or extra path segments
        # in the configured URL; only "<owner>/<repo>" is needed.
        base_repo_path = project_config.repo_url.replace("https://github.com/", "").strip("/")
        if base_repo_path.endswith(".git"):
            base_repo_path = base_repo_path[: -len(".git")]
        base_owner, base_repo = base_repo_path.split("/")[:2]

        print(f"🚀 Starting GitHub PR creation...")
        print(f" 📁 File: {filepath}")
        print(f" 🌍 Language: {target_language}")
        print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
        print(f" 🏠 User Fork: {github_config['owner']}/{github_config['repo_name']}")
        print(f" 🎯 Base Repository: {base_owner}/{base_repo}")

        agent = GitHubPRAgent(
            user_owner=github_config["owner"],
            user_repo=github_config["repo_name"],
            base_owner=base_owner,
            base_repo=base_repo,
        )
        result = agent.run_translation_pr_workflow(
            reference_pr_url=github_config["reference_pr_url"],
            target_language=target_language,
            filepath=filepath,
            translated_doc=translated_content,
            base_branch=github_config.get("base_branch", "main"),
        )

        # Process toctree update after the translation PR workflow
        toctree_result = None
        if en_title:
            from agent.toctree_handler import TocTreeHandler
            toctree_handler = TocTreeHandler(project)
            toctree_result = toctree_handler.update_toctree_after_translation(
                result, filepath, agent, github_config, project
            )

        # Process result
        # Generate toctree status message (shared for both success and partial_success)
        toctree_status = ""
        if toctree_result:
            if toctree_result["status"] == "success":
                toctree_status = f"\n📋 **Toctree Updated:** ✅ {toctree_result['message']}"
            else:
                toctree_status = f"\n📋 **Toctree Update Failed:** ❌ {toctree_result['message']}"

        # Append full result JSON to dedicated GitHub logging repository (always).
        # Logging is best-effort: a failure here must not hide the PR result.
        try:
            log_data = result.copy()
            if toctree_result:
                log_data["toctree_result"] = toctree_result
            log_entry = json.dumps(log_data, ensure_ascii=False) + "\n"
            log_res = GitHubLogger().append_jsonl(log_entry)
            print(f"📝 Log append result: {log_res}")
        except Exception as e:
            print(f"❌ Failed to append PR log via GitHub API: {e}")

        if result["status"] == "success":
            return f"""✅ **GitHub PR Creation Successful!**

🔗 **PR URL:** {result.get('pr_url', 'NO_PR_URL')}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}{toctree_status}

{result["message"]}"""

        elif result["status"] == "partial_success":
            # "or" also covers an explicit None value, which would otherwise
            # make the substring test below raise TypeError.
            error_details = result.get("error_details") or "Unknown error"

            # Check if it's "existing PR" case (not really an error)
            if "Existing PR found" in error_details:
                existing_pr_url = error_details.split(": ")[-1] if ": " in error_details else "Unknown"
                return f"""🔄 **Translation Updated Successfully**

🎯 **Selected Project:** {project}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}{toctree_status}

🔗 **Existing PR Updated:** {existing_pr_url}

✅ Your translation has been added to the existing PR. The file and toctree have been successfully updated!"""
            else:
                # Actual error case
                return f"""⚠️ **Partial Success**

🎯 **Selected Project:** {project}
🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
🎯 **Target Base:** {base_owner}/{base_repo}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}{toctree_status}

{result["message"]}

**Error Details:**
{error_details}

💡 **Project-Repository Mismatch Check:**
- Selected project '{project}' should match repository '{github_config.get('repo_name', 'REPO')}'
- For smolagents: use Jwaminju/smolagents fork
- For transformers: use Jwaminju/transformers fork"""

        else:
            error_details = result.get("error_details", "No additional details")
            return f"""❌ **GitHub PR Creation Failed**

🎯 **Selected Project:** {project}
🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
🎯 **Target Base:** {base_owner}/{base_repo}

**Error Message:**
{result["message"]}

**Error Details:**
{error_details}

💡 **Project-Repository Mismatch:**
Selected project '{project}' but configured repository '{github_config.get('repo_name', 'REPO')}'
• For smolagents project: use 'smolagents' repository
• For transformers project: use 'transformers' repository"""

    except Exception as e:
        error_msg = f"""❌ **Unexpected Error During PR Creation**

**Error:** {str(e)}

**Configuration:**
• Project: {project}
• File: {filepath}
• Target: {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')} → {base_owner}/{base_repo}"""
        print(error_msg)
        return error_msg
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
# Backward compatibility function (replaces old mock function)
|
| 334 |
-
def mock_generate_PR():
    """Deprecated shim kept for old callers; it only returns a deprecation notice."""
    deprecation_notice = "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
    return deprecation_notice
|
|
|
|
| 1 |
+
"""Module for gradio interfaces."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
from translator.content import (
|
| 8 |
+
fill_scaffold,
|
| 9 |
+
get_content,
|
| 10 |
+
get_full_prompt,
|
| 11 |
+
llm_translate,
|
| 12 |
+
preprocess_content,
|
| 13 |
+
)
|
| 14 |
+
from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
|
| 15 |
+
# GitHub PR Agent import
|
| 16 |
+
try:
|
| 17 |
+
from pr_generator.agent import GitHubPRAgent
|
| 18 |
+
|
| 19 |
+
GITHUB_PR_AVAILABLE = True
|
| 20 |
+
except ImportError as e:
|
| 21 |
+
print(f"⚠️ GitHub PR Agent is not available: {e}")
|
| 22 |
+
GITHUB_PR_AVAILABLE = False
|
| 23 |
+
|
| 24 |
+
import json
|
| 25 |
+
from logger.github_logger import GitHubLogger
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def report_translation_target_files(
    project: str, translate_lang: str, top_k: int = 1
) -> tuple[str, list[list[str]]]:
    """Return the top-k files that need translation, excluding files already in progress.

    Args:
        project: Project to translate (e.g., "transformers", "smolagents")
        translate_lang: Target language to translate
        top_k: Number of top-first files to return for translation. (Default 1)

    Returns:
        A ``(status_report, rows)`` tuple. ``status_report`` is the report text
        from ``report`` plus, when any files are in progress, a numbered list
        of those files. ``rows`` holds the selected file paths, one per
        single-element list (``[[path], ...]``).
    """
    # Get repo files once to avoid duplicate API calls
    all_repo_files = get_github_repo_files(project)

    # Ask for twice as many candidates as requested, because some of them are
    # dropped below when they turn out to be in progress already.
    all_status_report, all_filepath_list = report(
        project, translate_lang, top_k * 2, all_repo_files
    )

    # Get files in progress using the same file list
    docs_in_progress, pr_info_list = get_github_issue_open_pr(
        project, translate_lang, all_repo_files
    )

    # Filter out files that are already in progress, then keep the requested number
    available_files = [f for f in all_filepath_list if f not in docs_in_progress]
    filepath_list = available_files[:top_k]

    # Build combined status report
    status_report = all_status_report

    if docs_in_progress:
        status_report += f"\n\n🤖 Found {len(docs_in_progress)} files in progress for translation:"
        for i, file in enumerate(docs_in_progress):
            if i < len(pr_info_list):
                status_report += f"\n{i+1}. [`{file}`]({pr_info_list[i]})"
            else:
                # The two lists are expected to be parallel; if PR info is
                # missing for an entry, list the file without a link instead
                # of failing with IndexError.
                status_report += f"\n{i+1}. `{file}`"
        status_report += f"\n\n📋 Showing {len(filepath_list)} available files (excluding in-progress):"

    return status_report, [[file] for file in filepath_list]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def translate_docs(lang: str, file_path: str, additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False) -> tuple[str, str]:
    """Translate one documentation file, reusing a saved translation when available.

    Args:
        lang: Target language code (e.g., "ko").
        file_path: Path of the source document; also used as the relative path
            under ``translation_result/`` when looking for a saved translation.
        additional_instruction: Extra instruction text forwarded to the prompt builder.
        project: Project key forwarded to ``get_content``.
        force_retranslate: When True, ignore any saved translation and call the LLM.

    Returns:
        ``(callback_result, translated_doc)``. When a non-empty saved
        translation is reused, ``callback_result`` is a status message that
        contains "Existing translation loaded"; otherwise it is whatever
        ``llm_translate`` returns as its first element.
    """
    # Check if translation already exists (unless force retranslate is enabled)
    translation_file_path = (
        Path(__file__).resolve().parent.parent
        / f"translation_result/{file_path}"
    )

    if not force_retranslate and translation_file_path.exists():
        print(f"📄 Found existing translation: {translation_file_path}")
        with open(translation_file_path, "r", encoding="utf-8") as f:
            existing_content = f.read()
        # An empty or whitespace-only file is treated as "no translation yet".
        if existing_content.strip():
            existing_msg = f"♻️ **Existing translation loaded** (no tokens used)\n📁 **File:** `{file_path}`\n📅 **Loaded from:** `{translation_file_path}`\n💡 **To retranslate:** Check 'Force Retranslate' option."
            return existing_msg, existing_content

    # step 1. Get content from file path
    content = get_content(file_path, project)
    to_translate = preprocess_content(content)

    # step 2. Prepare prompt with docs content
    # Only "ko" has a display name here. Any other value is passed through
    # unchanged so the call no longer fails with UnboundLocalError.
    language_names = {"ko": "Korean"}
    translation_lang = language_names.get(lang, lang)
    to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)

    print("to_translate_with_prompt:\n", to_translate_with_prompt)

    # step 3. Translate with LLM
    # TODO: hand this step over to the MCP client
    callback_result, translated_content = llm_translate(to_translate_with_prompt)
    print("translated_content:\n")
    print(translated_content)

    # Unwrap the answer when the model returned it inside a ```md fence.
    fence_open, fence_close = "```md\n", "```"
    if translated_content.startswith(fence_open) and translated_content.endswith(fence_close):
        print("Satisfied translated_content.startswith ``` md")
        translated_content = translated_content[len(fence_open):-len(fence_close)].strip()

    # step 4. Add scaffold to translation result
    translated_doc = fill_scaffold(content, to_translate, translated_content)
    print("translated_doc:\n")
    print(translated_doc)
    return callback_result, translated_doc
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def translate_docs_interactive(
    translate_lang: str, selected_files: list[list[str]], additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False
) -> tuple[str, str]:
    """Translate the first selected file and build a status message for the UI.

    Only the first usable row of ``selected_files`` is translated per call.

    Args:
        translate_lang: Target language to translate
        selected_files: Rows in dataframe format; the first cell of each row is a file path
        additional_instruction: Extra instruction text forwarded to ``translate_docs``
        project: Project key forwarded to ``translate_docs``
        force_retranslate: When True, ignore any saved translation

    Returns:
        ``(status, translated_content)``. If no file path is selected, an error
        status and an empty string are returned and nothing is translated.
    """
    # Extract file paths from the dataframe format, skipping empty rows/cells
    rows = selected_files if selected_files is not None else []
    file_paths = [row[0] for row in rows if row and row[0]]

    if not file_paths:
        # Report the empty selection instead of failing with IndexError below.
        return "❌ No file selected for translation. Please select a file first.", ""

    # Start with the first file
    current_file = file_paths[0]

    callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction, project, force_retranslate)

    # Check if existing translation was loaded
    if isinstance(callback_result, str) and "Existing translation loaded" in callback_result:
        status = callback_result  # Use the existing translation message
    else:
        if force_retranslate:
            status = f"🔄 **Force Retranslation completed**: `{current_file}` → `{translate_lang}`\n\n"
        else:
            status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
        status += f"💰 Used token and cost: \n```\n{callback_result}\n```"

    print(callback_result)
    print(status)

    return status, translated_content
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def generate_github_pr(
    target_language: str,
    filepath: str,
    translated_content: str = None,
    github_config: dict = None,
    en_title: str = None,
    project: str = "transformers",
) -> str:
    """Generate a GitHub PR for translated documentation.

    Args:
        target_language: Target language for translation (e.g., "ko")
        filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
        translated_content: Translated content (if None, read from file)
        github_config: GitHub configuration dictionary; "token", "owner",
            "repo_name" and "reference_pr_url" are required, "base_branch"
            is optional and defaults to "main"
        en_title: English title for toctree mapping; the toctree update is
            only attempted when this is set
        project: Project key used to look up the base repository

    Returns:
        PR creation result message. Errors raised inside the workflow are
        caught and reported in the returned text.
    """
    if not GITHUB_PR_AVAILABLE:
        return "❌ GitHub PR Agent is not available. Please install required libraries."

    if not github_config:
        return "❌ GitHub configuration not provided. Please set up GitHub token, owner, and repository in Configuration panel."

    # Validate required configuration
    required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
    missing_fields = [
        field for field in required_fields if not github_config.get(field)
    ]

    if missing_fields:
        return f"❌ Missing required GitHub configuration: {', '.join(missing_fields)}\n\n💡 Go to Configuration panel and set:\n" + "\n".join([f" • {field}" for field in missing_fields])

    # Set token in environment for the agent.
    os.environ["GITHUB_TOKEN"] = github_config["token"]

    # Placeholders shown in the error report when the failure happens before
    # the base repository has been resolved.
    base_owner, base_repo = "BASE", "REPO"

    try:
        # Read translated content from file if not provided
        if translated_content is None:
            translation_file_path = (
                Path(__file__).resolve().parent.parent
                / f"translation_result/{filepath}"
            )
            if not translation_file_path.exists():
                return f"❌ Translation file not found: {translation_file_path}\n\n💡 Please complete translation first in Tab 2 for file: {filepath}"

            with open(translation_file_path, "r", encoding="utf-8") as f:
                translated_content = f.read()

        if not translated_content or not translated_content.strip():
            return f"❌ Translated content is empty for file: {filepath}\n\n💡 Please complete translation first in Tab 2."

        # Execute GitHub PR Agent
        # Get base repository from project config
        from translator.project_config import get_project_config
        project_config = get_project_config(project)
        # Tolerate a trailing slash, a ".git" suffix, or extra path segments
        # in the configured URL; only "<owner>/<repo>" is needed.
        base_repo_path = project_config.repo_url.replace("https://github.com/", "").strip("/")
        if base_repo_path.endswith(".git"):
            base_repo_path = base_repo_path[: -len(".git")]
        base_owner, base_repo = base_repo_path.split("/")[:2]

        print(f"🚀 Starting GitHub PR creation...")
        print(f" 📁 File: {filepath}")
        print(f" 🌍 Language: {target_language}")
        print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
        print(f" 🏠 User Fork: {github_config['owner']}/{github_config['repo_name']}")
        print(f" 🎯 Base Repository: {base_owner}/{base_repo}")

        agent = GitHubPRAgent(
            user_owner=github_config["owner"],
            user_repo=github_config["repo_name"],
            base_owner=base_owner,
            base_repo=base_repo,
        )
        result = agent.run_translation_pr_workflow(
            reference_pr_url=github_config["reference_pr_url"],
            target_language=target_language,
            filepath=filepath,
            translated_doc=translated_content,
            base_branch=github_config.get("base_branch", "main"),
        )

        # Process toctree update after the translation PR workflow
        toctree_result = None
        if en_title:
            from agent.toctree_handler import TocTreeHandler
            toctree_handler = TocTreeHandler(project)
            toctree_result = toctree_handler.update_toctree_after_translation(
                result, filepath, agent, github_config, project
            )

        # Process result
        # Generate toctree status message (shared for both success and partial_success)
        toctree_status = ""
        if toctree_result:
            if toctree_result["status"] == "success":
                toctree_status = f"\n📋 **Toctree Updated:** ✅ {toctree_result['message']}"
            else:
                toctree_status = f"\n📋 **Toctree Update Failed:** ❌ {toctree_result['message']}"

        # Append full result JSON to dedicated GitHub logging repository (always).
        # Logging is best-effort: a failure here must not hide the PR result.
        try:
            log_data = result.copy()
            if toctree_result:
                log_data["toctree_result"] = toctree_result
            log_entry = json.dumps(log_data, ensure_ascii=False) + "\n"
            log_res = GitHubLogger().append_jsonl(log_entry)
            print(f"📝 Log append result: {log_res}")
        except Exception as e:
            print(f"❌ Failed to append PR log via GitHub API: {e}")

        if result["status"] == "success":
            return f"""✅ **GitHub PR Creation Successful!**

🔗 **PR URL:** {result.get('pr_url', 'NO_PR_URL')}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}{toctree_status}

{result["message"]}"""

        elif result["status"] == "partial_success":
            # "or" also covers an explicit None value, which would otherwise
            # make the substring test below raise TypeError.
            error_details = result.get("error_details") or "Unknown error"

            # Check if it's "existing PR" case (not really an error)
            if "Existing PR found" in error_details:
                existing_pr_url = error_details.split(": ")[-1] if ": " in error_details else "Unknown"
                return f"""🔄 **Translation Updated Successfully**

🎯 **Selected Project:** {project}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}{toctree_status}

🔗 **Existing PR Updated:** {existing_pr_url}

✅ Your translation has been added to the existing PR. The file and toctree have been successfully updated!"""
            else:
                # Actual error case
                return f"""⚠️ **Partial Success**

🎯 **Selected Project:** {project}
🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
🎯 **Target Base:** {base_owner}/{base_repo}
🌿 **Branch:** {result["branch"]}
📁 **File:** {result["file_path"]}{toctree_status}

{result["message"]}

**Error Details:**
{error_details}

💡 **Project-Repository Mismatch Check:**
- Selected project '{project}' should match repository '{github_config.get('repo_name', 'REPO')}'
- For smolagents: use Jwaminju/smolagents fork
- For transformers: use Jwaminju/transformers fork"""

        else:
            error_details = result.get("error_details", "No additional details")
            return f"""❌ **GitHub PR Creation Failed**

🎯 **Selected Project:** {project}
🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
🎯 **Target Base:** {base_owner}/{base_repo}

**Error Message:**
{result["message"]}

**Error Details:**
{error_details}

💡 **Project-Repository Mismatch:**
Selected project '{project}' but configured repository '{github_config.get('repo_name', 'REPO')}'
• For smolagents project: use 'smolagents' repository
• For transformers project: use 'transformers' repository"""

    except Exception as e:
        error_msg = f"""❌ **Unexpected Error During PR Creation**

**Error:** {str(e)}

**Configuration:**
• Project: {project}
• File: {filepath}
• Target: {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')} → {base_owner}/{base_repo}"""
        print(error_msg)
        return error_msg
|
| 331 |
+
|
| 332 |
+
|
| 333 |
+
# Backward compatibility function (replaces old mock function)
|
| 334 |
+
def mock_generate_PR():
    """Deprecated shim kept for old callers; it only returns a deprecation notice."""
    deprecation_notice = "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
    return deprecation_notice
|
app.py
CHANGED
|
@@ -1,379 +1,379 @@
|
|
| 1 |
-
"""Module for gradio chat-based translation agent interface."""
|
| 2 |
-
|
| 3 |
-
import base64
|
| 4 |
-
import os
|
| 5 |
-
|
| 6 |
-
import gradio as gr
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
|
| 9 |
-
from agent.handler import (
|
| 10 |
-
approve_handler,
|
| 11 |
-
confirm_and_go_translate_handler,
|
| 12 |
-
confirm_translation_and_go_upload_handler,
|
| 13 |
-
get_welcome_message,
|
| 14 |
-
process_file_search_handler,
|
| 15 |
-
restart_handler,
|
| 16 |
-
send_message,
|
| 17 |
-
start_translate_handler,
|
| 18 |
-
sync_language_displays,
|
| 19 |
-
update_language_selection,
|
| 20 |
-
update_project_selection,
|
| 21 |
-
update_prompt_preview,
|
| 22 |
-
update_status,
|
| 23 |
-
update_github_config,
|
| 24 |
-
update_persistent_config,
|
| 25 |
-
)
|
| 26 |
-
from translator.model import Languages
|
| 27 |
-
from translator.project_config import get_available_projects
|
| 28 |
-
|
| 29 |
-
load_dotenv()
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
css = """
|
| 33 |
-
.gradio-container {
|
| 34 |
-
background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
|
| 35 |
-
}
|
| 36 |
-
.chat-container {
|
| 37 |
-
background: rgba(255, 255, 180, 0.25);
|
| 38 |
-
border-radius: 18px;
|
| 39 |
-
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 40 |
-
padding: 1.0em;
|
| 41 |
-
backdrop-filter: blur(8px);
|
| 42 |
-
border: 1px solid rgba(255,255,180,0.25);
|
| 43 |
-
width: 100%;
|
| 44 |
-
height: 100%;
|
| 45 |
-
}
|
| 46 |
-
.control-panel {
|
| 47 |
-
background: rgba(255, 255, 180, 0.25);
|
| 48 |
-
border-radius: 18px;
|
| 49 |
-
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 50 |
-
padding: 1.0em;
|
| 51 |
-
backdrop-filter: blur(8px);
|
| 52 |
-
border: 1px solid rgba(255,255,180,0.25);
|
| 53 |
-
width: 100%;
|
| 54 |
-
overflow: visible !important;
|
| 55 |
-
|
| 56 |
-
}
|
| 57 |
-
.status-card {
|
| 58 |
-
width: 100%
|
| 59 |
-
}
|
| 60 |
-
.action-button {
|
| 61 |
-
background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
|
| 62 |
-
color: white !important;
|
| 63 |
-
border: none !important;
|
| 64 |
-
font-weight: 600 !important;
|
| 65 |
-
box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
|
| 66 |
-
transition: all 0.3s ease-in-out !important;
|
| 67 |
-
}
|
| 68 |
-
.action-button:hover {
|
| 69 |
-
background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
|
| 70 |
-
box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
|
| 71 |
-
transform: translateY(-2px) !important;
|
| 72 |
-
}
|
| 73 |
-
|
| 74 |
-
.simple-tabs .tab-nav button {
|
| 75 |
-
background: transparent !important;
|
| 76 |
-
color: #4A5568 !important;
|
| 77 |
-
box-shadow: none !important;
|
| 78 |
-
transform: none !important;
|
| 79 |
-
border: none !important;
|
| 80 |
-
border-bottom: 2px solid #E2E8F0 !important;
|
| 81 |
-
border-radius: 0 !important;
|
| 82 |
-
font-weight: 600 !important;
|
| 83 |
-
}
|
| 84 |
-
|
| 85 |
-
.simple-tabs .tab-nav button.selected {
|
| 86 |
-
color: #f97316 !important;
|
| 87 |
-
border-bottom: 2px solid #f97316 !important;
|
| 88 |
-
}
|
| 89 |
-
|
| 90 |
-
.simple-tabs .tab-nav button:hover {
|
| 91 |
-
background: #f3f4f6 !important;
|
| 92 |
-
color: #f97316 !important;
|
| 93 |
-
box-shadow: none !important;
|
| 94 |
-
transform: none !important;
|
| 95 |
-
}
|
| 96 |
-
"""
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
# Create the main interface
|
| 100 |
-
with gr.Blocks(
|
| 101 |
-
css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
|
| 102 |
-
) as demo:
|
| 103 |
-
# Title
|
| 104 |
-
with open("images/hfkr_logo.png", "rb") as img_file:
|
| 105 |
-
base64_img = base64.b64encode(img_file.read()).decode()
|
| 106 |
-
gr.Markdown(
|
| 107 |
-
f'<img src="data:image/png;base64,{base64_img}" style="display: block; margin-left: auto; margin-right: auto; height: 15em;"/>'
|
| 108 |
-
)
|
| 109 |
-
gr.Markdown(
|
| 110 |
-
'<h1 style="text-align: center;"> 🌐 Hugging Face Transformers Docs i18n made easy</h1>'
|
| 111 |
-
)
|
| 112 |
-
|
| 113 |
-
# Content
|
| 114 |
-
with gr.Row():
|
| 115 |
-
# Chat interface
|
| 116 |
-
with gr.Column(scale=3, elem_classes=["chat-container"]):
|
| 117 |
-
gr.Markdown("### 🌐 Hugging Face i18n Agent")
|
| 118 |
-
|
| 119 |
-
chatbot = gr.Chatbot(
|
| 120 |
-
value=[[None, get_welcome_message()]], scale=1, height=525,
|
| 121 |
-
show_copy_button=True
|
| 122 |
-
)
|
| 123 |
-
|
| 124 |
-
# Chat input directly under main chat
|
| 125 |
-
gr.Markdown("### 💬 Chat with agent")
|
| 126 |
-
with gr.Row():
|
| 127 |
-
msg_input = gr.Textbox(
|
| 128 |
-
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
| 129 |
-
container=False,
|
| 130 |
-
scale=4,
|
| 131 |
-
)
|
| 132 |
-
send_btn = gr.Button("Send", scale=1, elem_classes="action-button")
|
| 133 |
-
|
| 134 |
-
# Controller interface
|
| 135 |
-
with gr.Column(scale=2):
|
| 136 |
-
# Configuration Panel
|
| 137 |
-
with gr.Column(elem_classes=["control-panel"]):
|
| 138 |
-
gr.Markdown("### ⚙️ Configuration")
|
| 139 |
-
|
| 140 |
-
with gr.Accordion("🔧 API & GitHub Settings", open=True):
|
| 141 |
-
api_provider_radio = gr.Radio(
|
| 142 |
-
["Anthropic", "AWS Bedrock"],
|
| 143 |
-
label="Select API Provider",
|
| 144 |
-
value="Anthropic", # Default selection
|
| 145 |
-
interactive=True,
|
| 146 |
-
)
|
| 147 |
-
config_anthropic_key = gr.Textbox(
|
| 148 |
-
label="🔑 Anthropic API Key",
|
| 149 |
-
type="password",
|
| 150 |
-
placeholder="sk-ant-...",
|
| 151 |
-
visible=True, # Initially visible as Anthropic is default
|
| 152 |
-
)
|
| 153 |
-
config_aws_bearer_token_bedrock = gr.Textbox(
|
| 154 |
-
label="🔑 AWS Bearer Token for Bedrock",
|
| 155 |
-
type="password",
|
| 156 |
-
placeholder="AWS_BEARER_TOKEN_BEDROCK",
|
| 157 |
-
visible=False, # Initially hidden
|
| 158 |
-
)
|
| 159 |
-
config_github_token = gr.Textbox(
|
| 160 |
-
label="🔑 GitHub Token (Required for PR, Optional for file search)",
|
| 161 |
-
type="password",
|
| 162 |
-
placeholder="ghp_...",
|
| 163 |
-
)
|
| 164 |
-
|
| 165 |
-
with gr.Row():
|
| 166 |
-
config_github_owner = gr.Textbox(
|
| 167 |
-
label="👤 GitHub Owner",
|
| 168 |
-
placeholder="your-username",
|
| 169 |
-
scale=1,
|
| 170 |
-
)
|
| 171 |
-
config_github_repo = gr.Textbox(
|
| 172 |
-
label="📁 Repository Name",
|
| 173 |
-
placeholder="your-repository",
|
| 174 |
-
scale=1,
|
| 175 |
-
)
|
| 176 |
-
|
| 177 |
-
save_config_btn = gr.Button(
|
| 178 |
-
"💾 Save Configuration", elem_classes="action-button"
|
| 179 |
-
)
|
| 180 |
-
|
| 181 |
-
# Quick Controller
|
| 182 |
-
with gr.Column(elem_classes=["control-panel"]):
|
| 183 |
-
gr.Markdown("### 🛠️ Quick Controls")
|
| 184 |
-
status_display = gr.HTML(update_status())
|
| 185 |
-
|
| 186 |
-
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
| 187 |
-
with gr.TabItem("1. Find Files", id=0):
|
| 188 |
-
with gr.Group():
|
| 189 |
-
project_dropdown = gr.Radio(
|
| 190 |
-
choices=get_available_projects(),
|
| 191 |
-
label="🎯 Select Project",
|
| 192 |
-
value="transformers",
|
| 193 |
-
)
|
| 194 |
-
lang_dropdown = gr.Radio(
|
| 195 |
-
choices=[language.value for language in Languages],
|
| 196 |
-
label="🌍 Translate To",
|
| 197 |
-
value="ko",
|
| 198 |
-
)
|
| 199 |
-
k_input = gr.Number(
|
| 200 |
-
label="📊 First k missing translated docs",
|
| 201 |
-
value=10,
|
| 202 |
-
minimum=1,
|
| 203 |
-
)
|
| 204 |
-
find_btn = gr.Button(
|
| 205 |
-
"🔍 Find Files to Translate",
|
| 206 |
-
elem_classes="action-button",
|
| 207 |
-
)
|
| 208 |
-
|
| 209 |
-
confirm_go_btn = gr.Button(
|
| 210 |
-
"✅ Confirm Selection & Go to Translate",
|
| 211 |
-
elem_classes="action-button",
|
| 212 |
-
)
|
| 213 |
-
|
| 214 |
-
with gr.TabItem("2. Translate", id=1):
|
| 215 |
-
with gr.Group():
|
| 216 |
-
files_to_translate = gr.Radio(
|
| 217 |
-
choices=[],
|
| 218 |
-
label="📄 Select a file to translate",
|
| 219 |
-
interactive=True,
|
| 220 |
-
value=None,
|
| 221 |
-
)
|
| 222 |
-
file_to_translate_input = gr.Textbox(
|
| 223 |
-
label="🌍 Select in the dropdown or write the file path to translate",
|
| 224 |
-
value="",
|
| 225 |
-
)
|
| 226 |
-
|
| 227 |
-
translate_lang_display = gr.Dropdown(
|
| 228 |
-
choices=[language.value for language in Languages],
|
| 229 |
-
label="🌍 Translation Language",
|
| 230 |
-
value="ko",
|
| 231 |
-
interactive=False,
|
| 232 |
-
)
|
| 233 |
-
additional_instruction = gr.Textbox(
|
| 234 |
-
label="📝 Additional instructions (Optional - e.g., custom glossary)",
|
| 235 |
-
placeholder="Example: Translate 'model' as '모델' consistently",
|
| 236 |
-
lines=2,
|
| 237 |
-
)
|
| 238 |
-
|
| 239 |
-
force_retranslate = gr.Checkbox(
|
| 240 |
-
label="🔄 Force Retranslate (ignore existing translations)",
|
| 241 |
-
value=False,
|
| 242 |
-
)
|
| 243 |
-
|
| 244 |
-
with gr.Accordion("🔍 Preview Translation Prompt", open=False):
|
| 245 |
-
prompt_preview = gr.Textbox(
|
| 246 |
-
lines=8,
|
| 247 |
-
interactive=False,
|
| 248 |
-
placeholder="Select a file and language to see the prompt preview...",
|
| 249 |
-
show_copy_button=True,
|
| 250 |
-
)
|
| 251 |
-
|
| 252 |
-
start_translate_btn = gr.Button(
|
| 253 |
-
"🚀 Start Translation", elem_classes="action-button"
|
| 254 |
-
)
|
| 255 |
-
|
| 256 |
-
confirm_upload_btn = gr.Button(
|
| 257 |
-
"✅ Confirm Translation & Upload PR",
|
| 258 |
-
elem_classes="action-button",
|
| 259 |
-
visible=False,
|
| 260 |
-
)
|
| 261 |
-
|
| 262 |
-
with gr.TabItem("3. Upload PR", id=2):
|
| 263 |
-
with gr.Group():
|
| 264 |
-
reference_pr_url = gr.Textbox(
|
| 265 |
-
label="🔗 Reference PR URL (Optional)",
|
| 266 |
-
placeholder="Auto-filled based on project selection",
|
| 267 |
-
)
|
| 268 |
-
approve_btn = gr.Button(
|
| 269 |
-
"✅ Generate GitHub PR", elem_classes="action-button"
|
| 270 |
-
)
|
| 271 |
-
restart_btn = gr.Button(
|
| 272 |
-
"🔄 Restart Translation", elem_classes="action-button"
|
| 273 |
-
)
|
| 274 |
-
|
| 275 |
-
# Event Handlers
|
| 276 |
-
|
| 277 |
-
find_btn.click(
|
| 278 |
-
fn=process_file_search_handler,
|
| 279 |
-
inputs=[project_dropdown, lang_dropdown, k_input, chatbot],
|
| 280 |
-
outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
|
| 281 |
-
)
|
| 282 |
-
|
| 283 |
-
confirm_go_btn.click(
|
| 284 |
-
fn=confirm_and_go_translate_handler,
|
| 285 |
-
inputs=[chatbot],
|
| 286 |
-
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 287 |
-
)
|
| 288 |
-
|
| 289 |
-
# Auto-save selections to state and update prompt preview
|
| 290 |
-
project_dropdown.change(
|
| 291 |
-
fn=update_project_selection,
|
| 292 |
-
inputs=[project_dropdown, chatbot],
|
| 293 |
-
outputs=[chatbot, msg_input, status_display],
|
| 294 |
-
)
|
| 295 |
-
|
| 296 |
-
# Update prompt preview when project changes
|
| 297 |
-
project_dropdown.change(
|
| 298 |
-
fn=update_prompt_preview,
|
| 299 |
-
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
| 300 |
-
outputs=[prompt_preview],
|
| 301 |
-
)
|
| 302 |
-
|
| 303 |
-
lang_dropdown.change(
|
| 304 |
-
fn=update_language_selection,
|
| 305 |
-
inputs=[lang_dropdown, chatbot],
|
| 306 |
-
outputs=[chatbot, msg_input, status_display, translate_lang_display],
|
| 307 |
-
)
|
| 308 |
-
|
| 309 |
-
# Mirror the selected radio choice into the editable file-path textbox
|
| 310 |
-
files_to_translate.change(
|
| 311 |
-
fn=lambda x: x,
|
| 312 |
-
inputs=[files_to_translate],
|
| 313 |
-
outputs=[file_to_translate_input],
|
| 314 |
-
)
|
| 315 |
-
|
| 316 |
-
# Button event handlers
|
| 317 |
-
start_translate_btn.click(
|
| 318 |
-
fn=start_translate_handler,
|
| 319 |
-
inputs=[chatbot, file_to_translate_input, additional_instruction, force_retranslate],
|
| 320 |
-
outputs=[chatbot, msg_input, status_display, control_tabs, start_translate_btn, confirm_upload_btn],
|
| 321 |
-
)
|
| 322 |
-
|
| 323 |
-
confirm_upload_btn.click(
|
| 324 |
-
fn=confirm_translation_and_go_upload_handler,
|
| 325 |
-
inputs=[chatbot],
|
| 326 |
-
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 327 |
-
)
|
| 328 |
-
|
| 329 |
-
# Configuration Save
|
| 330 |
-
save_config_btn.click(
|
| 331 |
-
fn=update_persistent_config,
|
| 332 |
-
inputs=[api_provider_radio, config_anthropic_key, config_aws_bearer_token_bedrock, config_github_token, config_github_owner, config_github_repo, reference_pr_url, chatbot],
|
| 333 |
-
outputs=[chatbot, msg_input, status_display],
|
| 334 |
-
)
|
| 335 |
-
|
| 336 |
-
# API Provider selection handler
|
| 337 |
-
api_provider_radio.change(
|
| 338 |
-
fn=lambda provider: (
|
| 339 |
-
gr.update(visible=True) if provider == "Anthropic" else gr.update(visible=False),
|
| 340 |
-
gr.update(visible=True) if provider == "AWS Bedrock" else gr.update(visible=False),
|
| 341 |
-
),
|
| 342 |
-
inputs=[api_provider_radio],
|
| 343 |
-
outputs=[config_anthropic_key, config_aws_bearer_token_bedrock],
|
| 344 |
-
)
|
| 345 |
-
|
| 346 |
-
approve_btn.click(
|
| 347 |
-
fn=approve_handler,
|
| 348 |
-
inputs=[chatbot, config_github_owner, config_github_repo, reference_pr_url],
|
| 349 |
-
outputs=[chatbot, msg_input, status_display],
|
| 350 |
-
)
|
| 351 |
-
|
| 352 |
-
restart_btn.click(
|
| 353 |
-
fn=restart_handler,
|
| 354 |
-
inputs=[chatbot],
|
| 355 |
-
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 356 |
-
)
|
| 357 |
-
|
| 358 |
-
send_btn.click(
|
| 359 |
-
fn=send_message,
|
| 360 |
-
inputs=[msg_input, chatbot],
|
| 361 |
-
outputs=[chatbot, msg_input, status_display],
|
| 362 |
-
)
|
| 363 |
-
|
| 364 |
-
msg_input.submit(
|
| 365 |
-
fn=send_message,
|
| 366 |
-
inputs=[msg_input, chatbot],
|
| 367 |
-
outputs=[chatbot, msg_input, status_display],
|
| 368 |
-
)
|
| 369 |
-
|
| 370 |
-
# Update prompt preview when inputs change
|
| 371 |
-
for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
|
| 372 |
-
input_component.change(
|
| 373 |
-
fn=update_prompt_preview,
|
| 374 |
-
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
| 375 |
-
outputs=[prompt_preview],
|
| 376 |
-
)
|
| 377 |
-
|
| 378 |
-
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
| 379 |
-
demo.launch(root_path=root_path)
|
|
|
|
| 1 |
+
"""Module for gradio chat-based translation agent interface."""
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
import gradio as gr
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
|
| 9 |
+
from agent.handler import (
|
| 10 |
+
approve_handler,
|
| 11 |
+
confirm_and_go_translate_handler,
|
| 12 |
+
confirm_translation_and_go_upload_handler,
|
| 13 |
+
get_welcome_message,
|
| 14 |
+
process_file_search_handler,
|
| 15 |
+
restart_handler,
|
| 16 |
+
send_message,
|
| 17 |
+
start_translate_handler,
|
| 18 |
+
sync_language_displays,
|
| 19 |
+
update_language_selection,
|
| 20 |
+
update_project_selection,
|
| 21 |
+
update_prompt_preview,
|
| 22 |
+
update_status,
|
| 23 |
+
update_github_config,
|
| 24 |
+
update_persistent_config,
|
| 25 |
+
)
|
| 26 |
+
from translator.model import Languages
|
| 27 |
+
from translator.project_config import get_available_projects
|
| 28 |
+
|
| 29 |
+
load_dotenv()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
css = """
|
| 33 |
+
.gradio-container {
|
| 34 |
+
background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
|
| 35 |
+
}
|
| 36 |
+
.chat-container {
|
| 37 |
+
background: rgba(255, 255, 180, 0.25);
|
| 38 |
+
border-radius: 18px;
|
| 39 |
+
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 40 |
+
padding: 1.0em;
|
| 41 |
+
backdrop-filter: blur(8px);
|
| 42 |
+
border: 1px solid rgba(255,255,180,0.25);
|
| 43 |
+
width: 100%;
|
| 44 |
+
height: 100%;
|
| 45 |
+
}
|
| 46 |
+
.control-panel {
|
| 47 |
+
background: rgba(255, 255, 180, 0.25);
|
| 48 |
+
border-radius: 18px;
|
| 49 |
+
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
| 50 |
+
padding: 1.0em;
|
| 51 |
+
backdrop-filter: blur(8px);
|
| 52 |
+
border: 1px solid rgba(255,255,180,0.25);
|
| 53 |
+
width: 100%;
|
| 54 |
+
overflow: visible !important;
|
| 55 |
+
|
| 56 |
+
}
|
| 57 |
+
.status-card {
|
| 58 |
+
width: 100%
|
| 59 |
+
}
|
| 60 |
+
.action-button {
|
| 61 |
+
background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
|
| 62 |
+
color: white !important;
|
| 63 |
+
border: none !important;
|
| 64 |
+
font-weight: 600 !important;
|
| 65 |
+
box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
|
| 66 |
+
transition: all 0.3s ease-in-out !important;
|
| 67 |
+
}
|
| 68 |
+
.action-button:hover {
|
| 69 |
+
background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
|
| 70 |
+
box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
|
| 71 |
+
transform: translateY(-2px) !important;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
.simple-tabs .tab-nav button {
|
| 75 |
+
background: transparent !important;
|
| 76 |
+
color: #4A5568 !important;
|
| 77 |
+
box-shadow: none !important;
|
| 78 |
+
transform: none !important;
|
| 79 |
+
border: none !important;
|
| 80 |
+
border-bottom: 2px solid #E2E8F0 !important;
|
| 81 |
+
border-radius: 0 !important;
|
| 82 |
+
font-weight: 600 !important;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.simple-tabs .tab-nav button.selected {
|
| 86 |
+
color: #f97316 !important;
|
| 87 |
+
border-bottom: 2px solid #f97316 !important;
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
.simple-tabs .tab-nav button:hover {
|
| 91 |
+
background: #f3f4f6 !important;
|
| 92 |
+
color: #f97316 !important;
|
| 93 |
+
box-shadow: none !important;
|
| 94 |
+
transform: none !important;
|
| 95 |
+
}
|
| 96 |
+
"""
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
# Create the main interface
|
| 100 |
+
with gr.Blocks(
|
| 101 |
+
css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
|
| 102 |
+
) as demo:
|
| 103 |
+
# Title
|
| 104 |
+
with open("images/hfkr_logo.png", "rb") as img_file:
|
| 105 |
+
base64_img = base64.b64encode(img_file.read()).decode()
|
| 106 |
+
gr.Markdown(
|
| 107 |
+
f'<img src="data:image/png;base64,{base64_img}" style="display: block; margin-left: auto; margin-right: auto; height: 15em;"/>'
|
| 108 |
+
)
|
| 109 |
+
gr.Markdown(
|
| 110 |
+
'<h1 style="text-align: center;"> 🌐 Hugging Face Transformers Docs i18n made easy</h1>'
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# Content
|
| 114 |
+
with gr.Row():
|
| 115 |
+
# Chat interface
|
| 116 |
+
with gr.Column(scale=3, elem_classes=["chat-container"]):
|
| 117 |
+
gr.Markdown("### 🌐 Hugging Face i18n Agent")
|
| 118 |
+
|
| 119 |
+
chatbot = gr.Chatbot(
|
| 120 |
+
value=[[None, get_welcome_message()]], scale=1, height=525,
|
| 121 |
+
show_copy_button=True
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
# Chat input directly under main chat
|
| 125 |
+
gr.Markdown("### 💬 Chat with agent")
|
| 126 |
+
with gr.Row():
|
| 127 |
+
msg_input = gr.Textbox(
|
| 128 |
+
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
| 129 |
+
container=False,
|
| 130 |
+
scale=4,
|
| 131 |
+
)
|
| 132 |
+
send_btn = gr.Button("Send", scale=1, elem_classes="action-button")
|
| 133 |
+
|
| 134 |
+
# Controller interface
|
| 135 |
+
with gr.Column(scale=2):
|
| 136 |
+
# Configuration Panel
|
| 137 |
+
with gr.Column(elem_classes=["control-panel"]):
|
| 138 |
+
gr.Markdown("### ⚙️ Configuration")
|
| 139 |
+
|
| 140 |
+
with gr.Accordion("🔧 API & GitHub Settings", open=True):
|
| 141 |
+
api_provider_radio = gr.Radio(
|
| 142 |
+
["Anthropic", "AWS Bedrock"],
|
| 143 |
+
label="Select API Provider",
|
| 144 |
+
value="Anthropic", # Default selection
|
| 145 |
+
interactive=True,
|
| 146 |
+
)
|
| 147 |
+
config_anthropic_key = gr.Textbox(
|
| 148 |
+
label="🔑 Anthropic API Key",
|
| 149 |
+
type="password",
|
| 150 |
+
placeholder="sk-ant-...",
|
| 151 |
+
visible=True, # Initially visible as Anthropic is default
|
| 152 |
+
)
|
| 153 |
+
config_aws_bearer_token_bedrock = gr.Textbox(
|
| 154 |
+
label="🔑 AWS Bearer Token for Bedrock",
|
| 155 |
+
type="password",
|
| 156 |
+
placeholder="AWS_BEARER_TOKEN_BEDROCK",
|
| 157 |
+
visible=False, # Initially hidden
|
| 158 |
+
)
|
| 159 |
+
config_github_token = gr.Textbox(
|
| 160 |
+
label="🔑 GitHub Token (Required for PR, Optional for file search)",
|
| 161 |
+
type="password",
|
| 162 |
+
placeholder="ghp_...",
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
with gr.Row():
|
| 166 |
+
config_github_owner = gr.Textbox(
|
| 167 |
+
label="👤 GitHub Owner",
|
| 168 |
+
placeholder="your-username",
|
| 169 |
+
scale=1,
|
| 170 |
+
)
|
| 171 |
+
config_github_repo = gr.Textbox(
|
| 172 |
+
label="📁 Repository Name",
|
| 173 |
+
placeholder="your-repository",
|
| 174 |
+
scale=1,
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
save_config_btn = gr.Button(
|
| 178 |
+
"💾 Save Configuration", elem_classes="action-button"
|
| 179 |
+
)
|
| 180 |
+
|
| 181 |
+
# Quick Controller
|
| 182 |
+
with gr.Column(elem_classes=["control-panel"]):
|
| 183 |
+
gr.Markdown("### 🛠️ Quick Controls")
|
| 184 |
+
status_display = gr.HTML(update_status())
|
| 185 |
+
|
| 186 |
+
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
| 187 |
+
with gr.TabItem("1. Find Files", id=0):
|
| 188 |
+
with gr.Group():
|
| 189 |
+
project_dropdown = gr.Radio(
|
| 190 |
+
choices=get_available_projects(),
|
| 191 |
+
label="🎯 Select Project",
|
| 192 |
+
value="transformers",
|
| 193 |
+
)
|
| 194 |
+
lang_dropdown = gr.Radio(
|
| 195 |
+
choices=[language.value for language in Languages],
|
| 196 |
+
label="🌍 Translate To",
|
| 197 |
+
value="ko",
|
| 198 |
+
)
|
| 199 |
+
k_input = gr.Number(
|
| 200 |
+
label="📊 First k missing translated docs",
|
| 201 |
+
value=10,
|
| 202 |
+
minimum=1,
|
| 203 |
+
)
|
| 204 |
+
find_btn = gr.Button(
|
| 205 |
+
"🔍 Find Files to Translate",
|
| 206 |
+
elem_classes="action-button",
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
confirm_go_btn = gr.Button(
|
| 210 |
+
"✅ Confirm Selection & Go to Translate",
|
| 211 |
+
elem_classes="action-button",
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
+
with gr.TabItem("2. Translate", id=1):
|
| 215 |
+
with gr.Group():
|
| 216 |
+
files_to_translate = gr.Radio(
|
| 217 |
+
choices=[],
|
| 218 |
+
label="📄 Select a file to translate",
|
| 219 |
+
interactive=True,
|
| 220 |
+
value=None,
|
| 221 |
+
)
|
| 222 |
+
file_to_translate_input = gr.Textbox(
|
| 223 |
+
label="🌍 Select in the dropdown or write the file path to translate",
|
| 224 |
+
value="",
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
translate_lang_display = gr.Dropdown(
|
| 228 |
+
choices=[language.value for language in Languages],
|
| 229 |
+
label="🌍 Translation Language",
|
| 230 |
+
value="ko",
|
| 231 |
+
interactive=False,
|
| 232 |
+
)
|
| 233 |
+
additional_instruction = gr.Textbox(
|
| 234 |
+
label="📝 Additional instructions (Optional - e.g., custom glossary)",
|
| 235 |
+
placeholder="Example: Translate 'model' as '모델' consistently",
|
| 236 |
+
lines=2,
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
force_retranslate = gr.Checkbox(
|
| 240 |
+
label="🔄 Force Retranslate (ignore existing translations)",
|
| 241 |
+
value=False,
|
| 242 |
+
)
|
| 243 |
+
|
| 244 |
+
with gr.Accordion("🔍 Preview Translation Prompt", open=False):
|
| 245 |
+
prompt_preview = gr.Textbox(
|
| 246 |
+
lines=8,
|
| 247 |
+
interactive=False,
|
| 248 |
+
placeholder="Select a file and language to see the prompt preview...",
|
| 249 |
+
show_copy_button=True,
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
start_translate_btn = gr.Button(
|
| 253 |
+
"🚀 Start Translation", elem_classes="action-button"
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
confirm_upload_btn = gr.Button(
|
| 257 |
+
"✅ Confirm Translation & Upload PR",
|
| 258 |
+
elem_classes="action-button",
|
| 259 |
+
visible=False,
|
| 260 |
+
)
|
| 261 |
+
|
| 262 |
+
with gr.TabItem("3. Upload PR", id=2):
|
| 263 |
+
with gr.Group():
|
| 264 |
+
reference_pr_url = gr.Textbox(
|
| 265 |
+
label="🔗 Reference PR URL (Optional)",
|
| 266 |
+
placeholder="Auto-filled based on project selection",
|
| 267 |
+
)
|
| 268 |
+
approve_btn = gr.Button(
|
| 269 |
+
"✅ Generate GitHub PR", elem_classes="action-button"
|
| 270 |
+
)
|
| 271 |
+
restart_btn = gr.Button(
|
| 272 |
+
"🔄 Restart Translation", elem_classes="action-button"
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
# Event Handlers
|
| 276 |
+
|
| 277 |
+
find_btn.click(
|
| 278 |
+
fn=process_file_search_handler,
|
| 279 |
+
inputs=[project_dropdown, lang_dropdown, k_input, chatbot],
|
| 280 |
+
outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
confirm_go_btn.click(
|
| 284 |
+
fn=confirm_and_go_translate_handler,
|
| 285 |
+
inputs=[chatbot],
|
| 286 |
+
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 287 |
+
)
|
| 288 |
+
|
| 289 |
+
# Auto-save selections to state and update prompt preview
|
| 290 |
+
project_dropdown.change(
|
| 291 |
+
fn=update_project_selection,
|
| 292 |
+
inputs=[project_dropdown, chatbot],
|
| 293 |
+
outputs=[chatbot, msg_input, status_display],
|
| 294 |
+
)
|
| 295 |
+
|
| 296 |
+
# Update prompt preview when project changes
|
| 297 |
+
project_dropdown.change(
|
| 298 |
+
fn=update_prompt_preview,
|
| 299 |
+
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
| 300 |
+
outputs=[prompt_preview],
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
+
lang_dropdown.change(
|
| 304 |
+
fn=update_language_selection,
|
| 305 |
+
inputs=[lang_dropdown, chatbot],
|
| 306 |
+
outputs=[chatbot, msg_input, status_display, translate_lang_display],
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
# Mirror the selected radio choice into the editable file-path textbox
|
| 310 |
+
files_to_translate.change(
|
| 311 |
+
fn=lambda x: x,
|
| 312 |
+
inputs=[files_to_translate],
|
| 313 |
+
outputs=[file_to_translate_input],
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
+
# Button event handlers
|
| 317 |
+
start_translate_btn.click(
|
| 318 |
+
fn=start_translate_handler,
|
| 319 |
+
inputs=[chatbot, file_to_translate_input, additional_instruction, force_retranslate],
|
| 320 |
+
outputs=[chatbot, msg_input, status_display, control_tabs, start_translate_btn, confirm_upload_btn],
|
| 321 |
+
)
|
| 322 |
+
|
| 323 |
+
confirm_upload_btn.click(
|
| 324 |
+
fn=confirm_translation_and_go_upload_handler,
|
| 325 |
+
inputs=[chatbot],
|
| 326 |
+
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 327 |
+
)
|
| 328 |
+
|
| 329 |
+
# Configuration Save
|
| 330 |
+
save_config_btn.click(
|
| 331 |
+
fn=update_persistent_config,
|
| 332 |
+
inputs=[api_provider_radio, config_anthropic_key, config_aws_bearer_token_bedrock, config_github_token, config_github_owner, config_github_repo, reference_pr_url, chatbot],
|
| 333 |
+
outputs=[chatbot, msg_input, status_display],
|
| 334 |
+
)
|
| 335 |
+
|
| 336 |
+
# API Provider selection handler
|
| 337 |
+
api_provider_radio.change(
|
| 338 |
+
fn=lambda provider: (
|
| 339 |
+
gr.update(visible=True) if provider == "Anthropic" else gr.update(visible=False),
|
| 340 |
+
gr.update(visible=True) if provider == "AWS Bedrock" else gr.update(visible=False),
|
| 341 |
+
),
|
| 342 |
+
inputs=[api_provider_radio],
|
| 343 |
+
outputs=[config_anthropic_key, config_aws_bearer_token_bedrock],
|
| 344 |
+
)
|
| 345 |
+
|
| 346 |
+
approve_btn.click(
|
| 347 |
+
fn=approve_handler,
|
| 348 |
+
inputs=[chatbot, config_github_owner, config_github_repo, reference_pr_url],
|
| 349 |
+
outputs=[chatbot, msg_input, status_display],
|
| 350 |
+
)
|
| 351 |
+
|
| 352 |
+
restart_btn.click(
|
| 353 |
+
fn=restart_handler,
|
| 354 |
+
inputs=[chatbot],
|
| 355 |
+
outputs=[chatbot, msg_input, status_display, control_tabs],
|
| 356 |
+
)
|
| 357 |
+
|
| 358 |
+
send_btn.click(
|
| 359 |
+
fn=send_message,
|
| 360 |
+
inputs=[msg_input, chatbot],
|
| 361 |
+
outputs=[chatbot, msg_input, status_display],
|
| 362 |
+
)
|
| 363 |
+
|
| 364 |
+
msg_input.submit(
|
| 365 |
+
fn=send_message,
|
| 366 |
+
inputs=[msg_input, chatbot],
|
| 367 |
+
outputs=[chatbot, msg_input, status_display],
|
| 368 |
+
)
|
| 369 |
+
|
| 370 |
+
# Update prompt preview when inputs change
|
| 371 |
+
for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
|
| 372 |
+
input_component.change(
|
| 373 |
+
fn=update_prompt_preview,
|
| 374 |
+
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
| 375 |
+
outputs=[prompt_preview],
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
| 379 |
+
demo.launch(root_path=root_path)
|
config.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
-
# config.py
|
| 2 |
-
|
| 3 |
-
# Default model list
|
| 4 |
-
default_models = [
|
| 5 |
-
"Helsinki-NLP/opus-mt-ko-en",
|
| 6 |
-
"Helsinki-NLP/opus-mt-tc-big-en-ko",
|
| 7 |
-
"davidkim205/iris-7b",
|
| 8 |
-
"maywell/Synatra-7B-v0.3-Translation",
|
| 9 |
-
"CUSTOM_MODEL_INPUT" # Placeholder for custom model input
|
| 10 |
]
|
|
|
|
| 1 |
+
# config.py
|
| 2 |
+
|
| 3 |
+
# Default model list
|
| 4 |
+
default_models = [
|
| 5 |
+
"Helsinki-NLP/opus-mt-ko-en",
|
| 6 |
+
"Helsinki-NLP/opus-mt-tc-big-en-ko",
|
| 7 |
+
"davidkim205/iris-7b",
|
| 8 |
+
"maywell/Synatra-7B-v0.3-Translation",
|
| 9 |
+
"CUSTOM_MODEL_INPUT" # Placeholder for custom model input
|
| 10 |
]
|
example.env
CHANGED
|
@@ -1,18 +1,18 @@
|
|
| 1 |
-
ANTHROPIC_API_KEY=<your api key>
|
| 2 |
-
|
| 3 |
-
# GitHub PR Agent Configuration
|
| 4 |
-
GITHUB_TOKEN=<your github token>
|
| 5 |
-
GITHUB_OWNER=<your github username>
|
| 6 |
-
GITHUB_REPO=<your repository name>
|
| 7 |
-
REFERENCE_PR_URL=<reference pr url for style analysis>
|
| 8 |
-
|
| 9 |
-
# Secrets for deployment to HF space
|
| 10 |
-
HF_TOKEN=
|
| 11 |
-
HF_USERNAME=
|
| 12 |
-
HF_SPACE_NAME=
|
| 13 |
-
|
| 14 |
-
# Secrets for logging to Github
|
| 15 |
-
LOG_REPO=
|
| 16 |
-
LOG_GITHUB_TOKEN=
|
| 17 |
-
LOG_BRANCH=
|
| 18 |
-
LOG_FILE_PATH=
|
|
|
|
| 1 |
+
ANTHROPIC_API_KEY=<your api key>
|
| 2 |
+
|
| 3 |
+
# GitHub PR Agent Configuration
|
| 4 |
+
GITHUB_TOKEN=<your github token>
|
| 5 |
+
GITHUB_OWNER=<your github username>
|
| 6 |
+
GITHUB_REPO=<your repository name>
|
| 7 |
+
REFERENCE_PR_URL=<reference pr url for style analysis>
|
| 8 |
+
|
| 9 |
+
# Secrets for deployment to HF space
|
| 10 |
+
HF_TOKEN=
|
| 11 |
+
HF_USERNAME=
|
| 12 |
+
HF_SPACE_NAME=
|
| 13 |
+
|
| 14 |
+
# Secrets for logging to Github
|
| 15 |
+
LOG_REPO=
|
| 16 |
+
LOG_GITHUB_TOKEN=
|
| 17 |
+
LOG_BRANCH=
|
| 18 |
+
LOG_FILE_PATH=
|
logger/github_logger.py
CHANGED
|
@@ -1,71 +1,71 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import base64
|
| 3 |
-
from typing import Optional
|
| 4 |
-
|
| 5 |
-
try:
|
| 6 |
-
from github import Github, GithubException
|
| 7 |
-
LIBS_OK = True
|
| 8 |
-
except ImportError:
|
| 9 |
-
LIBS_OK = False
|
| 10 |
-
|
| 11 |
-
class GitHubLogger:
    """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.

    Configuration is read from environment variables at construction time:

    - LOG_GITHUB_TOKEN (fallback: GITHUB_TOKEN)
    - LOG_REPO (format: owner/repo)
    - LOG_BRANCH (default: 'log_event')
    - LOG_FILE_PATH (default: 'pr_success.log')
    """

    def __init__(self):
        """Validate the environment and create the GitHub client.

        Raises:
            ImportError: If PyGithub could not be imported.
            ValueError: If no token is configured, or LOG_REPO is missing
                or does not have non-empty ``owner`` and ``repo`` parts.
        """
        if not LIBS_OK:
            raise ImportError("PyGithub not installed. Please install PyGithub.")
        token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
        if not token:
            raise ValueError("Missing LOG_GITHUB_TOKEN or GITHUB_TOKEN for logging")
        self._client = Github(token)

        # Split on the first "/" only; reject values such as "owner/" or
        # "/repo" here instead of failing later with an unrelated API error.
        repo_spec = (os.environ.get("LOG_REPO") or "").strip()
        owner, _, repo_name = repo_spec.partition("/")
        if not owner or not repo_name:
            raise ValueError("Missing or invalid LOG_REPO. Expected 'owner/repo'.")
        self.owner, self.repo_name = owner, repo_name

        self.branch = os.environ.get("LOG_BRANCH", "log_event")
        self.path = os.environ.get("LOG_FILE_PATH", "pr_success.log")

    def _ensure_branch(self, repo):
        """Create the log branch from the default branch if it is missing.

        Args:
            repo: Repository object returned by the GitHub client.

        Raises:
            GithubException: For any lookup failure other than a 404.
        """
        try:
            repo.get_branch(self.branch)
        except GithubException as e:
            if e.status != 404:
                raise
            base = repo.get_branch(repo.default_branch)
            repo.create_git_ref(ref=f"refs/heads/{self.branch}", sha=base.commit.sha)

    def append_jsonl(self, jsonl_line: str, commit_message: str = "chore(log): append entry") -> str:
        """Append one JSONL record to the log file, creating it if needed.

        Args:
            jsonl_line: Serialized record; a trailing newline is added when
                the caller did not supply one.
            commit_message: Commit message used for the create/update commit.

        Returns:
            A status string: "SUCCESS: Log appended" when the file already
            existed, otherwise "SUCCESS: Log file created and first entry
            appended".

        Raises:
            GithubException: For any API failure other than the log file
                not existing yet.
        """
        repo = self._client.get_repo(f"{self.owner}/{self.repo_name}")
        self._ensure_branch(repo)

        # Each record must sit on its own line, otherwise two entries fuse
        # into a single invalid JSON line.
        entry = jsonl_line if jsonl_line.endswith("\n") else jsonl_line + "\n"

        # Only the lookup is guarded: a 404 here means "file not created
        # yet".  A 404 raised by the update below must not be mistaken for
        # that and silently turned into a create.
        try:
            existing = repo.get_contents(self.path, ref=self.branch)
        except GithubException as e:
            if e.status != 404:
                raise
            repo.create_file(
                path=self.path,
                message=commit_message,
                content=entry,
                branch=self.branch,
            )
            return "SUCCESS: Log file created and first entry appended"

        # NOTE(review): this relies on the API returning the file body inline
        # as base64 — confirm that holds for the expected size of the log.
        existing_content = base64.b64decode(existing.content).decode("utf-8")
        if existing_content and not existing_content.endswith("\n"):
            # Repair a log whose last record was written without a newline.
            existing_content += "\n"

        repo.update_file(
            path=self.path,
            message=commit_message,
            content=existing_content + entry,
            sha=existing.sha,
            branch=self.branch,
        )
        return "SUCCESS: Log appended"
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import base64
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
+
try:
|
| 6 |
+
from github import Github, GithubException
|
| 7 |
+
LIBS_OK = True
|
| 8 |
+
except ImportError:
|
| 9 |
+
LIBS_OK = False
|
| 10 |
+
|
| 11 |
+
class GitHubLogger:
    """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.

    Env vars:
      - LOG_GITHUB_TOKEN (fallback: GITHUB_TOKEN)
      - LOG_REPO (format: owner/repo)
      - LOG_BRANCH (default: 'log_event')
      - LOG_FILE_PATH (default: 'pr_success.log')
    """

    def __init__(self):
        """Read the configuration from the environment and build the client.

        Raises:
            ImportError: PyGithub could not be imported.
            ValueError: No token is configured, or LOG_REPO is missing or
                not of the form ``owner/repo``.
        """
        if not LIBS_OK:
            raise ImportError("PyGithub not installed. Please install PyGithub.")
        token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
        if not token:
            raise ValueError("Missing LOG_GITHUB_TOKEN or GITHUB_TOKEN for logging")
        self._client = Github(token)

        # Split on the first "/" only; both halves must be non-empty so that
        # values such as "owner/" or "/repo" are rejected here instead of
        # failing later inside the GitHub API call.
        owner, _, repo_name = (os.environ.get("LOG_REPO") or "").partition("/")
        if not owner or not repo_name:
            raise ValueError("Missing or invalid LOG_REPO. Expected 'owner/repo'.")
        self.owner, self.repo_name = owner, repo_name

        self.branch = os.environ.get("LOG_BRANCH", "log_event")
        self.path = os.environ.get("LOG_FILE_PATH", "pr_success.log")

    def _ensure_branch(self, repo):
        """Create the log branch from the default branch if it is missing.

        Any GitHub error other than 404 on the lookup is re-raised.
        """
        try:
            repo.get_branch(self.branch)
        except GithubException as e:
            if e.status == 404:
                base = repo.get_branch(repo.default_branch)
                repo.create_git_ref(ref=f"refs/heads/{self.branch}", sha=base.commit.sha)
            else:
                raise

    def append_jsonl(self, jsonl_line: str, commit_message: str = "chore(log): append entry") -> str:
        """Append one JSONL record to the log file on the configured branch.

        The log file is created when it does not exist yet. Records are
        always newline-terminated so consecutive appends never share a line.

        Args:
            jsonl_line: Serialized JSON record; a trailing newline is optional.
            commit_message: Commit message used for the create/update commit.

        Returns:
            A ``"SUCCESS: ..."`` status string.

        Raises:
            GithubException: For any GitHub failure other than the log file
                being absent (404 on lookup).
        """
        repo = self._client.get_repo(f"{self.owner}/{self.repo_name}")
        self._ensure_branch(repo)

        record = jsonl_line if jsonl_line.endswith("\n") else jsonl_line + "\n"

        # Only the lookup is guarded: a 404 here means "no log file yet".
        # A 404 raised by the update itself must not be mistaken for that.
        try:
            existing = repo.get_contents(self.path, ref=self.branch)
        except GithubException as e:
            if e.status != 404:
                raise
            repo.create_file(
                path=self.path,
                message=commit_message,
                content=record,
                branch=self.branch,
            )
            return "SUCCESS: Log file created and first entry appended"

        existing_content = base64.b64decode(existing.content).decode("utf-8")
        # Repair a log whose last record was written without a terminator.
        if existing_content and not existing_content.endswith("\n"):
            existing_content += "\n"

        repo.update_file(
            path=self.path,
            message=commit_message,
            content=existing_content + record,
            sha=existing.sha,
            branch=self.branch,
        )
        return "SUCCESS: Log appended"
|
pr_generator/agent.py
CHANGED
|
@@ -1,596 +1,596 @@
|
|
| 1 |
-
"""
|
| 2 |
-
GitHub PR creation agent using Langchain.
|
| 3 |
-
This code integrates with the actual GitHub API using the PyGithub library.
|
| 4 |
-
Please set the GITHUB_TOKEN environment variable and install required libraries before running.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
import os
|
| 8 |
-
import re
|
| 9 |
-
import json
|
| 10 |
-
from typing import Optional, Dict, List, Tuple, Any
|
| 11 |
-
|
| 12 |
-
# Load environment variables from .env file
|
| 13 |
-
from dotenv import load_dotenv
|
| 14 |
-
from translator.content import llm_translate
|
| 15 |
-
|
| 16 |
-
load_dotenv()
|
| 17 |
-
|
| 18 |
-
# Constants definition
|
| 19 |
-
ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
|
| 20 |
-
DEFAULT_TEMPERATURE = 0.0
|
| 21 |
-
|
| 22 |
-
# Library imports and error handling
|
| 23 |
-
try:
|
| 24 |
-
from github import Github, GithubException
|
| 25 |
-
from github.GitRef import GitRef
|
| 26 |
-
from langchain_anthropic import ChatAnthropic
|
| 27 |
-
|
| 28 |
-
REQUIRED_LIBS_AVAILABLE = True
|
| 29 |
-
except ImportError as e:
|
| 30 |
-
print(f"Required libraries are not installed: {e}")
|
| 31 |
-
print("Please run: pip install PyGithub boto3 langchain-anthropic")
|
| 32 |
-
REQUIRED_LIBS_AVAILABLE = False
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
class GitHubPRAgent:
|
| 36 |
-
"""Agent class for GitHub PR creation"""
|
| 37 |
-
|
| 38 |
-
def __init__(self, user_owner: str = None, user_repo: str = None, base_owner: str = None, base_repo: str = None):
|
| 39 |
-
self._github_client = None
|
| 40 |
-
self._llm = None
|
| 41 |
-
self.user_owner = user_owner
|
| 42 |
-
self.user_repo = user_repo
|
| 43 |
-
self.base_owner = base_owner
|
| 44 |
-
self.base_repo = base_repo
|
| 45 |
-
|
| 46 |
-
@property
def github_client(self) -> "Optional[Github]":
    """Return the GitHub API client, creating it on first use.

    The return annotation is quoted on purpose: an unquoted ``Github`` is
    evaluated when the class body runs, which raises NameError whenever the
    guarded PyGithub import failed and makes the availability check below
    unreachable.

    Raises:
        ImportError: The optional libraries could not be imported.
    """
    if not REQUIRED_LIBS_AVAILABLE:
        raise ImportError("Required libraries not found.")

    if self._github_client is None:
        token = os.environ.get("GITHUB_TOKEN")
        if not token:
            print("Warning: GITHUB_TOKEN environment variable not set.")
            # Not cached: the next access re-reads the environment, so a
            # token exported later is still picked up.
            return Github()  # Limited access
        self._github_client = Github(token)

    return self._github_client
|
| 60 |
-
|
| 61 |
-
@property
def llm(self):
    """Lazily build and memoize the Anthropic chat model.

    Raises:
        ImportError: The optional libraries could not be imported.
    """
    if not REQUIRED_LIBS_AVAILABLE:
        raise ImportError("Required libraries not found.")

    model = self._llm
    if model is None:
        model = ChatAnthropic(
            model=ANTHROPIC_MODEL_ID,
            temperature=DEFAULT_TEMPERATURE,
        )
        self._llm = model
    return model
|
| 73 |
-
|
| 74 |
-
def _handle_github_error(self, e: Exception, operation: str) -> str:
    """Turn an exception into a one-line, human-readable failure message.

    Args:
        e: The exception raised while talking to GitHub.
        operation: Short label of what was attempted (e.g. "PR creation").

    Returns:
        ``"<operation> failed: <status> <detail>"`` for GitHub API errors,
        otherwise ``"Unexpected error during <operation>: <error>"``.
    """
    if isinstance(e, GithubException):
        payload = e.data
        # The payload is only a dict when GitHub returned a JSON object; it
        # may also be None or plain text, which has no ``.get``.
        detail = payload.get("message", payload) if isinstance(payload, dict) else payload
        return f"{operation} failed: {e.status} {detail}"
    return f"Unexpected error during {operation}: {str(e)}"
|
| 79 |
-
|
| 80 |
-
def create_pull_request(
|
| 81 |
-
self,
|
| 82 |
-
owner: str,
|
| 83 |
-
repo_name: str,
|
| 84 |
-
title: str,
|
| 85 |
-
head: str,
|
| 86 |
-
base: str,
|
| 87 |
-
body: str = "",
|
| 88 |
-
draft: bool = False,
|
| 89 |
-
maintainer_can_modify: bool = True,
|
| 90 |
-
) -> str:
|
| 91 |
-
"""Create a new Pull Request."""
|
| 92 |
-
try:
|
| 93 |
-
# 1. Check if head and base are the same
|
| 94 |
-
if head == base:
|
| 95 |
-
return f"ERROR: head branch ({head}) and base branch ({base}) are identical."
|
| 96 |
-
|
| 97 |
-
# 2. Check for existing PR
|
| 98 |
-
existing_pr = self.check_existing_pr(owner, repo_name, head, base)
|
| 99 |
-
if existing_pr:
|
| 100 |
-
return f"ERROR: {existing_pr}"
|
| 101 |
-
|
| 102 |
-
# 3. Verify head and base branches exist
|
| 103 |
-
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 104 |
-
try:
|
| 105 |
-
# For fork-to-upstream PR, head format is "fork_owner:branch_name"
|
| 106 |
-
if ":" in head:
|
| 107 |
-
fork_owner, branch_name = head.split(":", 1)
|
| 108 |
-
fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
|
| 109 |
-
head_branch = fork_repo.get_branch(branch_name)
|
| 110 |
-
else:
|
| 111 |
-
head_branch = repo.get_branch(head)
|
| 112 |
-
|
| 113 |
-
base_branch = repo.get_branch(base)
|
| 114 |
-
|
| 115 |
-
# 4. Check if head and base branches point to the same commit
|
| 116 |
-
if head_branch.commit.sha == base_branch.commit.sha:
|
| 117 |
-
return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."
|
| 118 |
-
|
| 119 |
-
except GithubException as e:
|
| 120 |
-
if e.status == 404:
|
| 121 |
-
return f"ERROR: Branch not found. head: {head}, base: {base}"
|
| 122 |
-
|
| 123 |
-
# 5. Create PR
|
| 124 |
-
pr = repo.create_pull(
|
| 125 |
-
title=title,
|
| 126 |
-
body=body,
|
| 127 |
-
head=head,
|
| 128 |
-
base=base,
|
| 129 |
-
draft=draft,
|
| 130 |
-
maintainer_can_modify=maintainer_can_modify,
|
| 131 |
-
)
|
| 132 |
-
return f"PR creation successful: {pr.html_url}"
|
| 133 |
-
except GithubException as e:
|
| 134 |
-
if e.status == 422:
|
| 135 |
-
error_msg = e.data.get("message", "Unknown error")
|
| 136 |
-
errors = e.data.get("errors", [])
|
| 137 |
-
|
| 138 |
-
error_details = []
|
| 139 |
-
for error in errors:
|
| 140 |
-
if "message" in error:
|
| 141 |
-
error_details.append(error["message"])
|
| 142 |
-
|
| 143 |
-
detail_msg = " | ".join(error_details) if error_details else ""
|
| 144 |
-
return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
|
| 145 |
-
return self._handle_github_error(e, "PR creation")
|
| 146 |
-
except Exception as e:
|
| 147 |
-
return self._handle_github_error(e, "PR creation")
|
| 148 |
-
|
| 149 |
-
def create_branch(
    self, owner: str, repo_name: str, branch_name: str, source_sha: str
) -> str:
    """Create a new branch.

    Args:
        owner: Owner of the repository in which the branch is created.
        repo_name: Name of that repository.
        branch_name: Branch to create (without the ``refs/heads/`` prefix).
        source_sha: Commit SHA the new branch points at.

    Returns:
        A status string starting with ``"SUCCESS"``, ``"WARNING"`` (the
        branch already exists) or ``"ERROR"``.
    """
    try:
        repo = self.github_client.get_repo(f"{owner}/{repo_name}")
        ref_name = f"refs/heads/{branch_name}"
        new_ref = repo.create_git_ref(ref=ref_name, sha=source_sha)

        if isinstance(new_ref, GitRef):
            return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {new_ref.ref})"
        return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
    except GithubException as e:
        if e.status == 422 and "Reference already exists" in str(e.data):
            return f"WARNING: Branch '{branch_name}' already exists."
        # Prefix with ERROR: the translation workflow classifies results by
        # their SUCCESS/WARNING/ERROR prefix, and an unprefixed failure
        # would be treated as "unexpected, continue anyway".
        return f"ERROR: {self._handle_github_error(e, 'branch creation')}"
    except Exception as e:
        return f"ERROR: {self._handle_github_error(e, 'branch creation')}"
|
| 167 |
-
|
| 168 |
-
def check_existing_pr(
|
| 169 |
-
self, owner: str, repo_name: str, head: str, base: str
|
| 170 |
-
) -> Optional[str]:
|
| 171 |
-
"""Check if there's an existing PR with the same head and base."""
|
| 172 |
-
try:
|
| 173 |
-
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 174 |
-
# For head parameter, use exactly what was passed (could be "fork_owner:branch" or just "branch")
|
| 175 |
-
search_head = head if ":" in head else f"{owner}:{head}"
|
| 176 |
-
pulls = repo.get_pulls(state="open", head=search_head, base=base)
|
| 177 |
-
for pr in pulls:
|
| 178 |
-
return f"Existing PR found: {pr.html_url}"
|
| 179 |
-
return None
|
| 180 |
-
except Exception as e:
|
| 181 |
-
print(f"⚠️ Error checking existing PR: {str(e)}")
|
| 182 |
-
return None
|
| 183 |
-
|
| 184 |
-
def create_or_update_file(
|
| 185 |
-
self,
|
| 186 |
-
owner: str,
|
| 187 |
-
repo_name: str,
|
| 188 |
-
path: str,
|
| 189 |
-
message: str,
|
| 190 |
-
content: str,
|
| 191 |
-
branch_name: Optional[str] = None,
|
| 192 |
-
sha_blob: Optional[str] = None,
|
| 193 |
-
) -> str:
|
| 194 |
-
"""Create or update a single file."""
|
| 195 |
-
try:
|
| 196 |
-
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 197 |
-
|
| 198 |
-
args = {
|
| 199 |
-
"path": path,
|
| 200 |
-
"message": message,
|
| 201 |
-
"content": content,
|
| 202 |
-
}
|
| 203 |
-
if branch_name:
|
| 204 |
-
args["branch"] = branch_name
|
| 205 |
-
|
| 206 |
-
# Try to update file
|
| 207 |
-
if sha_blob:
|
| 208 |
-
args["sha"] = sha_blob
|
| 209 |
-
repo.update_file(**args)
|
| 210 |
-
return f"SUCCESS: File updated - {path}"
|
| 211 |
-
|
| 212 |
-
# Try to create file
|
| 213 |
-
repo.create_file(**args)
|
| 214 |
-
return f"SUCCESS: File created - {path}"
|
| 215 |
-
|
| 216 |
-
except GithubException as e:
|
| 217 |
-
# Try to update if file already exists
|
| 218 |
-
if e.status == 422:
|
| 219 |
-
try:
|
| 220 |
-
existing_file = repo.get_contents(
|
| 221 |
-
path, ref=branch_name or repo.default_branch
|
| 222 |
-
)
|
| 223 |
-
args["sha"] = existing_file.sha
|
| 224 |
-
repo.update_file(**args)
|
| 225 |
-
return f"SUCCESS: File updated - {path}"
|
| 226 |
-
except:
|
| 227 |
-
pass
|
| 228 |
-
return f"ERROR: File processing failed - {path}"
|
| 229 |
-
except Exception:
|
| 230 |
-
return f"ERROR: File processing failed - {path}"
|
| 231 |
-
|
| 232 |
-
def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
|
| 233 |
-
"""Analyze reference PR to extract style information."""
|
| 234 |
-
try:
|
| 235 |
-
# Parse PR URL
|
| 236 |
-
match = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
|
| 237 |
-
if not match:
|
| 238 |
-
return {"error": f"Invalid PR URL format: {pr_url}"}
|
| 239 |
-
|
| 240 |
-
owner, repo_name, pr_number = match.groups()
|
| 241 |
-
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
| 242 |
-
pr = repo.get_pull(int(pr_number))
|
| 243 |
-
|
| 244 |
-
return {
|
| 245 |
-
"title": pr.title,
|
| 246 |
-
"body": pr.body,
|
| 247 |
-
"head_branch": pr.head.ref,
|
| 248 |
-
"base_branch": pr.base.ref,
|
| 249 |
-
"files_changed": [f.filename for f in pr.get_files()],
|
| 250 |
-
"commits": [
|
| 251 |
-
{"message": c.commit.message, "sha": c.sha}
|
| 252 |
-
for c in pr.get_commits()
|
| 253 |
-
],
|
| 254 |
-
}
|
| 255 |
-
except Exception as e:
|
| 256 |
-
return {"error": f"Error occurred during PR analysis: {str(e)}"}
|
| 257 |
-
|
| 258 |
-
def _generate_with_llm(
    self, prompt: str, fallback_value: str, operation: str
) -> str:
    """Run ``prompt`` through ``llm_translate`` and return the stripped text.

    Args:
        prompt: Prompt passed to ``llm_translate``.
        fallback_value: Returned unchanged when generation raises.
        operation: Label used only in the printed progress/error messages.
    """
    try:
        _usage, raw_text = llm_translate(prompt)
        text = raw_text.strip()
        print(f"LLM generated {operation}: {text}")
    except Exception as e:
        print(f"❌ Error generating {operation} with LLM: {e}")
        print(f"Using fallback value: {fallback_value}")
        return fallback_value
    return text
|
| 271 |
-
|
| 272 |
-
def generate_branch_name_from_reference(
|
| 273 |
-
self, reference_branch_name: str, target_language: str, file_name: str
|
| 274 |
-
) -> str:
|
| 275 |
-
"""Generate branch name using simple template."""
|
| 276 |
-
# Keep .md extension and make branch-safe
|
| 277 |
-
branch_safe_name = file_name.replace('_', '-')
|
| 278 |
-
return f"{target_language}-{branch_safe_name}"
|
| 279 |
-
|
| 280 |
-
def generate_pr_content_from_reference(
    self,
    reference_title: str,
    reference_body: str,
    target_language: str,
    filepath: str,
    target_filepath: str,
    file_name: str,
) -> Tuple[str, str]:
    """Use LLM to analyze reference PR title and body and generate appropriate PR content.

    The model is asked to answer as ``Title: ...`` followed by ``Body: ...``.
    Everything after the first ``Body:`` marker is taken verbatim as the
    body, so body lines that themselves start with ``Title:`` or ``Body:``
    are preserved instead of being re-interpreted as markers.

    Returns:
        ``(title, body)``. Falls back to ``reference_title`` when no title
        line is found, to a one-line default body when no body is found, and
        to ``_generate_default_pr_content`` when generation raises.
    """
    prompt = f"""Here is the reference PR information:

Reference PR title: {reference_title}

Reference PR body:
{reference_body}

Now I need to generate PR title and body for a new translation task:
- Target language: {target_language}
- Original file: {filepath}
- Translation file: {target_filepath}
- File name: {file_name}

Please analyze the style and format of the reference PR to generate consistent new PR title and body.

Requirements:
1. Follow the title format and pattern of the reference PR
2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
3. Appropriately reflect the target language ({target_language}) and file paths
4. If there are user mentions (@username), change them to general text instead of actual mentions
5. Adjust the content to fit the translation task

Response format:
Title: [PR title here]
Body: [PR body here, maintaining the exact markdown format and structure of the original]"""

    try:
        _usage_info, generated_content = llm_translate(prompt)
        generated_content = generated_content.strip()

        # Separate title and body from response
        title_marker, body_marker = "Title:", "Body:"
        title_line = ""
        body_lines = []
        parsing_body = False

        for line in generated_content.split("\n"):
            if parsing_body:
                body_lines.append(line)
            elif line.startswith(title_marker):
                # Strip only the leading marker, not later occurrences.
                title_line = line[len(title_marker):].strip()
            elif line.startswith(body_marker):
                parsing_body = True
                body_content = line[len(body_marker):].strip()
                if body_content:
                    body_lines.append(body_content)

        generated_title = title_line if title_line else reference_title
        generated_body = (
            "\n".join(body_lines)
            if body_lines
            else f"Add {target_language} translation for `{filepath}`."
        )

        print(f"LLM generated PR title: {generated_title}")
        print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")

        return generated_title, generated_body

    except Exception as e:
        print(f"❌ Error generating PR content with LLM: {e}")
        return self._generate_default_pr_content(
            target_language, filepath, target_filepath, file_name
        )
|
| 354 |
-
|
| 355 |
-
def _generate_default_pr_content(
    self, target_language: str, filepath: str, target_filepath: str, file_name: str
) -> Tuple[str, str]:
    """Generate default PR content.

    Fills a fixed title/body template with the target language, the source
    file path and the file name. ``target_filepath`` is accepted but does
    not appear in the template.

    Returns:
        ``(title, body)`` built from the template below.
    """
    title = f"🌐 [i18n-{target_language}] Translated `{file_name}` to {target_language}"
    # NOTE(review): the template text is runtime output; keep its wording and
    # line layout exactly as-is when editing around it.
    body = f"""# What does this PR do?

Translated the `{filepath}` file of the documentation to {target_language} 😄
Thank you in advance for your review!

Part of https://github.com/huggingface/transformers/issues/20179

## Before reviewing
- [x] Check for missing / redundant translations (번역 누락/중복 검사)
- [x] Grammar Check (맞춤법 검사)
- [x] Review or Add new terms to glossary (용어 확인 및 추가)
- [x] Check Inline TOC (e.g. `[[lowercased-header]]`)
- [x] Check live-preview for gotchas (live-preview로 정상작동 확인)

## Who can review? (Initial)
{target_language} translation reviewers

## Before submitting
- [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [x] Did you make sure to update the documentation with your changes? Here are the
[documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
[here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review? (Final)
May you please review this PR?
Documentation maintainers
"""
    return title, body
|
| 393 |
-
|
| 394 |
-
def generate_commit_message_from_reference(
|
| 395 |
-
self, commit_messages: List[str], target_language: str, file_name: str
|
| 396 |
-
) -> str:
|
| 397 |
-
"""Generate simple commit message using template."""
|
| 398 |
-
return f"docs: {target_language}: {file_name}"
|
| 399 |
-
|
| 400 |
-
def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
    """Get information about an existing branch.

    Args:
        owner: Repository owner.
        repo_name: Repository name.
        branch_name: Branch to describe.

    Returns:
        A multi-line summary (short SHA, first line of the commit message
        truncated to 80 characters, author name and date) of the branch's
        latest commit, or a ``"Failed to retrieve branch information: ..."``
        string when any lookup raises.
    """
    try:
        repo = self.github_client.get_repo(f"{owner}/{repo_name}")
        branch = repo.get_branch(branch_name)
        commit = branch.commit
        commit_info = commit.commit

        # chr(10) is "\n": only the first line of the commit message is shown.
        return f"""
📋 Existing branch information:
- Branch name: {branch_name}
- Latest commit: {commit.sha[:8]}
- Commit message: {commit_info.message.split(chr(10))[0][:80]}...
- Author: {commit_info.author.name}
- Date: {commit_info.author.date.strftime('%Y-%m-%d %H:%M:%S')}
"""
    except Exception as e:
        return f"Failed to retrieve branch information: {str(e)}"
|
| 418 |
-
|
| 419 |
-
def run_translation_pr_workflow(
    self,
    reference_pr_url: str,
    target_language: str,
    filepath: str,
    translated_doc: str,
    base_branch: str = "main",
) -> Dict[str, Any]:
    """Execute translation document PR creation workflow.

    Steps: analyze the reference PR, derive the translated file path and
    branch name, create the branch in the fork (``user_owner/user_repo``)
    from the upstream ``base_branch`` head, commit the translated document,
    then open a draft PR from the fork branch to ``base_owner/base_repo``.

    Args:
        reference_pr_url: PR whose title/body style is imitated.
        target_language: Language code substituted for ``en`` in the path.
        filepath: Path of the source document (expected to contain "/en/").
        translated_doc: Translated file content to commit.
        base_branch: Upstream branch to branch from and target with the PR.

    Returns:
        A dict whose ``status`` is ``"success"``, ``"partial_success"``
        (file committed but no PR) or ``"error"``, plus ``message`` and,
        depending on the outcome, ``branch``, ``file_path``, ``pr_url`` and
        ``error_details``.
    """
    try:
        # 1. Analyze reference PR
        print(f"🔍 Analyzing reference PR: {reference_pr_url}")
        pr_analysis = self.analyze_reference_pr(reference_pr_url)

        if "error" in pr_analysis:
            return {"status": "error", "message": pr_analysis["error"]}

        print("Reference PR analysis completed")

        # 2. Generate translation file path and branch name
        # Every "/en/" segment in the path is replaced, not only the first.
        target_filepath = filepath.replace("/en/", f"/{target_language}/")
        file_name = filepath.split("/")[-1]  # Keep .md extension

        print(f"🌿 Generating branch name...")
        branch_name = self.generate_branch_name_from_reference(
            pr_analysis["head_branch"], target_language, file_name
        )

        # 3. Get main branch SHA from upstream and create branch in fork
        upstream_repo = self.github_client.get_repo(f"{self.base_owner}/{self.base_repo}")
        main_branch = upstream_repo.get_branch(base_branch)
        main_sha = main_branch.commit.sha

        print(f"🌿 Creating branch: {branch_name} in fork repository")
        branch_result = self.create_branch(self.user_owner, self.user_repo, branch_name, main_sha)

        # Check branch creation result
        # The result is classified purely by its string prefix.
        if branch_result.startswith("ERROR"):
            return {
                "status": "error",
                "message": f"Branch creation failed: {branch_result}\n\nTarget: {self.user_owner}/{self.user_repo}\nBranch: {branch_name}\nBase SHA: {main_sha[:8]}",
                "branch": branch_name,
                "error_details": branch_result,
            }
        elif branch_result.startswith("WARNING"):
            print(f"⚠️ {branch_result}")
            # Continue if branch already exists
        elif branch_result.startswith("SUCCESS"):
            print(f"✅ {branch_result}")
        else:
            print(f"⚠️ Unexpected branch creation result: {branch_result}")
            # Continue anyway, might still work

        # 4. Generate commit message and save file
        commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
        commit_message = self.generate_commit_message_from_reference(
            commit_messages, target_language, file_name
        )

        print(f"📄 Saving file: {target_filepath}")
        file_result = self.create_or_update_file(
            self.user_owner,
            self.user_repo,
            target_filepath,
            commit_message,
            translated_doc,
            branch_name,
        )

        if not file_result.startswith("SUCCESS"):
            return {
                "status": "error",
                "message": f"File save failed: {file_result}\n\n🎯 Target: {self.user_owner}/{self.user_repo} (expected: {target_language} fork of {self.base_owner}/{self.base_repo})\n🌿 Branch: {branch_name}\n📁 File: {target_filepath}",
                "branch": branch_name,
                "file_path": target_filepath,
                "error_details": file_result,
            }

        print(f"{file_result}")

        # 5. Create PR
        pr_title, pr_body = self.generate_pr_content_from_reference(
            pr_analysis["title"],
            pr_analysis["body"],
            target_language,
            filepath,
            target_filepath,
            file_name,
        )

        print(f"🔄 Creating PR: {pr_title}")
        print(f" Head: {self.user_owner}:{branch_name} → Base: {self.base_owner}:{base_branch}")

        # Create PR from fork to upstream repository
        pr_result = self.create_pull_request(
            self.base_owner, self.base_repo, pr_title, f"{self.user_owner}:{branch_name}", base_branch, pr_body, draft=True
        )

        # From here on the file is already committed, so a PR failure is
        # reported as "partial_success" rather than "error".
        if pr_result.startswith("ERROR"):
            print(f"❌ {pr_result}")
            return {
                "status": "partial_success",
                "branch": branch_name,
                "file_path": target_filepath,
                "message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
                "error_details": pr_result,
            }
        elif "successful" in pr_result and "http" in pr_result:
            print(f"{pr_result}")
            return {
                "status": "success",
                "branch": branch_name,
                "file_path": target_filepath,
                # The URL is whatever follows the last ": " in the result.
                "pr_url": pr_result.split(": ")[-1],
                "message": "Translation document PR created successfully!",
            }
        else:
            return {
                "status": "partial_success",
                "branch": branch_name,
                "file_path": target_filepath,
                "message": "File was saved but PR creation failed.",
            }

    except Exception as e:
        # NOTE(review): `filepath` is a parameter, so the locals() check
        # below is always true and 'Unknown' is never used.
        return {
            "status": "error",
            "message": f"Workflow execution failed: {str(e)}\n\nConfig: {self.user_owner}/{self.user_repo} → {self.base_owner}/{self.base_repo}\nFile: {filepath if 'filepath' in locals() else 'Unknown'}",
            "error_details": str(e),
        }
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
# Backward compatibility functions (maintain compatibility with existing code)
|
| 552 |
-
# Shared module-level agent used by every wrapper below. It is constructed
# without arguments, so its user_owner/user_repo/base_owner/base_repo are None.
_agent = GitHubPRAgent()


def get_github_client():
    """Return the shared agent's GitHub client."""
    return _agent.github_client


def create_pull_request_func(*args, **kwargs):
    """Forward to ``GitHubPRAgent.create_pull_request`` on the shared agent."""
    return _agent.create_pull_request(*args, **kwargs)


def create_branch_func(*args, **kwargs):
    """Forward to ``GitHubPRAgent.create_branch`` on the shared agent."""
    return _agent.create_branch(*args, **kwargs)


def create_or_update_file_func(*args, **kwargs):
    """Forward to ``GitHubPRAgent.create_or_update_file`` on the shared agent."""
    return _agent.create_or_update_file(*args, **kwargs)


def analyze_reference_pr_func(*args, **kwargs):
    """Forward to ``GitHubPRAgent.analyze_reference_pr`` on the shared agent."""
    return _agent.analyze_reference_pr(*args, **kwargs)


def generate_branch_name_from_reference(*args, **kwargs):
    """Forward to ``GitHubPRAgent.generate_branch_name_from_reference``."""
    return _agent.generate_branch_name_from_reference(*args, **kwargs)


def generate_pr_content_from_reference(*args, **kwargs):
    """Forward to ``GitHubPRAgent.generate_pr_content_from_reference``."""
    return _agent.generate_pr_content_from_reference(*args, **kwargs)


def generate_default_pr_content(*args, **kwargs):
    """Forward to the agent's private ``_generate_default_pr_content``."""
    return _agent._generate_default_pr_content(*args, **kwargs)


def generate_commit_message_from_reference(*args, **kwargs):
    """Forward to ``GitHubPRAgent.generate_commit_message_from_reference``."""
    return _agent.generate_commit_message_from_reference(*args, **kwargs)


def get_branch_info(*args, **kwargs):
    """Forward to ``GitHubPRAgent.get_branch_info`` on the shared agent."""
    return _agent.get_branch_info(*args, **kwargs)


def run_translation_pr_agent_simple(*args, **kwargs):
    """Forward to ``GitHubPRAgent.run_translation_pr_workflow``."""
    return _agent.run_translation_pr_workflow(*args, **kwargs)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub PR creation agent using Langchain.
|
| 3 |
+
This code integrates with the actual GitHub API using the PyGithub library.
|
| 4 |
+
Please set the GITHUB_TOKEN environment variable and install required libraries before running.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import re
|
| 9 |
+
import json
|
| 10 |
+
from typing import Optional, Dict, List, Tuple, Any
|
| 11 |
+
|
| 12 |
+
# Load environment variables from .env file
|
| 13 |
+
from dotenv import load_dotenv
|
| 14 |
+
from translator.content import llm_translate
|
| 15 |
+
|
| 16 |
+
load_dotenv()
|
| 17 |
+
|
| 18 |
+
# Constants definition
|
| 19 |
+
ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
|
| 20 |
+
DEFAULT_TEMPERATURE = 0.0
|
| 21 |
+
|
| 22 |
+
# Library imports and error handling
|
| 23 |
+
try:
|
| 24 |
+
from github import Github, GithubException
|
| 25 |
+
from github.GitRef import GitRef
|
| 26 |
+
from langchain_anthropic import ChatAnthropic
|
| 27 |
+
|
| 28 |
+
REQUIRED_LIBS_AVAILABLE = True
|
| 29 |
+
except ImportError as e:
|
| 30 |
+
print(f"Required libraries are not installed: {e}")
|
| 31 |
+
print("Please run: pip install PyGithub boto3 langchain-anthropic")
|
| 32 |
+
REQUIRED_LIBS_AVAILABLE = False
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class GitHubPRAgent:
|
| 36 |
+
"""Agent class for GitHub PR creation"""
|
| 37 |
+
|
| 38 |
+
def __init__(self, user_owner: str = None, user_repo: str = None, base_owner: str = None, base_repo: str = None):
|
| 39 |
+
self._github_client = None
|
| 40 |
+
self._llm = None
|
| 41 |
+
self.user_owner = user_owner
|
| 42 |
+
self.user_repo = user_repo
|
| 43 |
+
self.base_owner = base_owner
|
| 44 |
+
self.base_repo = base_repo
|
| 45 |
+
|
| 46 |
+
@property
def github_client(self) -> "Optional[Github]":
    """Return GitHub API client with lazy initialization.

    The return annotation is a string so that it is not evaluated while the
    class body executes; an unquoted ``Github`` raises NameError there when
    the guarded PyGithub import has failed.

    Raises:
        ImportError: The optional libraries could not be imported.
    """
    if not REQUIRED_LIBS_AVAILABLE:
        raise ImportError("Required libraries not found.")

    if self._github_client is None:
        token = os.environ.get("GITHUB_TOKEN")
        if not token:
            print("Warning: GITHUB_TOKEN environment variable not set.")
            # The anonymous client is returned without being stored, so the
            # environment is consulted again on the next access.
            return Github()  # Limited access
        self._github_client = Github(token)

    return self._github_client
|
| 60 |
+
|
| 61 |
+
@property
def llm(self):
    """Return the chat model, constructing it on the first access only.

    Raises:
        ImportError: The optional libraries could not be imported.
    """
    if not REQUIRED_LIBS_AVAILABLE:
        raise ImportError("Required libraries not found.")

    if self._llm is not None:
        return self._llm

    self._llm = ChatAnthropic(
        model=ANTHROPIC_MODEL_ID,
        temperature=DEFAULT_TEMPERATURE,
    )
    return self._llm
|
| 73 |
+
|
| 74 |
+
def _handle_github_error(self, e: Exception, operation: str) -> str:
    """Handle GitHub API errors consistently.

    Args:
        e: The exception raised during the GitHub operation.
        operation: Short label of the attempted operation.

    Returns:
        ``"<operation> failed: <status> <detail>"`` for ``GithubException``,
        otherwise ``"Unexpected error during <operation>: <error>"``.
    """
    if not isinstance(e, GithubException):
        return f"Unexpected error during {operation}: {str(e)}"

    data = e.data
    # ``data`` is not guaranteed to be a dict (it can be None or raw text);
    # only use ``.get`` when it is one.
    detail = data.get("message", data) if isinstance(data, dict) else data
    return f"{operation} failed: {e.status} {detail}"
|
| 79 |
+
|
| 80 |
+
def create_pull_request(
    self,
    owner: str,
    repo_name: str,
    title: str,
    head: str,
    base: str,
    body: str = "",
    draft: bool = False,
    maintainer_can_modify: bool = True,
) -> str:
    """Create a new Pull Request after a series of sanity checks.

    Args:
        owner: Owner of the repository that receives the PR.
        repo_name: Name of the repository that receives the PR.
        title: PR title.
        head: Source branch, either ``branch`` or ``fork_owner:branch``.
        base: Target branch in ``owner/repo_name``.
        body: PR description.
        draft: Whether to open the PR as a draft.
        maintainer_can_modify: Whether maintainers may push to the head branch.

    Returns:
        ``"PR creation successful: <url>"`` on success, otherwise a string
        describing the failure (prefixed with ``ERROR:`` for the cases
        detected here).
    """
    try:
        # 1. Check if head and base are the same
        if head == base:
            return f"ERROR: head branch ({head}) and base branch ({base}) are identical."

        # 2. Check for existing PR
        existing_pr = self.check_existing_pr(owner, repo_name, head, base)
        if existing_pr:
            return f"ERROR: {existing_pr}"

        # 3. Verify head and base branches exist
        repo = self.github_client.get_repo(f"{owner}/{repo_name}")
        try:
            # For fork-to-upstream PR, head format is "fork_owner:branch_name"
            if ":" in head:
                fork_owner, branch_name = head.split(":", 1)
                # A fork may have been renamed; when the head belongs to the
                # configured fork owner, look it up under the configured name.
                fork_repo_name = repo_name
                if fork_owner == getattr(self, "user_owner", None) and getattr(self, "user_repo", None):
                    fork_repo_name = self.user_repo
                fork_repo = self.github_client.get_repo(f"{fork_owner}/{fork_repo_name}")
                head_branch = fork_repo.get_branch(branch_name)
            else:
                head_branch = repo.get_branch(head)

            base_branch = repo.get_branch(base)

            # 4. Check if head and base branches point to the same commit
            if head_branch.commit.sha == base_branch.commit.sha:
                return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."

        except GithubException as e:
            if e.status == 404:
                return f"ERROR: Branch not found. head: {head}, base: {base}"
            # Verification is best effort: any other API error is reported
            # and PR creation is still attempted.
            print(f"⚠️ Branch verification skipped: {e.status} {e.data}")

        # 5. Create PR
        pr = repo.create_pull(
            title=title,
            body=body,
            head=head,
            base=base,
            draft=draft,
            maintainer_can_modify=maintainer_can_modify,
        )
        return f"PR creation successful: {pr.html_url}"
    except GithubException as e:
        if e.status == 422:
            payload = e.data if isinstance(e.data, dict) else {}
            error_msg = payload.get("message", "Unknown error")
            errors = payload.get("errors") or []

            # Entries are only used when they are dicts carrying a message;
            # anything else would raise inside this handler.
            error_details = [
                error["message"]
                for error in errors
                if isinstance(error, dict) and "message" in error
            ]

            detail_msg = " | ".join(error_details) if error_details else ""
            return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
        return self._handle_github_error(e, "PR creation")
    except Exception as e:
        return self._handle_github_error(e, "PR creation")
|
| 148 |
+
|
| 149 |
+
def create_branch(
    self, owner: str, repo_name: str, branch_name: str, source_sha: str
) -> str:
    """Create ``branch_name`` in ``owner/repo_name`` pointing at ``source_sha``.

    Returns:
        A ``SUCCESS:``/``WARNING:``/``ERROR:`` prefixed message, or the text
        produced by ``_handle_github_error`` for other failures.
    """
    try:
        target_repo = self.github_client.get_repo(f"{owner}/{repo_name}")
        created_ref = target_repo.create_git_ref(
            ref=f"refs/heads/{branch_name}", sha=source_sha
        )

        if not isinstance(created_ref, GitRef):
            return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
        return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {created_ref.ref})"
    except GithubException as err:
        # A 422 with this text means the ref is already there.
        already_exists = err.status == 422 and "Reference already exists" in str(err.data)
        if already_exists:
            return f"WARNING: Branch '{branch_name}' already exists."
        return self._handle_github_error(err, "branch creation")
    except Exception as err:
        return self._handle_github_error(err, "branch creation")
|
| 167 |
+
|
| 168 |
+
def check_existing_pr(
    self, owner: str, repo_name: str, head: str, base: str
) -> Optional[str]:
    """Look for an open PR in ``owner/repo_name`` with the same head and base.

    Returns:
        A message containing the URL of the first matching PR, or ``None``
        when there is none or the lookup itself failed.
    """
    try:
        target_repo = self.github_client.get_repo(f"{owner}/{repo_name}")
        # A bare branch name is qualified with the repository owner; a value
        # already in "fork_owner:branch" form is used as passed.
        qualified_head = f"{owner}:{head}" if ":" not in head else head
        open_prs = target_repo.get_pulls(state="open", head=qualified_head, base=base)
        first_match = next(iter(open_prs), None)
        if first_match is None:
            return None
        return f"Existing PR found: {first_match.html_url}"
    except Exception as e:
        print(f"⚠️ Error checking existing PR: {str(e)}")
        return None
|
| 183 |
+
|
| 184 |
+
def create_or_update_file(
    self,
    owner: str,
    repo_name: str,
    path: str,
    message: str,
    content: str,
    branch_name: Optional[str] = None,
    sha_blob: Optional[str] = None,
) -> str:
    """Create or update a single file through the GitHub contents API.

    With ``sha_blob`` the file is updated directly. Without it a create is
    attempted first; if GitHub answers 422, the current blob SHA is fetched
    and an update is tried instead.

    Args:
        owner: Owner of the repository to write to.
        repo_name: Name of the repository to write to.
        path: Path of the file inside the repository.
        message: Commit message.
        content: New file content.
        branch_name: Branch to commit to; omitted from the request if empty.
        sha_blob: Blob SHA of the existing file when it is already known.

    Returns:
        A message starting with ``SUCCESS:`` or ``ERROR:``.
    """
    repo = None
    args = {
        "path": path,
        "message": message,
        "content": content,
    }
    try:
        repo = self.github_client.get_repo(f"{owner}/{repo_name}")

        if branch_name:
            args["branch"] = branch_name

        # Try to update file
        if sha_blob:
            args["sha"] = sha_blob
            repo.update_file(**args)
            return f"SUCCESS: File updated - {path}"

        # Try to create file
        repo.create_file(**args)
        return f"SUCCESS: File created - {path}"

    except GithubException as e:
        # Try to update if file already exists
        if e.status == 422 and repo is not None:
            try:
                existing_file = repo.get_contents(
                    path, ref=branch_name or repo.default_branch
                )
                args["sha"] = existing_file.sha
                repo.update_file(**args)
                return f"SUCCESS: File updated - {path}"
            except Exception as update_error:
                # Surface the reason instead of discarding it; the result
                # string below is unchanged.
                print(f"⚠️ Error updating existing file {path}: {update_error}")
        else:
            print(f"⚠️ GitHub error while writing {path}: {e.status} {e.data}")
        return f"ERROR: File processing failed - {path}"
    except Exception as e:
        print(f"⚠️ Unexpected error while writing {path}: {e}")
        return f"ERROR: File processing failed - {path}"
|
| 231 |
+
|
| 232 |
+
def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
    """Fetch a reference PR and summarise the parts used as a style template.

    Returns:
        A dict with ``title``, ``body``, ``head_branch``, ``base_branch``,
        ``files_changed`` and ``commits``; or a dict with a single ``error``
        key when the URL does not match or anything fails.
    """
    try:
        # Parse PR URL
        parsed = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
        if parsed is None:
            return {"error": f"Invalid PR URL format: {pr_url}"}

        owner = parsed.group(1)
        repo_name = parsed.group(2)
        pr_number = int(parsed.group(3))
        reference = self.github_client.get_repo(f"{owner}/{repo_name}").get_pull(pr_number)

        summary = {
            "title": reference.title,
            "body": reference.body,
            "head_branch": reference.head.ref,
            "base_branch": reference.base.ref,
        }
        summary["files_changed"] = [changed.filename for changed in reference.get_files()]
        commit_entries = []
        for item in reference.get_commits():
            commit_entries.append({"message": item.commit.message, "sha": item.sha})
        summary["commits"] = commit_entries
        return summary
    except Exception as e:
        return {"error": f"Error occurred during PR analysis: {str(e)}"}
|
| 257 |
+
|
| 258 |
+
def _generate_with_llm(
    self, prompt: str, fallback_value: str, operation: str
) -> str:
    """Run ``prompt`` through ``llm_translate`` and return the stripped text.

    Any exception is reported on stdout and ``fallback_value`` is returned
    instead.
    """
    try:
        _usage, raw_text = llm_translate(prompt)
        result = raw_text.strip()
        print(f"LLM generated {operation}: {result}")
    except Exception as e:
        print(f"❌ Error generating {operation} with LLM: {e}")
        print(f"Using fallback value: {fallback_value}")
        return fallback_value
    return result
|
| 271 |
+
|
| 272 |
+
def generate_branch_name_from_reference(
    self, reference_branch_name: str, target_language: str, file_name: str
) -> str:
    """Build ``<target_language>-<file_name>`` with underscores turned into hyphens.

    ``reference_branch_name`` is accepted but not used by this template.
    """
    # The extension is kept; only underscores are rewritten.
    hyphenated = "-".join(file_name.split("_"))
    return f"{target_language}-{hyphenated}"
|
| 279 |
+
|
| 280 |
+
def generate_pr_content_from_reference(
    self,
    reference_title: str,
    reference_body: str,
    target_language: str,
    filepath: str,
    target_filepath: str,
    file_name: str,
) -> Tuple[str, str]:
    """Ask the LLM for a PR title and body styled after a reference PR.

    The response is expected as a ``Title:`` line followed by a ``Body:``
    section. Everything after the ``Body:`` marker is taken verbatim, so body
    lines that themselves start with ``Title:`` or ``Body:`` are preserved.

    Args:
        reference_title: Title of the reference PR.
        reference_body: Body of the reference PR; ``None`` is treated as empty.
        target_language: Language the document was translated into.
        filepath: Path of the original document.
        target_filepath: Path of the translated document.
        file_name: File name of the document.

    Returns:
        ``(title, body)``. Falls back to the reference title / a one-line body
        when the response lacks a section, and to the default template when
        the LLM call fails.
    """
    # A PR without a description has no body; avoid sending the text "None".
    reference_body = reference_body or ""
    prompt = f"""Here is the reference PR information:

Reference PR title: {reference_title}

Reference PR body:
{reference_body}

Now I need to generate PR title and body for a new translation task:
- Target language: {target_language}
- Original file: {filepath}
- Translation file: {target_filepath}
- File name: {file_name}

Please analyze the style and format of the reference PR to generate consistent new PR title and body.

Requirements:
1. Follow the title format and pattern of the reference PR
2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
3. Appropriately reflect the target language ({target_language}) and file paths
4. If there are user mentions (@username), change them to general text instead of actual mentions
5. Adjust the content to fit the translation task

Response format:
Title: [PR title here]
Body: [PR body here, maintaining the exact markdown format and structure of the original]"""

    try:
        _usage_info, generated_content = llm_translate(prompt)
        generated_content = generated_content.strip()

        # Separate title and body from response
        title_marker = "Title:"
        body_marker = "Body:"
        title_line = ""
        body_lines = []
        parsing_body = False

        for line in generated_content.split("\n"):
            if parsing_body:
                # Inside the body nothing is interpreted as a marker.
                body_lines.append(line)
            elif line.startswith(title_marker):
                # Strip only the leading marker, not later occurrences.
                title_line = line[len(title_marker):].strip()
            elif line.startswith(body_marker):
                parsing_body = True
                body_content = line[len(body_marker):].strip()
                if body_content:
                    body_lines.append(body_content)

        generated_title = title_line if title_line else reference_title
        generated_body = (
            "\n".join(body_lines)
            if body_lines
            else f"Add {target_language} translation for `{filepath}`."
        )

        print(f"LLM generated PR title: {generated_title}")
        print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")

        return generated_title, generated_body

    except Exception as e:
        print(f"❌ Error generating PR content with LLM: {e}")
        return self._generate_default_pr_content(
            target_language, filepath, target_filepath, file_name
        )
|
| 354 |
+
|
| 355 |
+
def _generate_default_pr_content(
    self, target_language: str, filepath: str, target_filepath: str, file_name: str
) -> Tuple[str, str]:
    """Build the PR title and body from the built-in template.

    Only ``target_language``, ``filepath`` and ``file_name`` are interpolated;
    ``target_filepath`` is accepted but not used by the template.

    Returns:
        ``(title, body)`` where ``body`` is a Markdown checklist.
    """
    title = f"🌐 [i18n-{target_language}] Translated `{file_name}` to {target_language}"
    # The body is user-facing text posted to GitHub; keep it verbatim.
    body = f"""# What does this PR do?

Translated the `{filepath}` file of the documentation to {target_language} 😄
Thank you in advance for your review!

Part of https://github.com/huggingface/transformers/issues/20179

## Before reviewing
- [x] Check for missing / redundant translations (번역 누락/중복 검사)
- [x] Grammar Check (맞춤법 검사)
- [x] Review or Add new terms to glossary (용어 확인 및 추가)
- [x] Check Inline TOC (e.g. `[[lowercased-header]]`)
- [x] Check live-preview for gotchas (live-preview로 정상작동 확인)

## Who can review? (Initial)
{target_language} translation reviewers

## Before submitting
- [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
Pull Request section?
- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [x] Did you make sure to update the documentation with your changes? Here are the
[documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
[here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?

## Who can review? (Final)
May you please review this PR?
Documentation maintainers
"""
    return title, body
|
| 393 |
+
|
| 394 |
+
def generate_commit_message_from_reference(
    self, commit_messages: List[str], target_language: str, file_name: str
) -> str:
    """Return the templated commit message ``docs: <language>: <file>``.

    ``commit_messages`` is accepted but not consulted by this template.
    """
    commit_message = f"docs: {target_language}: {file_name}"
    return commit_message
|
| 399 |
+
|
| 400 |
+
def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
    """Describe the latest commit of ``branch_name`` in ``owner/repo_name``.

    Returns:
        A multi-line summary, or a failure message if anything raises.
    """
    try:
        head_commit = (
            self.github_client.get_repo(f"{owner}/{repo_name}")
            .get_branch(branch_name)
            .commit
        )
        details = head_commit.commit

        short_sha = head_commit.sha[:8]
        # First line of the commit message, capped at 80 characters.
        subject = details.message.split("\n")[0][:80]
        author_name = details.author.name
        authored_at = details.author.date.strftime('%Y-%m-%d %H:%M:%S')

        return f"""
📋 Existing branch information:
- Branch name: {branch_name}
- Latest commit: {short_sha}
- Commit message: {subject}...
- Author: {author_name}
- Date: {authored_at}
"""
    except Exception as e:
        return f"Failed to retrieve branch information: {str(e)}"
|
| 418 |
+
|
| 419 |
+
def run_translation_pr_workflow(
|
| 420 |
+
self,
|
| 421 |
+
reference_pr_url: str,
|
| 422 |
+
target_language: str,
|
| 423 |
+
filepath: str,
|
| 424 |
+
translated_doc: str,
|
| 425 |
+
base_branch: str = "main",
|
| 426 |
+
) -> Dict[str, Any]:
|
| 427 |
+
"""Execute translation document PR creation workflow."""
|
| 428 |
+
try:
|
| 429 |
+
# 1. Analyze reference PR
|
| 430 |
+
print(f"🔍 Analyzing reference PR: {reference_pr_url}")
|
| 431 |
+
pr_analysis = self.analyze_reference_pr(reference_pr_url)
|
| 432 |
+
|
| 433 |
+
if "error" in pr_analysis:
|
| 434 |
+
return {"status": "error", "message": pr_analysis["error"]}
|
| 435 |
+
|
| 436 |
+
print("Reference PR analysis completed")
|
| 437 |
+
|
| 438 |
+
# 2. Generate translation file path and branch name
|
| 439 |
+
target_filepath = filepath.replace("/en/", f"/{target_language}/")
|
| 440 |
+
file_name = filepath.split("/")[-1] # Keep .md extension
|
| 441 |
+
|
| 442 |
+
print(f"🌿 Generating branch name...")
|
| 443 |
+
branch_name = self.generate_branch_name_from_reference(
|
| 444 |
+
pr_analysis["head_branch"], target_language, file_name
|
| 445 |
+
)
|
| 446 |
+
|
| 447 |
+
# 3. Get main branch SHA from upstream and create branch in fork
|
| 448 |
+
upstream_repo = self.github_client.get_repo(f"{self.base_owner}/{self.base_repo}")
|
| 449 |
+
main_branch = upstream_repo.get_branch(base_branch)
|
| 450 |
+
main_sha = main_branch.commit.sha
|
| 451 |
+
|
| 452 |
+
print(f"🌿 Creating branch: {branch_name} in fork repository")
|
| 453 |
+
branch_result = self.create_branch(self.user_owner, self.user_repo, branch_name, main_sha)
|
| 454 |
+
|
| 455 |
+
# Check branch creation result
|
| 456 |
+
if branch_result.startswith("ERROR"):
|
| 457 |
+
return {
|
| 458 |
+
"status": "error",
|
| 459 |
+
"message": f"Branch creation failed: {branch_result}\n\nTarget: {self.user_owner}/{self.user_repo}\nBranch: {branch_name}\nBase SHA: {main_sha[:8]}",
|
| 460 |
+
"branch": branch_name,
|
| 461 |
+
"error_details": branch_result,
|
| 462 |
+
}
|
| 463 |
+
elif branch_result.startswith("WARNING"):
|
| 464 |
+
print(f"⚠️ {branch_result}")
|
| 465 |
+
# Continue if branch already exists
|
| 466 |
+
elif branch_result.startswith("SUCCESS"):
|
| 467 |
+
print(f"✅ {branch_result}")
|
| 468 |
+
else:
|
| 469 |
+
print(f"⚠️ Unexpected branch creation result: {branch_result}")
|
| 470 |
+
# Continue anyway, might still work
|
| 471 |
+
|
| 472 |
+
# 4. Generate commit message and save file
|
| 473 |
+
commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
|
| 474 |
+
commit_message = self.generate_commit_message_from_reference(
|
| 475 |
+
commit_messages, target_language, file_name
|
| 476 |
+
)
|
| 477 |
+
|
| 478 |
+
print(f"📄 Saving file: {target_filepath}")
|
| 479 |
+
file_result = self.create_or_update_file(
|
| 480 |
+
self.user_owner,
|
| 481 |
+
self.user_repo,
|
| 482 |
+
target_filepath,
|
| 483 |
+
commit_message,
|
| 484 |
+
translated_doc,
|
| 485 |
+
branch_name,
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
if not file_result.startswith("SUCCESS"):
|
| 489 |
+
return {
|
| 490 |
+
"status": "error",
|
| 491 |
+
"message": f"File save failed: {file_result}\n\n🎯 Target: {self.user_owner}/{self.user_repo} (expected: {target_language} fork of {self.base_owner}/{self.base_repo})\n🌿 Branch: {branch_name}\n📁 File: {target_filepath}",
|
| 492 |
+
"branch": branch_name,
|
| 493 |
+
"file_path": target_filepath,
|
| 494 |
+
"error_details": file_result,
|
| 495 |
+
}
|
| 496 |
+
|
| 497 |
+
print(f"{file_result}")
|
| 498 |
+
|
| 499 |
+
# 5. Create PR
|
| 500 |
+
pr_title, pr_body = self.generate_pr_content_from_reference(
|
| 501 |
+
pr_analysis["title"],
|
| 502 |
+
pr_analysis["body"],
|
| 503 |
+
target_language,
|
| 504 |
+
filepath,
|
| 505 |
+
target_filepath,
|
| 506 |
+
file_name,
|
| 507 |
+
)
|
| 508 |
+
|
| 509 |
+
print(f"🔄 Creating PR: {pr_title}")
|
| 510 |
+
print(f" Head: {self.user_owner}:{branch_name} → Base: {self.base_owner}:{base_branch}")
|
| 511 |
+
|
| 512 |
+
# Create PR from fork to upstream repository
|
| 513 |
+
pr_result = self.create_pull_request(
|
| 514 |
+
self.base_owner, self.base_repo, pr_title, f"{self.user_owner}:{branch_name}", base_branch, pr_body, draft=True
|
| 515 |
+
)
|
| 516 |
+
|
| 517 |
+
if pr_result.startswith("ERROR"):
|
| 518 |
+
print(f"❌ {pr_result}")
|
| 519 |
+
return {
|
| 520 |
+
"status": "partial_success",
|
| 521 |
+
"branch": branch_name,
|
| 522 |
+
"file_path": target_filepath,
|
| 523 |
+
"message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
|
| 524 |
+
"error_details": pr_result,
|
| 525 |
+
}
|
| 526 |
+
elif "successful" in pr_result and "http" in pr_result:
|
| 527 |
+
print(f"{pr_result}")
|
| 528 |
+
return {
|
| 529 |
+
"status": "success",
|
| 530 |
+
"branch": branch_name,
|
| 531 |
+
"file_path": target_filepath,
|
| 532 |
+
"pr_url": pr_result.split(": ")[-1],
|
| 533 |
+
"message": "Translation document PR created successfully!",
|
| 534 |
+
}
|
| 535 |
+
else:
|
| 536 |
+
return {
|
| 537 |
+
"status": "partial_success",
|
| 538 |
+
"branch": branch_name,
|
| 539 |
+
"file_path": target_filepath,
|
| 540 |
+
"message": "File was saved but PR creation failed.",
|
| 541 |
+
}
|
| 542 |
+
|
| 543 |
+
except Exception as e:
|
| 544 |
+
return {
|
| 545 |
+
"status": "error",
|
| 546 |
+
"message": f"Workflow execution failed: {str(e)}\n\nConfig: {self.user_owner}/{self.user_repo} → {self.base_owner}/{self.base_repo}\nFile: {filepath if 'filepath' in locals() else 'Unknown'}",
|
| 547 |
+
"error_details": str(e),
|
| 548 |
+
}
|
| 549 |
+
|
| 550 |
+
|
| 551 |
+
# Backward compatibility functions (maintain compatibility with existing code).
# The shared agent is built with no arguments, so its user/base owner and
# repository attributes are all None.
_agent = GitHubPRAgent()
|
| 553 |
+
|
| 554 |
+
|
| 555 |
+
def get_github_client():
    """Return the ``github_client`` of the module-level ``_agent``."""
    return _agent.github_client
|
| 557 |
+
|
| 558 |
+
|
| 559 |
+
def create_pull_request_func(*args, **kwargs):
    """Forward all arguments to ``_agent.create_pull_request``."""
    return _agent.create_pull_request(*args, **kwargs)
|
| 561 |
+
|
| 562 |
+
|
| 563 |
+
def create_branch_func(*args, **kwargs):
    """Forward all arguments to ``_agent.create_branch``."""
    return _agent.create_branch(*args, **kwargs)
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
def create_or_update_file_func(*args, **kwargs):
    """Forward all arguments to ``_agent.create_or_update_file``."""
    return _agent.create_or_update_file(*args, **kwargs)
|
| 569 |
+
|
| 570 |
+
|
| 571 |
+
def analyze_reference_pr_func(*args, **kwargs):
    """Forward all arguments to ``_agent.analyze_reference_pr``."""
    return _agent.analyze_reference_pr(*args, **kwargs)
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
def generate_branch_name_from_reference(*args, **kwargs):
    """Forward all arguments to ``_agent.generate_branch_name_from_reference``."""
    return _agent.generate_branch_name_from_reference(*args, **kwargs)
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
def generate_pr_content_from_reference(*args, **kwargs):
    """Forward all arguments to ``_agent.generate_pr_content_from_reference``."""
    return _agent.generate_pr_content_from_reference(*args, **kwargs)
|
| 581 |
+
|
| 582 |
+
|
| 583 |
+
def generate_default_pr_content(*args, **kwargs):
    """Forward all arguments to ``_agent._generate_default_pr_content``."""
    return _agent._generate_default_pr_content(*args, **kwargs)
|
| 585 |
+
|
| 586 |
+
|
| 587 |
+
def generate_commit_message_from_reference(*args, **kwargs):
    """Forward all arguments to ``_agent.generate_commit_message_from_reference``."""
    return _agent.generate_commit_message_from_reference(*args, **kwargs)
|
| 589 |
+
|
| 590 |
+
|
| 591 |
+
def get_branch_info(*args, **kwargs):
    """Forward all arguments to ``_agent.get_branch_info``."""
    return _agent.get_branch_info(*args, **kwargs)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
def run_translation_pr_agent_simple(*args, **kwargs):
    """Forward all arguments to ``_agent.run_translation_pr_workflow``."""
    return _agent.run_translation_pr_workflow(*args, **kwargs)
|
pr_generator/searcher.py
CHANGED
|
@@ -1,238 +1,238 @@
|
|
| 1 |
-
"""
|
| 2 |
-
GitHub PR Search Agent
|
| 3 |
-
An agent that finds a suitable reference PR when a reference PR URL is not provided.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
import re
|
| 8 |
-
import logging
|
| 9 |
-
from typing import List, Dict, Any, Optional
|
| 10 |
-
|
| 11 |
-
# Load environment variables
|
| 12 |
-
from dotenv import load_dotenv
|
| 13 |
-
|
| 14 |
-
load_dotenv()
|
| 15 |
-
|
| 16 |
-
# Setup logging
|
| 17 |
-
logging.basicConfig(
|
| 18 |
-
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
| 19 |
-
)
|
| 20 |
-
logger = logging.getLogger(__name__)
|
| 21 |
-
|
| 22 |
-
# Langchain imports
|
| 23 |
-
try:
|
| 24 |
-
from langchain_anthropic import ChatAnthropic
|
| 25 |
-
from langchain.tools import StructuredTool
|
| 26 |
-
from langchain.agents import AgentExecutor, create_tool_calling_agent
|
| 27 |
-
from langchain_core.prompts import ChatPromptTemplate
|
| 28 |
-
from github import Github
|
| 29 |
-
|
| 30 |
-
REQUIRED_LIBS_AVAILABLE = True
|
| 31 |
-
except ImportError as e:
|
| 32 |
-
print(f"Required libraries are not installed: {e}")
|
| 33 |
-
REQUIRED_LIBS_AVAILABLE = False
|
| 34 |
-
|
| 35 |
-
# Constants
|
| 36 |
-
ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
|
| 37 |
-
DEFAULT_TEMPERATURE = 0.0
|
| 38 |
-
# Fallback PR URL to ensure a PR is always returned
|
| 39 |
-
DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class GitHubPRSearcher:
|
| 43 |
-
"""GitHub PR Searcher - now using a LangChain agent."""
|
| 44 |
-
|
| 45 |
-
def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
    """
    Run a GitHub issue/PR search and summarise at most five hits.

    Each hit becomes a dict with ``title``, ``url`` and ``number``. On failure
    a one-element list holding an ``error`` dict is returned instead.
    """
    logger.info(f"Executing GitHub search with query: {query}")
    try:
        first_page = self.github_client.search_issues(query=query).get_page(0)
        # Only a handful of results are kept so the agent context stays small.
        hits = first_page[:5]
        if not hits:
            return []

        summaries = []
        for hit in hits:
            summaries.append(
                {"title": hit.title, "url": hit.html_url, "number": hit.number}
            )
        return summaries
    except Exception as e:
        logger.error(f"Error during GitHub search: {e}", exc_info=True)
        # The error is returned as data so the agent can read it.
        return [{"error": f"An error occurred during search: {e}"}]
|
| 67 |
-
|
| 68 |
-
def __init__(self):
    """Build the LLM, the search tool and the tool-calling agent executor.

    Raises:
        ImportError: If the optional third-party libraries failed to import.
    """
    if not REQUIRED_LIBS_AVAILABLE:
        raise ImportError("Required libraries for agent could not be found.")

    # The GitHub client is created lazily by the ``github_client`` property.
    self._github_client = None
    self.llm = ChatAnthropic(
        model=ANTHROPIC_MODEL_ID,
        temperature=DEFAULT_TEMPERATURE,
    )

    # Expose the bound search method to the agent as its only tool.
    search_tool = StructuredTool.from_function(
        func=self._search_github_prs,
        name="search_github_prs",
        description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
    )
    tools = [search_tool]

    # System prompt; the {placeholders} are filled from the agent input dict.
    prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.

You need to find a merged PR in the repository: {owner}/{repo_name}.
The PR should be for a documentation translation into **{target_language}**.
The context for the translation is: **{context}**.

Use the tools at your disposal to search for relevant PRs.
Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.

Here is an example of a good search query you could use:
`repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`

After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", prompt_string),
            (
                "human",
                "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
            ),
            ("placeholder", "{agent_scratchpad}"),
        ]
    )

    agent = create_tool_calling_agent(self.llm, tools, prompt)
    self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
|
| 112 |
-
|
| 113 |
-
@property
def github_client(self) -> Optional[Github]:
    """Return the cached GitHub API client, creating it on first access.

    Raises:
        ImportError: If the optional third-party libraries failed to import.
    """
    if not REQUIRED_LIBS_AVAILABLE:
        raise ImportError("Required libraries could not be found.")

    if self._github_client is not None:
        return self._github_client

    access_token = os.environ.get("GITHUB_TOKEN")
    if access_token:
        self._github_client = Github(access_token)
    else:
        print("Warning: GITHUB_TOKEN environment variable is not set.")
        self._github_client = Github()  # Limited access
    return self._github_client
|
| 127 |
-
|
| 128 |
-
def find_best_reference_pr(
    self, owner: str, repo_name: str, target_language: str, context: str
):
    """
    Find the best reference PR using the LangChain agent.

    This is a generator: it yields human-readable progress messages and its
    return value (available through ``yield from`` or ``StopIteration.value``)
    is the selected PR URL. ``DEFAULT_FALLBACK_PR_URL`` is returned when the
    agent produces no output, no PR URL can be found in it, or an exception
    is raised while the agent runs.
    """
    message = "🤖 Agent is searching for the best reference PR..."
    logger.info(message)
    yield message

    try:
        agent_input = {
            "owner": owner,
            "repo_name": repo_name,
            "target_language": target_language,
            "context": context,
        }

        agent_output = None
        for event in self.agent_executor.stream(agent_input):
            if "actions" in event and event["actions"]:
                action = event["actions"][0]
                tool_input = action.tool_input
                # The tool input is only queried for "query" when it is a
                # dict; any other shape is shown as text rather than failing
                # the whole search over a progress message.
                if isinstance(tool_input, dict):
                    tool_query = tool_input.get("query", str(tool_input))
                else:
                    tool_query = str(tool_input)
                message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
                logger.info(message)
                yield message
            elif "steps" in event and event["steps"]:
                message = "📊 Agent is analyzing the results from the tool..."
                logger.info(message)
                yield message
            elif "output" in event and event["output"]:
                agent_output = event["output"]

        if not agent_output:
            message = "⚠️ Agent failed to find a suitable PR. Using default PR."
            logger.warning(message)
            yield message
            return DEFAULT_FALLBACK_PR_URL

        # The agent's final output can be a string, a list of tool results,
        # or a list of content blocks from the LLM. We'll find the URL
        # by searching for it in the string representation of the output.
        output_text = str(agent_output)
        urls = re.findall(r"https?://github\.com/[^/]+/[^/]+/pull/\d+", output_text)

        # Take the last URL found
        final_url = urls[-1] if urls else ""

        if not final_url:
            message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
            logger.warning(message)
            yield message
            return DEFAULT_FALLBACK_PR_URL

        message = f"✅ Selected the best PR:\n`{final_url}`"
        logger.info(f"Selected the best PR: {final_url}")
        yield message
        return final_url

    except Exception as e:
        message = f"❌ Error during agent execution: {e}\nUsing default PR."
        logger.error(message, exc_info=True)
        yield message
        return DEFAULT_FALLBACK_PR_URL
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
    """
    Stream the search for a reference PR in 'huggingface/transformers'.

    Progress messages from the searcher are re-yielded unchanged; the
    generator's return value is a result dict wrapping the chosen URL.
    """
    # ``yield from`` forwards every progress message and hands back the
    # inner generator's return value (the final URL).
    best_pr_url = yield from GitHubPRSearcher().find_best_reference_pr(
        "huggingface", "transformers", target_language, context
    )

    # Shape expected by the consumer of this generator.
    outcome = {
        "status": "success",
        "result": f"Recommended PR URL: {best_pr_url}",
        "repository": "huggingface/transformers",
        "target_language": target_language,
    }
    return outcome
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
# Example usage
|
| 218 |
-
if __name__ == "__main__":
    # Manual smoke run: drive the streaming generator by hand, the way a
    # real consumer would, and show what it yields and finally returns.
    print("--- Running Streaming Search Simulation ---")

    def run_simulation():
        """Print each progress message, then the generator's return value."""
        progress = find_reference_pr_simple_stream(
            target_language="korean", context="docs"
        )
        while True:
            try:
                update = next(progress)
            except StopIteration as finished:
                # Exhaustion carries the final result in ``.value``.
                print("\n--- FINAL RESULT ---")
                print(finished.value)
                break
            print(update)

    run_simulation()
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
GitHub PR Search Agent
|
| 3 |
+
An agent that finds a suitable reference PR when a reference PR URL is not provided.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import re
|
| 8 |
+
import logging
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
|
| 11 |
+
# Load environment variables
|
| 12 |
+
from dotenv import load_dotenv
|
| 13 |
+
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
# Setup logging
|
| 17 |
+
logging.basicConfig(
|
| 18 |
+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
|
| 19 |
+
)
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
# Langchain imports
|
| 23 |
+
try:
|
| 24 |
+
from langchain_anthropic import ChatAnthropic
|
| 25 |
+
from langchain.tools import StructuredTool
|
| 26 |
+
from langchain.agents import AgentExecutor, create_tool_calling_agent
|
| 27 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 28 |
+
from github import Github
|
| 29 |
+
|
| 30 |
+
REQUIRED_LIBS_AVAILABLE = True
|
| 31 |
+
except ImportError as e:
|
| 32 |
+
print(f"Required libraries are not installed: {e}")
|
| 33 |
+
REQUIRED_LIBS_AVAILABLE = False
|
| 34 |
+
|
| 35 |
+
# Constants
|
| 36 |
+
ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
|
| 37 |
+
DEFAULT_TEMPERATURE = 0.0
|
| 38 |
+
# Fallback PR URL to ensure a PR is always returned
|
| 39 |
+
DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
class GitHubPRSearcher:
    """Locate a reference pull request on GitHub with a LangChain agent.

    The constructor wires a Claude chat model to a single tool,
    ``search_github_prs``, and wraps both in an ``AgentExecutor``.
    ``find_best_reference_pr`` streams that agent and falls back to
    ``DEFAULT_FALLBACK_PR_URL`` whenever no PR URL can be obtained.
    """

    # Pull-request URLs of the form https://github.com/<owner>/<repo>/pull/<n>.
    _PR_URL_PATTERN = re.compile(r"https?://github\.com/[^/]+/[^/]+/pull/\d+")

    def __init__(self):
        """Build the LLM, the search tool and the agent executor.

        Raises:
            ImportError: if the optional agent libraries failed to import.
        """
        if not REQUIRED_LIBS_AVAILABLE:
            raise ImportError("Required libraries for agent could not be found.")

        self._github_client = None
        self.llm = ChatAnthropic(
            model=ANTHROPIC_MODEL_ID,
            temperature=DEFAULT_TEMPERATURE,
        )

        search_tool = StructuredTool.from_function(
            func=self._search_github_prs,
            name="search_github_prs",
            description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
        )
        tools = [search_tool]

        prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.

You need to find a merged PR in the repository: {owner}/{repo_name}.
The PR should be for a documentation translation into **{target_language}**.
The context for the translation is: **{context}**.

Use the tools at your disposal to search for relevant PRs.
Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.

Here is an example of a good search query you could use:
`repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`

After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""

        prompt = ChatPromptTemplate.from_messages(
            [
                ("system", prompt_string),
                (
                    "human",
                    "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
                ),
                ("placeholder", "{agent_scratchpad}"),
            ]
        )

        agent = create_tool_calling_agent(self.llm, tools, prompt)
        self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

    @property
    def github_client(self) -> "Optional[Github]":
        """Lazily create and cache the GitHub API client.

        The annotation is a string on purpose: ``Github`` is undefined when
        the guarded imports fail, and an evaluated annotation would raise
        ``NameError`` while the class is being created.

        Raises:
            ImportError: if the optional libraries failed to import.
        """
        if not REQUIRED_LIBS_AVAILABLE:
            raise ImportError("Required libraries could not be found.")

        if self._github_client is None:
            token = os.environ.get("GITHUB_TOKEN")
            if not token:
                logger.warning("Warning: GITHUB_TOKEN environment variable is not set.")
                self._github_client = Github()  # Limited access
            else:
                self._github_client = Github(token)
        return self._github_client

    def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
        """
        Searches GitHub for pull requests matching the query and returns the top 5 results.
        The query should be a valid GitHub search query.
        """
        logger.info(f"Executing GitHub search with query: {query}")
        try:
            issues = self.github_client.search_issues(query=query)
            # Take top 5 to keep context small for the agent
            top_issues = issues.get_page(0)[:5]
            return [
                {"title": issue.title, "url": issue.html_url, "number": issue.number}
                for issue in top_issues
            ]
        except Exception as e:
            logger.error(f"Error during GitHub search: {e}", exc_info=True)
            # Return an error message that the agent can understand
            return [{"error": f"An error occurred during search: {e}"}]

    @staticmethod
    def _format_tool_input(tool_input: Any) -> str:
        """Return the text to show for a tool call's input.

        A dict input is reduced to its ``"query"`` entry when present; any
        other input (for example a plain string) is shown via ``str()``
        instead of failing on a missing ``.get``.
        """
        if isinstance(tool_input, dict):
            return str(tool_input.get("query", tool_input))
        return str(tool_input)

    @classmethod
    def _extract_pr_url(cls, agent_output: Any) -> str:
        """Return the last GitHub PR URL found in ``str(agent_output)``, or ""."""
        # The agent's final output can be a string, a list of tool results,
        # or a list of content blocks from the LLM, so search its string form.
        urls = cls._PR_URL_PATTERN.findall(str(agent_output))
        return urls[-1] if urls else ""

    def find_best_reference_pr(
        self, owner: str, repo_name: str, target_language: str, context: str
    ):
        """
        Finds the best reference PR using a LangChain agent.

        This is a generator: it yields human-readable progress messages and
        its return value (``StopIteration.value``) is the selected PR URL.
        ``DEFAULT_FALLBACK_PR_URL`` is returned when the agent produces no
        output, when no PR URL can be parsed from the output, or when any
        exception is raised while the agent runs.
        """
        message = "🤖 Agent is searching for the best reference PR..."
        logger.info(message)
        yield message

        try:
            agent_input = {
                "owner": owner,
                "repo_name": repo_name,
                "target_language": target_language,
                "context": context,
            }

            agent_output = None
            for event in self.agent_executor.stream(agent_input):
                if "actions" in event and event["actions"]:
                    action = event["actions"][0]
                    tool_query = self._format_tool_input(action.tool_input)
                    message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
                    logger.info(message)
                    yield message
                elif "steps" in event and event["steps"]:
                    message = "📊 Agent is analyzing the results from the tool..."
                    logger.info(message)
                    yield message
                elif "output" in event and event["output"]:
                    agent_output = event["output"]

            if not agent_output:
                message = "⚠️ Agent failed to find a suitable PR. Using default PR."
                logger.warning(message)
                yield message
                return DEFAULT_FALLBACK_PR_URL

            final_url = self._extract_pr_url(agent_output)

            if not final_url:
                message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
                logger.warning(message)
                yield message
                return DEFAULT_FALLBACK_PR_URL

            message = f"✅ Selected the best PR:\n`{final_url}`"
            logger.info(f"Selected the best PR: {final_url}")
            yield message
            return final_url

        except Exception as e:
            # Boundary handler: whatever goes wrong, the caller still gets a PR.
            message = f"❌ Error during agent execution: {e}\nUsing default PR."
            logger.error(message, exc_info=True)
            yield message
            return DEFAULT_FALLBACK_PR_URL
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
    """
    Stream the reference-PR search for the 'huggingface/transformers' repository.

    This is a generator: every progress message produced by
    ``GitHubPRSearcher.find_best_reference_pr`` is yielded unchanged. The
    generator's return value (available as ``StopIteration.value``) is a dict
    with the keys ``status``, ``result``, ``repository`` and
    ``target_language``.
    """
    # `yield from` forwards the progress messages and evaluates to the value
    # the delegated generator returns, i.e. the chosen PR URL.
    chosen_pr_url = yield from GitHubPRSearcher().find_best_reference_pr(
        "huggingface", "transformers", target_language, context
    )

    # Assemble the summary in the shape the handler expects.
    summary = {"status": "success"}
    summary["result"] = f"Recommended PR URL: {chosen_pr_url}"
    summary["repository"] = "huggingface/transformers"
    summary["target_language"] = target_language
    return summary
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
# Example usage
if __name__ == "__main__":
    # Manual smoke run: drive the streaming generator by hand, the way a real
    # consumer (such as the one in handler.py) would.
    print("--- Running Streaming Search Simulation ---")

    def run_simulation():
        """Print each progress message, then the generator's final result."""
        stream = find_reference_pr_simple_stream(
            target_language="korean", context="docs"
        )
        while True:
            try:
                progress = next(stream)
            except StopIteration as finished:
                # An exhausted generator carries its return value in `.value`.
                print("\n--- FINAL RESULT ---")
                print(finished.value)
                break
            print(progress)

    run_simulation()
|
requirements.txt
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
-
gradio==5.33.0
|
| 2 |
-
requests
|
| 3 |
-
pydantic
|
| 4 |
-
langchain-anthropic
|
| 5 |
-
python-dotenv
|
| 6 |
-
langchain
|
| 7 |
-
PyGithub
|
| 8 |
-
langchain-core
|
| 9 |
-
langchain-community
|
| 10 |
-
boto3
|
| 11 |
PyYAML
|
|
|
|
| 1 |
+
gradio==5.33.0
|
| 2 |
+
requests
|
| 3 |
+
pydantic
|
| 4 |
+
langchain-anthropic
|
| 5 |
+
python-dotenv
|
| 6 |
+
langchain
|
| 7 |
+
PyGithub
|
| 8 |
+
langchain-core
|
| 9 |
+
langchain-community
|
| 10 |
+
boto3
|
| 11 |
PyYAML
|
test/test_final_translate.md
CHANGED
|
@@ -1,127 +1,127 @@
|
|
| 1 |
-
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
| 2 |
-
|
| 3 |
-
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
| 4 |
-
the License. You may obtain a copy of the License at
|
| 5 |
-
|
| 6 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
| 7 |
-
|
| 8 |
-
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
| 9 |
-
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
| 10 |
-
specific language governing permissions and limitations under the License.
|
| 11 |
-
|
| 12 |
-
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
| 13 |
-
rendered properly in your Markdown viewer.
|
| 14 |
-
|
| 15 |
-
-->
|
| 16 |
-
|
| 17 |
-
# 가속기 선택 [[accelerator-selection]]
|
| 18 |
-
|
| 19 |
-
분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
|
| 20 |
-
|
| 21 |
-
이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
|
| 22 |
-
|
| 23 |
-
## 가속기 개수 [[number-of-accelerators]]
|
| 24 |
-
|
| 25 |
-
예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
|
| 26 |
-
|
| 27 |
-
<hfoptions id="select-accelerator">
|
| 28 |
-
<hfoption id="torchrun">
|
| 29 |
-
|
| 30 |
-
사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
|
| 31 |
-
|
| 32 |
-
```bash
|
| 33 |
-
torchrun --nproc_per_node=2 trainer-program.py ...
|
| 34 |
-
```
|
| 35 |
-
|
| 36 |
-
</hfoption>
|
| 37 |
-
<hfoption id="Accelerate">
|
| 38 |
-
|
| 39 |
-
사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
|
| 40 |
-
|
| 41 |
-
```bash
|
| 42 |
-
accelerate launch --num_processes 2 trainer-program.py ...
|
| 43 |
-
```
|
| 44 |
-
|
| 45 |
-
</hfoption>
|
| 46 |
-
<hfoption id="🤗 DeepSpeed">
|
| 47 |
-
|
| 48 |
-
사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
|
| 49 |
-
|
| 50 |
-
```bash
|
| 51 |
-
deepspeed --num_gpus 2 trainer-program.py ...
|
| 52 |
-
```
|
| 53 |
-
|
| 54 |
-
</hfoption>
|
| 55 |
-
</hfoptions>
|
| 56 |
-
|
| 57 |
-
## 가속기 순서 [[order-of-accelerators]]
|
| 58 |
-
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
|
| 59 |
-
|
| 60 |
-
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 61 |
-
|
| 62 |
-
<hfoptions id="accelerator-type">
|
| 63 |
-
<hfoption id="CUDA">
|
| 64 |
-
|
| 65 |
-
```bash
|
| 66 |
-
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
| 67 |
-
```
|
| 68 |
-
|
| 69 |
-
GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 70 |
-
순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
```bash
|
| 74 |
-
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
|
| 75 |
-
```
|
| 76 |
-
|
| 77 |
-
GPU 없이 실행하려면:
|
| 78 |
-
|
| 79 |
-
```bash
|
| 80 |
-
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
| 81 |
-
```
|
| 82 |
-
|
| 83 |
-
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
|
| 84 |
-
|
| 85 |
-
- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
|
| 86 |
-
|
| 87 |
-
```bash
|
| 88 |
-
export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
| 89 |
-
```
|
| 90 |
-
|
| 91 |
-
- 컴퓨팅 성능 순서(가장 빠른 것부터):
|
| 92 |
-
|
| 93 |
-
```bash
|
| 94 |
-
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
| 95 |
-
```
|
| 96 |
-
|
| 97 |
-
</hfoption>
|
| 98 |
-
<hfoption id="Intel XPU">
|
| 99 |
-
|
| 100 |
-
```bash
|
| 101 |
-
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
| 102 |
-
```
|
| 103 |
-
|
| 104 |
-
XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 105 |
-
순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 106 |
-
|
| 107 |
-
```bash
|
| 108 |
-
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
| 109 |
-
```
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
|
| 113 |
-
|
| 114 |
-
```bash
|
| 115 |
-
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
| 116 |
-
```
|
| 117 |
-
|
| 118 |
-
Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 119 |
-
|
| 120 |
-
</hfoption>
|
| 121 |
-
</hfoptions>
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
> [!WARNING]
|
| 126 |
-
> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
|
| 127 |
-
|
|
|
|
| 1 |
+
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
| 2 |
+
|
| 3 |
+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
| 4 |
+
the License. You may obtain a copy of the License at
|
| 5 |
+
|
| 6 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 7 |
+
|
| 8 |
+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
| 9 |
+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
| 10 |
+
specific language governing permissions and limitations under the License.
|
| 11 |
+
|
| 12 |
+
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
| 13 |
+
rendered properly in your Markdown viewer.
|
| 14 |
+
|
| 15 |
+
-->
|
| 16 |
+
|
| 17 |
+
# 가속기 선택 [[accelerator-selection]]
|
| 18 |
+
|
| 19 |
+
분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
|
| 20 |
+
|
| 21 |
+
이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
|
| 22 |
+
|
| 23 |
+
## 가속기 개수 [[number-of-accelerators]]
|
| 24 |
+
|
| 25 |
+
예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
|
| 26 |
+
|
| 27 |
+
<hfoptions id="select-accelerator">
|
| 28 |
+
<hfoption id="torchrun">
|
| 29 |
+
|
| 30 |
+
사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
torchrun --nproc_per_node=2 trainer-program.py ...
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
</hfoption>
|
| 37 |
+
<hfoption id="Accelerate">
|
| 38 |
+
|
| 39 |
+
사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
accelerate launch --num_processes 2 trainer-program.py ...
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
</hfoption>
|
| 46 |
+
<hfoption id="🤗 DeepSpeed">
|
| 47 |
+
|
| 48 |
+
사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
deepspeed --num_gpus 2 trainer-program.py ...
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
</hfoption>
|
| 55 |
+
</hfoptions>
|
| 56 |
+
|
| 57 |
+
## 가속기 순서 [[order-of-accelerators]]
|
| 58 |
+
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
|
| 59 |
+
|
| 60 |
+
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 61 |
+
|
| 62 |
+
<hfoptions id="accelerator-type">
|
| 63 |
+
<hfoption id="CUDA">
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 70 |
+
순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
GPU 없이 실행하려면:
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
|
| 84 |
+
|
| 85 |
+
- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
- 컴퓨팅 성능 순서(가장 빠른 것부터):
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
</hfoption>
|
| 98 |
+
<hfoption id="Intel XPU">
|
| 99 |
+
|
| 100 |
+
```bash
|
| 101 |
+
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 105 |
+
순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 119 |
+
|
| 120 |
+
</hfoption>
|
| 121 |
+
</hfoptions>
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
> [!WARNING]
|
| 126 |
+
> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
|
| 127 |
+
|
test/test_prompt.py
CHANGED
|
@@ -1,71 +1,71 @@
|
|
| 1 |
-
output = """
|
| 2 |
-
What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
|
| 3 |
-
```md
|
| 4 |
-
# Accelerator selection
|
| 5 |
-
|
| 6 |
-
During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
|
| 7 |
-
|
| 8 |
-
This guide will show you how to select the number of accelerators to use and the order to use them in.
|
| 9 |
-
|
| 10 |
-
## Number of accelerators
|
| 11 |
-
|
| 12 |
-
For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
|
| 13 |
-
|
| 14 |
-
<hfoptions id="select-accelerator">
|
| 15 |
-
<hfoption id="torchrun">
|
| 16 |
-
|
| 17 |
-
Use the `--nproc_per_node` to select how many accelerators to use.
|
| 18 |
-
|
| 19 |
-
</hfoption>
|
| 20 |
-
<hfoption id="Accelerate">
|
| 21 |
-
|
| 22 |
-
Use `--num_processes` to select how many accelerators to use.
|
| 23 |
-
|
| 24 |
-
</hfoption>
|
| 25 |
-
<hfoption id="DeepSpeed">
|
| 26 |
-
|
| 27 |
-
Use `--num_gpus` to select how many GPUs to use.
|
| 28 |
-
|
| 29 |
-
</hfoption>
|
| 30 |
-
</hfoptions>
|
| 31 |
-
|
| 32 |
-
## Order of accelerators
|
| 33 |
-
To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
|
| 34 |
-
|
| 35 |
-
For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
|
| 36 |
-
|
| 37 |
-
<hfoptions id="accelerator-type">
|
| 38 |
-
<hfoption id="CUDA">
|
| 39 |
-
|
| 40 |
-
Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
|
| 41 |
-
To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
|
| 42 |
-
|
| 43 |
-
To run without any GPUs:
|
| 44 |
-
|
| 45 |
-
You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
|
| 46 |
-
|
| 47 |
-
- Order by PCIe bus ID (matches `nvidia-smi`):
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
- Order by compute capability (fastest first):
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
</hfoption>
|
| 56 |
-
<hfoption id="Intel XPU">
|
| 57 |
-
|
| 58 |
-
Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
|
| 59 |
-
To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
|
| 60 |
-
|
| 61 |
-
You can also control the order of Intel XPUs with:
|
| 62 |
-
|
| 63 |
-
For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
|
| 64 |
-
|
| 65 |
-
</hfoption>
|
| 66 |
-
</hfoptions>
|
| 67 |
-
|
| 68 |
-
> [!WARNING]
|
| 69 |
-
> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
|
| 70 |
-
```
|
| 71 |
-
"""
|
|
|
|
| 1 |
+
output = """
|
| 2 |
+
What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
|
| 3 |
+
```md
|
| 4 |
+
# Accelerator selection
|
| 5 |
+
|
| 6 |
+
During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
|
| 7 |
+
|
| 8 |
+
This guide will show you how to select the number of accelerators to use and the order to use them in.
|
| 9 |
+
|
| 10 |
+
## Number of accelerators
|
| 11 |
+
|
| 12 |
+
For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
|
| 13 |
+
|
| 14 |
+
<hfoptions id="select-accelerator">
|
| 15 |
+
<hfoption id="torchrun">
|
| 16 |
+
|
| 17 |
+
Use the `--nproc_per_node` to select how many accelerators to use.
|
| 18 |
+
|
| 19 |
+
</hfoption>
|
| 20 |
+
<hfoption id="Accelerate">
|
| 21 |
+
|
| 22 |
+
Use `--num_processes` to select how many accelerators to use.
|
| 23 |
+
|
| 24 |
+
</hfoption>
|
| 25 |
+
<hfoption id="DeepSpeed">
|
| 26 |
+
|
| 27 |
+
Use `--num_gpus` to select how many GPUs to use.
|
| 28 |
+
|
| 29 |
+
</hfoption>
|
| 30 |
+
</hfoptions>
|
| 31 |
+
|
| 32 |
+
## Order of accelerators
|
| 33 |
+
To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
|
| 34 |
+
|
| 35 |
+
For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
|
| 36 |
+
|
| 37 |
+
<hfoptions id="accelerator-type">
|
| 38 |
+
<hfoption id="CUDA">
|
| 39 |
+
|
| 40 |
+
Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
|
| 41 |
+
To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
|
| 42 |
+
|
| 43 |
+
To run without any GPUs:
|
| 44 |
+
|
| 45 |
+
You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
|
| 46 |
+
|
| 47 |
+
- Order by PCIe bus ID (matches `nvidia-smi`):
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
- Order by compute capability (fastest first):
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
</hfoption>
|
| 56 |
+
<hfoption id="Intel XPU">
|
| 57 |
+
|
| 58 |
+
Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
|
| 59 |
+
To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
|
| 60 |
+
|
| 61 |
+
You can also control the order of Intel XPUs with:
|
| 62 |
+
|
| 63 |
+
For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
|
| 64 |
+
|
| 65 |
+
</hfoption>
|
| 66 |
+
</hfoptions>
|
| 67 |
+
|
| 68 |
+
> [!WARNING]
|
| 69 |
+
> Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
|
| 70 |
+
```
|
| 71 |
+
"""
|
test/test_translate.py
CHANGED
|
@@ -1,68 +1,68 @@
|
|
| 1 |
-
translated_content = """
|
| 2 |
-
# 가속기 선택
|
| 3 |
-
|
| 4 |
-
분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
|
| 5 |
-
|
| 6 |
-
이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
|
| 7 |
-
|
| 8 |
-
## 가속기 개수
|
| 9 |
-
|
| 10 |
-
예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
|
| 11 |
-
|
| 12 |
-
<hfoptions id="select-accelerator">
|
| 13 |
-
<hfoption id="torchrun">
|
| 14 |
-
|
| 15 |
-
사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
|
| 16 |
-
|
| 17 |
-
</hfoption>
|
| 18 |
-
<hfoption id="Accelerate">
|
| 19 |
-
|
| 20 |
-
사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
|
| 21 |
-
|
| 22 |
-
</hfoption>
|
| 23 |
-
<hfoption id="🤗 DeepSpeed">
|
| 24 |
-
|
| 25 |
-
사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
|
| 26 |
-
|
| 27 |
-
</hfoption>
|
| 28 |
-
</hfoptions>
|
| 29 |
-
|
| 30 |
-
## 가속기 순서
|
| 31 |
-
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
|
| 32 |
-
|
| 33 |
-
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 34 |
-
|
| 35 |
-
<hfoptions id="accelerator-type">
|
| 36 |
-
<hfoption id="CUDA">
|
| 37 |
-
|
| 38 |
-
GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 39 |
-
순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 40 |
-
|
| 41 |
-
GPU 없이 실행하려면:
|
| 42 |
-
|
| 43 |
-
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
|
| 44 |
-
|
| 45 |
-
- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
- 컴퓨팅 성능 순서(가장 빠른 것부터):
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
</hfoption>
|
| 54 |
-
<hfoption id="Intel XPU">
|
| 55 |
-
|
| 56 |
-
XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 57 |
-
순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 58 |
-
|
| 59 |
-
다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
|
| 60 |
-
|
| 61 |
-
Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 62 |
-
|
| 63 |
-
</hfoption>
|
| 64 |
-
</hfoptions>
|
| 65 |
-
|
| 66 |
-
> [!WARNING]
|
| 67 |
-
> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
|
| 68 |
-
"""
|
|
|
|
| 1 |
+
translated_content = """
|
| 2 |
+
# 가속기 선택
|
| 3 |
+
|
| 4 |
+
분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
|
| 5 |
+
|
| 6 |
+
이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
|
| 7 |
+
|
| 8 |
+
## 가속기 개수
|
| 9 |
+
|
| 10 |
+
예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
|
| 11 |
+
|
| 12 |
+
<hfoptions id="select-accelerator">
|
| 13 |
+
<hfoption id="torchrun">
|
| 14 |
+
|
| 15 |
+
사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
|
| 16 |
+
|
| 17 |
+
</hfoption>
|
| 18 |
+
<hfoption id="Accelerate">
|
| 19 |
+
|
| 20 |
+
사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
|
| 21 |
+
|
| 22 |
+
</hfoption>
|
| 23 |
+
<hfoption id="🤗 DeepSpeed">
|
| 24 |
+
|
| 25 |
+
사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
|
| 26 |
+
|
| 27 |
+
</hfoption>
|
| 28 |
+
</hfoptions>
|
| 29 |
+
|
| 30 |
+
## 가속기 순서
|
| 31 |
+
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
|
| 32 |
+
|
| 33 |
+
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 34 |
+
|
| 35 |
+
<hfoptions id="accelerator-type">
|
| 36 |
+
<hfoption id="CUDA">
|
| 37 |
+
|
| 38 |
+
GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 39 |
+
순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 40 |
+
|
| 41 |
+
GPU 없이 실행하려면:
|
| 42 |
+
|
| 43 |
+
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
|
| 44 |
+
|
| 45 |
+
- PCIe 버스 ID 순서(`nvidia-smi`와 일치):
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
- 컴퓨팅 성능 순서(가장 빠른 것부터):
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
</hfoption>
|
| 54 |
+
<hfoption id="Intel XPU">
|
| 55 |
+
|
| 56 |
+
XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 57 |
+
순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 58 |
+
|
| 59 |
+
다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
|
| 60 |
+
|
| 61 |
+
Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 62 |
+
|
| 63 |
+
</hfoption>
|
| 64 |
+
</hfoptions>
|
| 65 |
+
|
| 66 |
+
> [!WARNING]
|
| 67 |
+
> 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
|
| 68 |
+
"""
|
translation_result/docs/source/en/accelerator_selection.md
CHANGED
|
@@ -1,127 +1,127 @@
|
|
| 1 |
-
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
| 2 |
-
|
| 3 |
-
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
| 4 |
-
the License. You may obtain a copy of the License at
|
| 5 |
-
|
| 6 |
-
http://www.apache.org/licenses/LICENSE-2.0
|
| 7 |
-
|
| 8 |
-
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
| 9 |
-
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
| 10 |
-
specific language governing permissions and limitations under the License.
|
| 11 |
-
|
| 12 |
-
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
| 13 |
-
rendered properly in your Markdown viewer.
|
| 14 |
-
|
| 15 |
-
-->
|
| 16 |
-
|
| 17 |
-
# 가속기 선택 [[accelerator-selection]]
|
| 18 |
-
|
| 19 |
-
분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)는 필요하지 않습니다.
|
| 20 |
-
|
| 21 |
-
이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
|
| 22 |
-
|
| 23 |
-
## 가속기 수 [[number-of-accelerators]]
|
| 24 |
-
|
| 25 |
-
예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.
|
| 26 |
-
|
| 27 |
-
<hfoptions id="select-accelerator">
|
| 28 |
-
<hfoption id="torchrun">
|
| 29 |
-
|
| 30 |
-
`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
|
| 31 |
-
|
| 32 |
-
```bash
|
| 33 |
-
torchrun --nproc_per_node=2 trainer-program.py ...
|
| 34 |
-
```
|
| 35 |
-
|
| 36 |
-
</hfoption>
|
| 37 |
-
<hfoption id="Accelerate">
|
| 38 |
-
|
| 39 |
-
`--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
|
| 40 |
-
|
| 41 |
-
```bash
|
| 42 |
-
accelerate launch --num_processes 2 trainer-program.py ...
|
| 43 |
-
```
|
| 44 |
-
|
| 45 |
-
</hfoption>
|
| 46 |
-
<hfoption id="DeepSpeed">
|
| 47 |
-
|
| 48 |
-
`--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
|
| 49 |
-
|
| 50 |
-
```bash
|
| 51 |
-
deepspeed --num_gpus 2 trainer-program.py ...
|
| 52 |
-
```
|
| 53 |
-
|
| 54 |
-
</hfoption>
|
| 55 |
-
</hfoptions>
|
| 56 |
-
|
| 57 |
-
## 가속기 순서 [[order-of-accelerators]]
|
| 58 |
-
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
|
| 59 |
-
|
| 60 |
-
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 61 |
-
|
| 62 |
-
<hfoptions id="accelerator-type">
|
| 63 |
-
<hfoption id="CUDA">
|
| 64 |
-
|
| 65 |
-
```bash
|
| 66 |
-
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
| 67 |
-
```
|
| 68 |
-
|
| 69 |
-
GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 70 |
-
순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
```bash
|
| 74 |
-
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
|
| 75 |
-
```
|
| 76 |
-
|
| 77 |
-
GPU 없이 실행하려면:
|
| 78 |
-
|
| 79 |
-
```bash
|
| 80 |
-
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
| 81 |
-
```
|
| 82 |
-
|
| 83 |
-
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
|
| 84 |
-
|
| 85 |
-
- PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
|
| 86 |
-
|
| 87 |
-
```bash
|
| 88 |
-
$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
| 89 |
-
```
|
| 90 |
-
|
| 91 |
-
- 컴퓨팅 성능 순서 (가장 빠른 것부터):
|
| 92 |
-
|
| 93 |
-
```bash
|
| 94 |
-
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
| 95 |
-
```
|
| 96 |
-
|
| 97 |
-
</hfoption>
|
| 98 |
-
<hfoption id="Intel XPU">
|
| 99 |
-
|
| 100 |
-
```bash
|
| 101 |
-
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
| 102 |
-
```
|
| 103 |
-
|
| 104 |
-
XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 105 |
-
순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 106 |
-
|
| 107 |
-
```bash
|
| 108 |
-
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
| 109 |
-
```
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
|
| 113 |
-
|
| 114 |
-
```bash
|
| 115 |
-
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
| 116 |
-
```
|
| 117 |
-
|
| 118 |
-
Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 119 |
-
|
| 120 |
-
</hfoption>
|
| 121 |
-
</hfoptions>
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
> [!WARNING]
|
| 126 |
-
> 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
|
| 127 |
```
|
|
|
|
| 1 |
+
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
| 2 |
+
|
| 3 |
+
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
| 4 |
+
the License. You may obtain a copy of the License at
|
| 5 |
+
|
| 6 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 7 |
+
|
| 8 |
+
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
| 9 |
+
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
| 10 |
+
specific language governing permissions and limitations under the License.
|
| 11 |
+
|
| 12 |
+
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
| 13 |
+
rendered properly in your Markdown viewer.
|
| 14 |
+
|
| 15 |
+
-->
|
| 16 |
+
|
| 17 |
+
# 가속기 선택 [[accelerator-selection]]
|
| 18 |
+
|
| 19 |
+
분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed 통합](./main_classes/deepspeed)은 필요하지 않습니다.
|
| 20 |
+
|
| 21 |
+
이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
|
| 22 |
+
|
| 23 |
+
## 가속기 수 [[number-of-accelerators]]
|
| 24 |
+
|
| 25 |
+
예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.
|
| 26 |
+
|
| 27 |
+
<hfoptions id="select-accelerator">
|
| 28 |
+
<hfoption id="torchrun">
|
| 29 |
+
|
| 30 |
+
`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
|
| 31 |
+
|
| 32 |
+
```bash
|
| 33 |
+
torchrun --nproc_per_node=2 trainer-program.py ...
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
</hfoption>
|
| 37 |
+
<hfoption id="Accelerate">
|
| 38 |
+
|
| 39 |
+
`--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
|
| 40 |
+
|
| 41 |
+
```bash
|
| 42 |
+
accelerate launch --num_processes 2 trainer-program.py ...
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
</hfoption>
|
| 46 |
+
<hfoption id="DeepSpeed">
|
| 47 |
+
|
| 48 |
+
`--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
deepspeed --num_gpus 2 trainer-program.py ...
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
</hfoption>
|
| 55 |
+
</hfoptions>
|
| 56 |
+
|
| 57 |
+
## 가속기 순서 [[order-of-accelerators]]
|
| 58 |
+
사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
|
| 59 |
+
|
| 60 |
+
예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
|
| 61 |
+
|
| 62 |
+
<hfoptions id="accelerator-type">
|
| 63 |
+
<hfoption id="CUDA">
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
|
| 70 |
+
순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
```bash
|
| 74 |
+
CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
GPU 없이 실행하려면:
|
| 78 |
+
|
| 79 |
+
```bash
|
| 80 |
+
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
|
| 84 |
+
|
| 85 |
+
- PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
- 컴퓨팅 성능 순서 (가장 빠른 것부터):
|
| 92 |
+
|
| 93 |
+
```bash
|
| 94 |
+
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
</hfoption>
|
| 98 |
+
<hfoption id="Intel XPU">
|
| 99 |
+
|
| 100 |
+
```bash
|
| 101 |
+
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
|
| 105 |
+
순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
|
| 119 |
+
|
| 120 |
+
</hfoption>
|
| 121 |
+
</hfoptions>
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
> [!WARNING]
|
| 126 |
+
> 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
|
| 127 |
```
|
translator/content.py
CHANGED
|
@@ -1,214 +1,214 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import re
|
| 3 |
-
import string
|
| 4 |
-
|
| 5 |
-
import requests
|
| 6 |
-
from langchain.callbacks import get_openai_callback
|
| 7 |
-
from langchain_anthropic import ChatAnthropic
|
| 8 |
-
import boto3
|
| 9 |
-
import json
|
| 10 |
-
|
| 11 |
-
from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
|
| 12 |
-
from translator.project_config import get_project_config
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def get_content(filepath: str, project: str = "transformers") -> str:
|
| 16 |
-
if filepath == "":
|
| 17 |
-
raise ValueError("No files selected for translation.")
|
| 18 |
-
|
| 19 |
-
config = get_project_config(project)
|
| 20 |
-
# Extract repo path from repo_url (e.g., "huggingface/transformers")
|
| 21 |
-
repo_path = config.repo_url.replace("https://github.com/", "")
|
| 22 |
-
|
| 23 |
-
url = f"https://raw.githubusercontent.com/{repo_path}/main/{filepath}"
|
| 24 |
-
response = requests.get(url)
|
| 25 |
-
if response.status_code == 200:
|
| 26 |
-
content = response.text
|
| 27 |
-
return content
|
| 28 |
-
else:
|
| 29 |
-
raise ValueError("Failed to retrieve content from the URL.", url)
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def preprocess_content(content: str) -> str:
|
| 33 |
-
# Extract text to translate from document
|
| 34 |
-
|
| 35 |
-
## ignore top license comment
|
| 36 |
-
to_translate = content[content.find("#") :]
|
| 37 |
-
## remove code blocks from text
|
| 38 |
-
# to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
| 39 |
-
## remove markdown tables from text
|
| 40 |
-
# to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
| 41 |
-
## remove empty lines from text
|
| 42 |
-
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
| 43 |
-
return to_translate
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
|
| 47 |
-
base_prompt = string.Template(
|
| 48 |
-
"What do these sentences about Hugging Face Transformers "
|
| 49 |
-
"(a machine learning library) mean in $language? "
|
| 50 |
-
"Please do not translate the word after a 🤗 emoji "
|
| 51 |
-
"as it is a product name. Output the complete markdown file**, with prose translated and all other content intact"
|
| 52 |
-
"No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
|
| 53 |
-
).safe_substitute(language=language)
|
| 54 |
-
|
| 55 |
-
base_prompt += "\n\n```md"
|
| 56 |
-
|
| 57 |
-
full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
|
| 58 |
-
|
| 59 |
-
if additional_instruction.strip():
|
| 60 |
-
full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"
|
| 61 |
-
|
| 62 |
-
return full_prompt
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
def split_markdown_sections(markdown: str) -> list:
|
| 66 |
-
# Find all titles using regular expressions
|
| 67 |
-
return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
|
| 68 |
-
# format is like [level, title, content, level, title, content, ...]
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
def get_anchors(divided: list) -> list:
|
| 72 |
-
anchors = []
|
| 73 |
-
# from https://github.com/huggingface/doc-builder/blob/01b262bae90d66e1150cdbf58c83c02733ed4366/src/doc_builder/build_doc.py#L300-L302
|
| 74 |
-
for title in divided[1::3]:
|
| 75 |
-
anchor = re.sub(r"[^a-z0-9\s]+", "", title.lower())
|
| 76 |
-
anchor = re.sub(r"\s{2,}", " ", anchor.strip()).replace(" ", "-")
|
| 77 |
-
anchors.append(f"[[{anchor}]]")
|
| 78 |
-
return anchors
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
def make_scaffold(content: str, to_translate: str) -> string.Template:
|
| 82 |
-
scaffold = content
|
| 83 |
-
for i, text in enumerate(to_translate.split("\n\n")):
|
| 84 |
-
scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
|
| 85 |
-
print("inner scaffold:")
|
| 86 |
-
print(scaffold)
|
| 87 |
-
return string.Template(scaffold)
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
def is_in_code_block(text: str, position: int) -> bool:
|
| 91 |
-
"""Check if a position in text is inside a code block"""
|
| 92 |
-
text_before = text[:position]
|
| 93 |
-
code_block_starts = text_before.count("```")
|
| 94 |
-
return code_block_starts % 2 == 1
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
|
| 98 |
-
scaffold = make_scaffold(content, to_translate)
|
| 99 |
-
print("scaffold:")
|
| 100 |
-
print(scaffold.template)
|
| 101 |
-
|
| 102 |
-
# Get original text sections to maintain structure
|
| 103 |
-
original_sections = to_translate.split("\n\n")
|
| 104 |
-
|
| 105 |
-
# Split markdown sections to get headers and anchors
|
| 106 |
-
divided = split_markdown_sections(to_translate)
|
| 107 |
-
print("divided:")
|
| 108 |
-
print(divided)
|
| 109 |
-
anchors = get_anchors(divided)
|
| 110 |
-
|
| 111 |
-
# Split translated content by markdown sections
|
| 112 |
-
translated_divided = split_markdown_sections(translated)
|
| 113 |
-
print("translated divided:")
|
| 114 |
-
print(translated_divided)
|
| 115 |
-
|
| 116 |
-
# Ensure we have the same number of headers as the original
|
| 117 |
-
if len(translated_divided[1::3]) != len(anchors):
|
| 118 |
-
print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
|
| 119 |
-
# Adjust anchors list to match translated headers
|
| 120 |
-
if len(translated_divided[1::3]) < len(anchors):
|
| 121 |
-
anchors = anchors[:len(translated_divided[1::3])]
|
| 122 |
-
else:
|
| 123 |
-
# Add empty anchors for extra headers
|
| 124 |
-
anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))
|
| 125 |
-
|
| 126 |
-
# Add anchors to translated headers only if they're not in code blocks
|
| 127 |
-
for i, korean_title in enumerate(translated_divided[1::3]):
|
| 128 |
-
if i < len(anchors):
|
| 129 |
-
# Find the position of this header in the original translated text
|
| 130 |
-
header_pos = translated.find(korean_title.strip())
|
| 131 |
-
if header_pos != -1 and not is_in_code_block(translated, header_pos):
|
| 132 |
-
translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
|
| 133 |
-
else:
|
| 134 |
-
translated_divided[1 + i * 3] = korean_title
|
| 135 |
-
|
| 136 |
-
# Reconstruct translated content with proper structure
|
| 137 |
-
reconstructed_translated = "".join([
|
| 138 |
-
"".join(translated_divided[i * 3 : i * 3 + 3])
|
| 139 |
-
for i in range(len(translated_divided) // 3)
|
| 140 |
-
])
|
| 141 |
-
|
| 142 |
-
# Split by double newlines to match original structure
|
| 143 |
-
translated_sections = reconstructed_translated.split("\n\n")
|
| 144 |
-
|
| 145 |
-
print("scaffold template count:")
|
| 146 |
-
print(scaffold.template.count("$hf_i18n_placeholder"))
|
| 147 |
-
print("original sections length:")
|
| 148 |
-
print(len(original_sections))
|
| 149 |
-
print("translated sections length:")
|
| 150 |
-
print(len(translated_sections))
|
| 151 |
-
|
| 152 |
-
# Ensure section counts match
|
| 153 |
-
placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
|
| 154 |
-
|
| 155 |
-
if len(translated_sections) < placeholder_count:
|
| 156 |
-
# Add empty sections if translated has fewer sections
|
| 157 |
-
translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
|
| 158 |
-
elif len(translated_sections) > placeholder_count:
|
| 159 |
-
# Truncate if translated has more sections
|
| 160 |
-
translated_sections = translated_sections[:placeholder_count]
|
| 161 |
-
|
| 162 |
-
# Final check
|
| 163 |
-
if len(translated_sections) != placeholder_count:
|
| 164 |
-
return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"
|
| 165 |
-
|
| 166 |
-
translated_doc = scaffold.safe_substitute(
|
| 167 |
-
{f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
|
| 168 |
-
)
|
| 169 |
-
return translated_doc
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
def llm_translate(to_translate: str) -> tuple[str, str]:
|
| 173 |
-
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
|
| 174 |
-
aws_bearer_token_bedrock = os.environ.get("AWS_BEARER_TOKEN_BEDROCK")
|
| 175 |
-
|
| 176 |
-
if anthropic_api_key:
|
| 177 |
-
# Use Anthropic API Key
|
| 178 |
-
model = ChatAnthropic(
|
| 179 |
-
model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
|
| 180 |
-
)
|
| 181 |
-
ai_message = model.invoke(to_translate)
|
| 182 |
-
cb = "Anthropic API Key used"
|
| 183 |
-
return str(cb), ai_message.content
|
| 184 |
-
|
| 185 |
-
elif aws_bearer_token_bedrock:
|
| 186 |
-
# Use AWS Bedrock with bearer token (assuming standard AWS credential chain is configured)
|
| 187 |
-
# Note: boto3 does not directly use a 'bearer_token' named environment variable for SigV4 authentication.
|
| 188 |
-
# It relies on AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, or IAM roles.
|
| 189 |
-
# If AWS_BEARER_TOKEN_BEDROCK is meant to be one of these, it should be renamed accordingly.
|
| 190 |
-
# For now, we proceed assuming standard AWS credential chain is configured to pick up credentials.
|
| 191 |
-
client = boto3.client("bedrock-runtime", region_name="eu-north-1")
|
| 192 |
-
|
| 193 |
-
body = {
|
| 194 |
-
"messages": [
|
| 195 |
-
{"role": "user", "content": to_translate}
|
| 196 |
-
],
|
| 197 |
-
"max_tokens": 128000,
|
| 198 |
-
"anthropic_version": "bedrock-2023-05-31"
|
| 199 |
-
}
|
| 200 |
-
|
| 201 |
-
response = client.invoke_model(
|
| 202 |
-
modelId="arn:aws:bedrock:eu-north-1:235729104418:inference-profile/eu.anthropic.claude-3-7-sonnet-20250219-v1:0",
|
| 203 |
-
contentType="application/json",
|
| 204 |
-
accept="application/json",
|
| 205 |
-
body=json.dumps(body),
|
| 206 |
-
)
|
| 207 |
-
result = json.loads(response["body"].read())
|
| 208 |
-
cb = result["usage"]
|
| 209 |
-
ai_message = result["content"][0]["text"]
|
| 210 |
-
|
| 211 |
-
return str(cb), ai_message
|
| 212 |
-
|
| 213 |
-
else:
|
| 214 |
-
raise ValueError("No API key found for translation. Please set ANTHROPIC_API_KEY or AWS_BEARER_TOKEN_BEDROCK environment variable.")
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import string
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
from langchain.callbacks import get_openai_callback
|
| 7 |
+
from langchain_anthropic import ChatAnthropic
|
| 8 |
+
import boto3
|
| 9 |
+
import json
|
| 10 |
+
|
| 11 |
+
from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
|
| 12 |
+
from translator.project_config import get_project_config
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def get_content(filepath: str, project: str = "transformers") -> str:
    """Download the raw text of a documentation file from the project's GitHub repo.

    Args:
        filepath: Repository-relative path of the file; it is fetched from the
            ``main`` branch via raw.githubusercontent.com.
        project: Key handed to ``get_project_config`` to look up the repo URL.

    Returns:
        The response body as text.

    Raises:
        ValueError: If ``filepath`` is empty, or the server answers with any
            status other than 200 (the URL is attached as a second argument).
        requests.RequestException: On network failure or timeout.
    """
    if filepath == "":
        raise ValueError("No files selected for translation.")

    config = get_project_config(project)
    # Extract repo path from repo_url (e.g., "huggingface/transformers")
    repo_path = config.repo_url.replace("https://github.com/", "")

    url = f"https://raw.githubusercontent.com/{repo_path}/main/{filepath}"
    # requests has no default timeout; without one a stalled connection
    # blocks the caller indefinitely.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise ValueError("Failed to retrieve content from the URL.", url)
    return response.text
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def preprocess_content(content: str) -> str:
    """Extract the text to translate from a raw document.

    Everything before the first ``#`` character is dropped (this skips the
    license comment at the top of the file) and every run of two or more
    newlines is collapsed to exactly one blank line. Code blocks and markdown
    tables are left in place.

    Args:
        content: Full text of the source document.

    Returns:
        The trimmed text. If ``content`` contains no ``#`` at all it is
        returned with only the blank-line collapsing applied.
    """
    # ``str.find`` returns -1 when there is no "#"; slicing from -1 would keep
    # only the last character of the document, so fall back to the whole text.
    start = content.find("#")
    to_translate = content[start:] if start != -1 else content
    # Collapse runs of empty lines so that "\n\n" is a reliable separator.
    return re.sub(r"\n\n+", "\n\n", to_translate)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
    """Assemble the translation prompt sent to the LLM.

    The prompt is: the instruction text, the document wrapped in a
    ```` ```md ```` fence, the glossary prompt, and (optionally) extra
    instructions from the caller.

    Args:
        language: Target language name substituted into the instruction text.
        to_translate: Document text to translate; surrounding whitespace is stripped.
        additional_instruction: Optional free-form instruction appended at the
            end when it is not blank.

    Returns:
        The complete prompt string.
    """
    # The instruction previously read "...markdown file**, ... intactNo
    # explanations..." (stray bold marker and a missing sentence break).
    base_prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        "as it is a product name. Output the complete markdown file, "
        "with prose translated and all other content intact. "
        "No explanations or extras—only the translated markdown. "
        "Also translate all comments within code blocks as well."
    ).safe_substitute(language=language)

    base_prompt += "\n\n```md"

    full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])

    extra = additional_instruction.strip()
    if extra:
        full_prompt += f"\n\n🗒️ Additional instructions: {extra}"

    return full_prompt
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def split_markdown_sections(markdown: str) -> list:
    """Split markdown text on its heading lines.

    Args:
        markdown: Text to split.

    Returns:
        A flat list shaped like
        ``[level, title, content, level, title, content, ...]`` where
        ``level`` is the run of ``#`` plus the whitespace after it, ``title``
        is the rest of the heading line and ``content`` is everything up to
        the next heading. Text before the first heading is discarded; the
        list is empty when there is no heading.
    """
    # re.split keeps both capture groups, producing 1 + 3k items; the leading
    # item (text before the first heading) is dropped.
    return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def get_anchors(divided: list) -> list:
    """Build ``[[anchor]]`` markers from the titles of a split document.

    Args:
        divided: Flat ``[level, title, content, ...]`` list; every third item
            starting at index 1 is read as a title.

    Returns:
        One ``"[[slug]]"`` string per title. The slug is the lower-cased title
        with everything except ``a-z``, ``0-9`` and whitespace removed,
        whitespace runs collapsed, and spaces replaced by ``-``.
    """
    anchors = []
    # from https://github.com/huggingface/doc-builder/blob/01b262bae90d66e1150cdbf58c83c02733ed4366/src/doc_builder/build_doc.py#L300-L302
    for title in divided[1::3]:
        slug = re.sub(r"[^a-z0-9\s]+", "", title.lower())
        slug = re.sub(r"\s{2,}", " ", slug.strip()).replace(" ", "-")
        anchors.append(f"[[{slug}]]")
    return anchors
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def make_scaffold(content: str, to_translate: str) -> string.Template:
    """Replace each translatable paragraph of ``content`` with a placeholder.

    ``to_translate`` is split on blank lines; the first occurrence of segment
    ``i`` in the document is replaced by ``$hf_i18n_placeholder{i}``.

    Args:
        content: The full original document.
        to_translate: The part of the document that is sent for translation.

    Returns:
        A ``string.Template`` of the document with placeholders in place of
        the translatable segments.
    """
    scaffold = content
    for i, text in enumerate(to_translate.split("\n\n")):
        # An empty segment (e.g. from a trailing blank line) would make
        # ``str.replace("", ...)`` insert the placeholder at the very start of
        # the document, so it is skipped.
        if not text:
            continue
        scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
    # NOTE(review): indentation was lost in the reviewed copy; the debug dump
    # is assumed to run once after the loop -- confirm against the repository.
    print("inner scaffold:")
    print(scaffold)
    return string.Template(scaffold)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def is_in_code_block(text: str, position: int) -> bool:
    """Check if a position in text is inside a code block.

    Args:
        text: The text to inspect.
        position: Character offset into ``text``.

    Returns:
        ``True`` when an odd number of triple-backtick fences occurs before
        ``position``, i.e. a fence has been opened but not yet closed.
    """
    # Count fences in text[:position] without building the prefix copy.
    fences_before = text.count("```", 0, position)
    return fences_before % 2 == 1
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    """Insert the translated text back into the structure of the original document.

    Steps:
      1. Build a placeholder scaffold from ``content`` / ``to_translate``.
      2. Derive ``[[anchor]]`` markers from the original headings and append
         them to the translated headings that are not inside a code block.
      3. Split the translated text on blank lines and substitute segment ``i``
         for placeholder ``i``.

    When the translated text has fewer segments than there are placeholders
    the remainder is filled with empty strings; surplus segments are dropped.

    Args:
        content: The full original document.
        to_translate: The part of ``content`` that was sent for translation.
        translated: The translated text returned by the model.

    Returns:
        The reassembled translated document.
    """
    scaffold = make_scaffold(content, to_translate)
    print("scaffold:")
    print(scaffold.template)

    # Get original text sections to maintain structure
    original_sections = to_translate.split("\n\n")

    # Split markdown sections to get headers and anchors
    divided = split_markdown_sections(to_translate)
    print("divided:")
    print(divided)
    anchors = get_anchors(divided)

    # Split translated content by markdown sections
    translated_divided = split_markdown_sections(translated)
    print("translated divided:")
    print(translated_divided)

    translated_titles = translated_divided[1::3]

    # Ensure we have the same number of headers as the original
    if len(translated_titles) != len(anchors):
        print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_titles)}")
        # Adjust anchors list to match translated headers
        if len(translated_titles) < len(anchors):
            anchors = anchors[: len(translated_titles)]
        else:
            # Add empty anchors for extra headers
            anchors.extend([""] * (len(translated_titles) - len(anchors)))

    # Offsets of the heading lines themselves. The pattern is the one used by
    # split_markdown_sections, so match i corresponds to title i. Searching
    # for the title text instead would hit its first occurrence anywhere in
    # the document (earlier prose, a code sample) and misjudge the position.
    header_positions = [
        match.start()
        for match in re.finditer(r"^(#+\s+)(.*)$", translated, flags=re.MULTILINE)
    ]

    # Add anchors to translated headers only if they're not in code blocks
    for i, (title, anchor, position) in enumerate(
        zip(translated_titles, anchors, header_positions)
    ):
        # Padded (empty) anchors are skipped so no trailing space is appended.
        if anchor and not is_in_code_block(translated, position):
            translated_divided[1 + i * 3] = f"{title} {anchor}"

    # Reconstruct translated content; the list is always a whole number of
    # (level, title, content) triples, so a plain join rebuilds the text.
    reconstructed_translated = "".join(translated_divided)

    # Split by double newlines to match original structure
    translated_sections = reconstructed_translated.split("\n\n")

    placeholder_count = scaffold.template.count("$hf_i18n_placeholder")

    print("scaffold template count:")
    print(placeholder_count)
    print("original sections length:")
    print(len(original_sections))
    print("translated sections length:")
    print(len(translated_sections))

    # Ensure section counts match
    if len(translated_sections) < placeholder_count:
        # Add empty sections if translated has fewer sections
        translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
    elif len(translated_sections) > placeholder_count:
        # Truncate if translated has more sections
        translated_sections = translated_sections[:placeholder_count]

    # After padding/truncation the counts are equal by construction, so no
    # further mismatch check is needed.
    return scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
    )
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def llm_translate(to_translate: str) -> tuple[str, str]:
    """Send the prompt to a Claude model and return its reply.

    The backend is chosen from the environment: ``ANTHROPIC_API_KEY`` selects
    the Anthropic API through ``ChatAnthropic``; otherwise
    ``AWS_BEARER_TOKEN_BEDROCK`` selects AWS Bedrock through ``boto3``.

    Args:
        to_translate: The complete prompt text.

    Returns:
        ``(info, text)`` where ``info`` is a fixed note for the Anthropic path
        or the stringified ``usage`` entry of the Bedrock response, and
        ``text`` is the model output.

    Raises:
        ValueError: If neither environment variable is set.
    """
    anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
    aws_bearer_token_bedrock = os.environ.get("AWS_BEARER_TOKEN_BEDROCK")

    if anthropic_api_key:
        # Use Anthropic API Key
        model = ChatAnthropic(
            model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
        )
        ai_message = model.invoke(to_translate)
        cb = "Anthropic API Key used"
        # NOTE(review): assumes ``ai_message.content`` is a plain string here -- confirm.
        return str(cb), ai_message.content

    if aws_bearer_token_bedrock:
        # Use AWS Bedrock with bearer token (assuming standard AWS credential chain is configured)
        # Note: boto3 does not directly use a 'bearer_token' named environment variable for SigV4 authentication.
        # It relies on AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, or IAM roles.
        # If AWS_BEARER_TOKEN_BEDROCK is meant to be one of these, it should be renamed accordingly.
        # For now, we proceed assuming standard AWS credential chain is configured to pick up credentials.
        client = boto3.client("bedrock-runtime", region_name="eu-north-1")

        body = {
            "messages": [
                {"role": "user", "content": to_translate}
            ],
            "max_tokens": 128000,
            "anthropic_version": "bedrock-2023-05-31"
        }

        response = client.invoke_model(
            modelId="arn:aws:bedrock:eu-north-1:235729104418:inference-profile/eu.anthropic.claude-3-7-sonnet-20250219-v1:0",
            contentType="application/json",
            accept="application/json",
            body=json.dumps(body),
        )
        result = json.loads(response["body"].read())
        cb = result["usage"]
        ai_message = result["content"][0]["text"]

        return str(cb), ai_message

    raise ValueError("No API key found for translation. Please set ANTHROPIC_API_KEY or AWS_BEARER_TOKEN_BEDROCK environment variable.")
|
translator/model.py
CHANGED
|
@@ -1,70 +1,70 @@
|
|
| 1 |
-
from enum import Enum, unique
|
| 2 |
-
|
| 3 |
-
from pydantic import BaseModel, computed_field
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
@unique
|
| 7 |
-
class Languages(Enum):
|
| 8 |
-
az = "az"
|
| 9 |
-
bn = "bn"
|
| 10 |
-
de = "de"
|
| 11 |
-
em = "em"
|
| 12 |
-
es = "es"
|
| 13 |
-
fa = "fa"
|
| 14 |
-
fr = "fr"
|
| 15 |
-
he = "he"
|
| 16 |
-
hu = "hu"
|
| 17 |
-
id = "id"
|
| 18 |
-
it = "it"
|
| 19 |
-
ja = "ja"
|
| 20 |
-
ko = "ko"
|
| 21 |
-
pl = "pl"
|
| 22 |
-
pt = "pt"
|
| 23 |
-
ru = "ru"
|
| 24 |
-
tr = "tr"
|
| 25 |
-
uk = "uk"
|
| 26 |
-
ur = "ur"
|
| 27 |
-
vi = "vi"
|
| 28 |
-
yo = "yo"
|
| 29 |
-
zh = "zh"
|
| 30 |
-
zh_hant = "zh-hant"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
class TranslationDoc(BaseModel):
|
| 34 |
-
official_lang: str = "en"
|
| 35 |
-
translation_lang: str
|
| 36 |
-
original_file: str
|
| 37 |
-
translation_file: str | None = None
|
| 38 |
-
translation_exists: bool
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
class Summary(BaseModel):
|
| 42 |
-
lang: str
|
| 43 |
-
files_analyzed: int = 0
|
| 44 |
-
files_translated: int = 0
|
| 45 |
-
files_outdated: int = 0
|
| 46 |
-
files_missing_translation: int = 0
|
| 47 |
-
files: list[TranslationDoc] = []
|
| 48 |
-
|
| 49 |
-
@computed_field # type: ignore
|
| 50 |
-
@property
|
| 51 |
-
def percentage_missing_translation(self) -> float:
|
| 52 |
-
try:
|
| 53 |
-
return (
|
| 54 |
-
100 * float(self.files_missing_translation) / float(self.files_analyzed)
|
| 55 |
-
)
|
| 56 |
-
except Exception:
|
| 57 |
-
return 0.0
|
| 58 |
-
|
| 59 |
-
def append_file(self, doc: TranslationDoc) -> None:
|
| 60 |
-
self.files.append(doc)
|
| 61 |
-
self.files_analyzed += 1
|
| 62 |
-
|
| 63 |
-
if doc.translation_exists:
|
| 64 |
-
self.files_translated += 1
|
| 65 |
-
|
| 66 |
-
if not doc.translation_exists:
|
| 67 |
-
self.files_missing_translation += 1
|
| 68 |
-
|
| 69 |
-
def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
|
| 70 |
-
return list(filter(lambda d: not d.translation_exists, self.files))[:length]
|
|
|
|
| 1 |
+
from enum import Enum, unique
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, computed_field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@unique
class Languages(Enum):
    """Language codes a documentation translation can target.

    Each member's value is the language-code string. ``zh_hant`` is the one
    member whose value differs from its name (``"zh-hant"``), because a
    hyphen is not allowed in a Python identifier.
    """

    az = "az"
    bn = "bn"
    de = "de"
    em = "em"
    es = "es"
    fa = "fa"
    fr = "fr"
    he = "he"
    hu = "hu"
    id = "id"
    it = "it"
    ja = "ja"
    ko = "ko"
    pl = "pl"
    pt = "pt"
    ru = "ru"
    tr = "tr"
    uk = "uk"
    ur = "ur"
    vi = "vi"
    yo = "yo"
    zh = "zh"
    zh_hant = "zh-hant"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class TranslationDoc(BaseModel):
    """One source document together with the state of its translation."""

    # Language code of the source document; defaults to English.
    official_lang: str = "en"
    # Language code of the translation.
    translation_lang: str
    # Path of the source document.
    original_file: str
    # Path of the translated document; optional, None when not supplied.
    translation_file: str | None = None
    # True when the translated document is known to exist.
    translation_exists: bool
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class Summary(BaseModel):
    """Running tally of translation coverage for a single language."""

    lang: str
    files_analyzed: int = 0
    files_translated: int = 0
    # Declared for reporting; none of the methods below modifies it.
    files_outdated: int = 0
    files_missing_translation: int = 0
    files: list[TranslationDoc] = []

    @computed_field  # type: ignore
    @property
    def percentage_missing_translation(self) -> float:
        """Return missing translations as a percentage of analyzed files.

        Any failure while computing the ratio (for instance a division by
        zero before any file has been recorded) yields ``0.0``.
        """
        try:
            missing = float(self.files_missing_translation)
            analyzed = float(self.files_analyzed)
            return 100 * missing / analyzed
        except Exception:
            return 0.0

    def append_file(self, doc: TranslationDoc) -> None:
        """Store ``doc`` and bump the counters that apply to it."""
        self.files.append(doc)
        self.files_analyzed += 1

        # A document is counted either as translated or as missing.
        if doc.translation_exists:
            self.files_translated += 1
        else:
            self.files_missing_translation += 1

    def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
        """Return at most ``length`` recorded documents lacking a translation.

        Documents keep the order in which they were appended.
        """
        untranslated = [doc for doc in self.files if not doc.translation_exists]
        return untranslated[:length]
|
translator/project_config.py
CHANGED
|
@@ -1,48 +1,48 @@
|
|
| 1 |
-
"""Project configuration for different HuggingFace repositories."""
|
| 2 |
-
|
| 3 |
-
from dataclasses import dataclass
|
| 4 |
-
from typing import Dict
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
@dataclass
|
| 8 |
-
class ProjectConfig:
|
| 9 |
-
"""Configuration for a specific HuggingFace project."""
|
| 10 |
-
name: str
|
| 11 |
-
repo_url: str
|
| 12 |
-
api_url: str
|
| 13 |
-
docs_path: str
|
| 14 |
-
github_issues: Dict[str, str] # language -> issue_id
|
| 15 |
-
reference_pr_url: str
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
# Project configurations
|
| 19 |
-
PROJECTS = {
|
| 20 |
-
"transformers": ProjectConfig(
|
| 21 |
-
name="Transformers",
|
| 22 |
-
repo_url="https://github.com/huggingface/transformers",
|
| 23 |
-
api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
|
| 24 |
-
docs_path="docs/source",
|
| 25 |
-
github_issues={"ko": "20179"},
|
| 26 |
-
reference_pr_url="https://github.com/huggingface/transformers/pull/24968"
|
| 27 |
-
),
|
| 28 |
-
"smolagents": ProjectConfig(
|
| 29 |
-
name="SmolAgents",
|
| 30 |
-
repo_url="https://github.com/huggingface/smolagents",
|
| 31 |
-
api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
|
| 32 |
-
docs_path="docs/source",
|
| 33 |
-
github_issues={"ko": "20179"}, # To be filled when issue is created
|
| 34 |
-
reference_pr_url="https://github.com/huggingface/smolagents/pull/1581" # To be filled with actual PR URL
|
| 35 |
-
)
|
| 36 |
-
}
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def get_project_config(project_key: str) -> ProjectConfig:
|
| 40 |
-
"""Get project configuration by key."""
|
| 41 |
-
if project_key not in PROJECTS:
|
| 42 |
-
raise ValueError(f"Unknown project: {project_key}. Available: {list(PROJECTS.keys())}")
|
| 43 |
-
return PROJECTS[project_key]
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def get_available_projects() -> list[str]:
|
| 47 |
-
"""Get list of available project keys."""
|
| 48 |
return list(PROJECTS.keys())
|
|
|
|
| 1 |
+
"""Project configuration for different HuggingFace repositories."""
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import Dict
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
@dataclass
class ProjectConfig:
    """Settings describing one HuggingFace project whose docs are handled."""

    # Display name of the project.
    name: str
    # URL of the project's GitHub repository.
    repo_url: str
    # GitHub API URL used to list the repository tree.
    api_url: str
    # Root of the documentation inside the repository.
    docs_path: str
    # Maps a language code to the id of its GitHub tracking issue.
    github_issues: Dict[str, str]
    # URL of a pull request kept as a reference.
    reference_pr_url: str
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# Registry of the supported projects, keyed by the identifier callers pass in.
PROJECTS = {
    "transformers": ProjectConfig(
        name="Transformers",
        repo_url="https://github.com/huggingface/transformers",
        api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
        docs_path="docs/source",
        github_issues={"ko": "20179"},
        reference_pr_url="https://github.com/huggingface/transformers/pull/24968",
    ),
    "smolagents": ProjectConfig(
        name="SmolAgents",
        repo_url="https://github.com/huggingface/smolagents",
        api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
        docs_path="docs/source",
        # NOTE(review): this is the same issue id as the transformers entry;
        # it looks like a placeholder to be filled when the issue is created.
        github_issues={"ko": "20179"},
        # To be filled with actual PR URL
        reference_pr_url="https://github.com/huggingface/smolagents/pull/1581",
    ),
}
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_project_config(project_key: str) -> ProjectConfig:
    """Return the configuration registered under ``project_key``.

    Raises:
        ValueError: if ``project_key`` is not a key of ``PROJECTS``; the
            message lists the keys that are available.
    """
    if project_key in PROJECTS:
        return PROJECTS[project_key]
    raise ValueError(f"Unknown project: {project_key}. Available: {list(PROJECTS.keys())}")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def get_available_projects() -> list[str]:
    """Return the key of every registered project, in registration order."""
    return [project_key for project_key in PROJECTS]
|
translator/prompt_glossary.py
CHANGED
|
@@ -1,126 +1,126 @@
|
|
| 1 |
-
PROMPT_WITH_GLOSSARY = """
|
| 2 |
-
You have a glossary of terms with their Korean translations. When translating a sentence, you need to check if any of the words in the sentence are in the glossary, and if so, translate them according to the provided Korean terms. Here is the glossary:
|
| 3 |
-
|
| 4 |
-
🔹 Glossary (English → Korean):
|
| 5 |
-
- revision: 개정
|
| 6 |
-
- method: 메소드
|
| 7 |
-
- secrets: 비밀값
|
| 8 |
-
- search helper: 검색 헬퍼
|
| 9 |
-
- logging level: 로그 레벨
|
| 10 |
-
- workflow: 워크플로우
|
| 11 |
-
- corner case: 코너 케이스
|
| 12 |
-
- tokenization: 토큰화
|
| 13 |
-
- architecture: 아키텍처
|
| 14 |
-
- attention mask: 어텐션 마스크
|
| 15 |
-
- backbone: 백본
|
| 16 |
-
- argmax: argmax
|
| 17 |
-
- beam search: 빔 서치
|
| 18 |
-
- clustering: 군집화
|
| 19 |
-
- configuration: 구성
|
| 20 |
-
- context: 문맥
|
| 21 |
-
- cross entropy: 교차 엔트로피
|
| 22 |
-
- cross-attention: 크로스 어텐션
|
| 23 |
-
- dictionary: 딕셔너리
|
| 24 |
-
- entry: 엔트리
|
| 25 |
-
- few shot: 퓨샷
|
| 26 |
-
- flatten: 평탄화
|
| 27 |
-
- ground truth: 정답
|
| 28 |
-
- head: 헤드
|
| 29 |
-
- helper function: 헬퍼 함수
|
| 30 |
-
- image captioning: 이미지 캡셔닝
|
| 31 |
-
- image patch: 이미지 패치
|
| 32 |
-
- inference: 추론
|
| 33 |
-
- instance: 인스턴스
|
| 34 |
-
- Instantiate: 인스턴스화
|
| 35 |
-
- knowledge distillation: 지식 증류
|
| 36 |
-
- labels: 레이블
|
| 37 |
-
- large language models (LLM): 대규모 언어 모델
|
| 38 |
-
- layer: 레이어
|
| 39 |
-
- learning rate scheduler: Learning Rate Scheduler
|
| 40 |
-
- localization: 로컬리제이션
|
| 41 |
-
- log mel-filter bank: 로그 멜 필터 뱅크
|
| 42 |
-
- look-up table: 룩업 테이블
|
| 43 |
-
- loss function: 손실 함수
|
| 44 |
-
- machine learning: 머신 러닝
|
| 45 |
-
- mapping: 매핑
|
| 46 |
-
- masked language modeling (MLM): 마스크드 언어 모델
|
| 47 |
-
- malware: 악성코드
|
| 48 |
-
- metric: 지표
|
| 49 |
-
- mixed precision: 혼합 정밀도
|
| 50 |
-
- modality: 모달리티
|
| 51 |
-
- monolingual model: 단일 언어 모델
|
| 52 |
-
- multi gpu: 다중 GPU
|
| 53 |
-
- multilingual model: 다국어 모델
|
| 54 |
-
- parsing: 파싱
|
| 55 |
-
- perplexity (PPL): 펄플렉서티(Perplexity)
|
| 56 |
-
- pipeline: 파이프라인
|
| 57 |
-
- pixel values: 픽셀 값
|
| 58 |
-
- pooling: 풀링
|
| 59 |
-
- position IDs: 위치 ID
|
| 60 |
-
- preprocessing: 전처리
|
| 61 |
-
- prompt: 프롬프트
|
| 62 |
-
- pythonic: 파이써닉
|
| 63 |
-
- query: 쿼리
|
| 64 |
-
- question answering: 질의 응답
|
| 65 |
-
- raw audio waveform: 원시 오디오 파형
|
| 66 |
-
- recurrent neural network (RNN): 순환 신경망
|
| 67 |
-
- accelerator: 가속기
|
| 68 |
-
- Accelerate: Accelerate
|
| 69 |
-
- architecture: 아키텍처
|
| 70 |
-
- arguments: 인수
|
| 71 |
-
- attention mask: 어텐션 마스크
|
| 72 |
-
- augmentation: 증강
|
| 73 |
-
- autoencoding models: 오토인코딩 모델
|
| 74 |
-
- autoregressive models: 자기회귀 모델
|
| 75 |
-
- backward: 역방향
|
| 76 |
-
- bounding box: 바운딩 박스
|
| 77 |
-
- causal language modeling: 인과적 언어 모델링(causal language modeling)
|
| 78 |
-
- channel: 채널
|
| 79 |
-
- checkpoint: 체크포인트(checkpoint)
|
| 80 |
-
- chunk: 묶음
|
| 81 |
-
- computer vision: 컴퓨터 비전
|
| 82 |
-
- convolution: 합성곱
|
| 83 |
-
- crop: 자르기
|
| 84 |
-
- custom: 사용자 정의
|
| 85 |
-
- customize: 맞춤 설정하다
|
| 86 |
-
- data collator: 데이터 콜레이터
|
| 87 |
-
- dataset: 데이터 세트
|
| 88 |
-
- decoder input IDs: 디코더 입력 ID
|
| 89 |
-
- decoder models: 디코더 모델
|
| 90 |
-
- deep learning (DL): 딥러닝
|
| 91 |
-
- directory: 디렉터리
|
| 92 |
-
- distributed training: 분산 학습
|
| 93 |
-
- downstream: 다운스트림
|
| 94 |
-
- encoder models: 인코더 모델
|
| 95 |
-
- entity: 개체
|
| 96 |
-
- epoch: 에폭
|
| 97 |
-
- evaluation method: 평가 방법
|
| 98 |
-
- feature extraction: 특성 추출
|
| 99 |
-
- feature matrix: 특성 행렬(feature matrix)
|
| 100 |
-
- fine-tunning: 미세 조정
|
| 101 |
-
- finetuned models: 미세 조정 모델
|
| 102 |
-
- hidden state: 은닉 상태
|
| 103 |
-
- hyperparameter: 하이퍼파라미터
|
| 104 |
-
- learning: 학습
|
| 105 |
-
- load: 가져오다
|
| 106 |
-
- method: 메소드
|
| 107 |
-
- optimizer: 옵티마이저
|
| 108 |
-
- pad (padding): 패드 (패딩)
|
| 109 |
-
- parameter: 매개변수
|
| 110 |
-
- pretrained model: 사전훈련된 모델
|
| 111 |
-
- separator (* [SEP]를 부르는 이름): 분할 토큰
|
| 112 |
-
- sequence: 시퀀스
|
| 113 |
-
- silent error: 조용한 오류
|
| 114 |
-
- token: 토큰
|
| 115 |
-
- tokenizer: 토크나이저
|
| 116 |
-
- training: 훈련
|
| 117 |
-
- workflow: 워크플로우
|
| 118 |
-
|
| 119 |
-
📌 Instructions:
|
| 120 |
-
1. Whenever a source term from the glossary appears **in any form** (full match or partial match within a larger phrase), **replace it with the exact Korean translation** from the glossary, keeping the rest of the phrase in Korean.
|
| 121 |
-
- Example: “Attention Interface” → “어텐션 인터페이스”
|
| 122 |
-
- Example: “Architecture details” → “아키텍처 상세”
|
| 123 |
-
2. Non-glossary words should be translated naturally, respecting context and technical nuance.
|
| 124 |
-
|
| 125 |
-
Please revise the translated sentences accordingly using the terms provided in this glossary.
|
| 126 |
-
"""
|
|
|
|
| 1 |
+
# Prompt fragment carrying the English -> Korean glossary that the model is
# asked to apply when revising a translation. Each term is listed once; the
# text is sent verbatim, so every character here is part of the prompt.
PROMPT_WITH_GLOSSARY = """
You have a glossary of terms with their Korean translations. When translating a sentence, you need to check if any of the words in the sentence are in the glossary, and if so, translate them according to the provided Korean terms. Here is the glossary:

🔹 Glossary (English → Korean):
- revision: 개정
- method: 메소드
- secrets: 비밀값
- search helper: 검색 헬퍼
- logging level: 로그 레벨
- workflow: 워크플로우
- corner case: 코너 케이스
- tokenization: 토큰화
- architecture: 아키텍처
- attention mask: 어텐션 마스크
- backbone: 백본
- argmax: argmax
- beam search: 빔 서치
- clustering: 군집화
- configuration: 구성
- context: 문맥
- cross entropy: 교차 엔트로피
- cross-attention: 크로스 어텐션
- dictionary: 딕셔너리
- entry: 엔트리
- few shot: 퓨샷
- flatten: 평탄화
- ground truth: 정답
- head: 헤드
- helper function: 헬퍼 함수
- image captioning: 이미지 캡셔닝
- image patch: 이미지 패치
- inference: 추론
- instance: 인스턴스
- Instantiate: 인스턴스화
- knowledge distillation: 지식 증류
- labels: 레이블
- large language models (LLM): 대규모 언어 모델
- layer: 레이어
- learning rate scheduler: Learning Rate Scheduler
- localization: 로컬리제이션
- log mel-filter bank: 로그 멜 필터 뱅크
- look-up table: 룩업 테이블
- loss function: 손실 함수
- machine learning: 머신 러닝
- mapping: 매핑
- masked language modeling (MLM): 마스크드 언어 모델
- malware: 악성코드
- metric: 지표
- mixed precision: 혼합 정밀도
- modality: 모달리티
- monolingual model: 단일 언어 모델
- multi gpu: 다중 GPU
- multilingual model: 다국어 모델
- parsing: 파싱
- perplexity (PPL): 펄플렉서티(Perplexity)
- pipeline: 파이프라인
- pixel values: 픽셀 값
- pooling: 풀링
- position IDs: 위치 ID
- preprocessing: 전처리
- prompt: 프롬프트
- pythonic: 파이써닉
- query: 쿼리
- question answering: 질의 응답
- raw audio waveform: 원시 오디오 파형
- recurrent neural network (RNN): 순환 신경망
- accelerator: 가속기
- Accelerate: Accelerate
- arguments: 인수
- augmentation: 증강
- autoencoding models: 오토인코딩 모델
- autoregressive models: 자기회귀 모델
- backward: 역방향
- bounding box: 바운딩 박스
- causal language modeling: 인과적 언어 모델링(causal language modeling)
- channel: 채널
- checkpoint: 체크포인트(checkpoint)
- chunk: 묶음
- computer vision: 컴퓨터 비전
- convolution: 합성곱
- crop: 자르기
- custom: 사용자 정의
- customize: 맞춤 설정하다
- data collator: 데이터 콜레이터
- dataset: 데이터 세트
- decoder input IDs: 디코더 입력 ID
- decoder models: 디코더 모델
- deep learning (DL): 딥러닝
- directory: 디렉터리
- distributed training: 분산 학습
- downstream: 다운스트림
- encoder models: 인코더 모델
- entity: 개체
- epoch: 에폭
- evaluation method: 평가 방법
- feature extraction: 특성 추출
- feature matrix: 특성 행렬(feature matrix)
- fine-tuning: 미세 조정
- finetuned models: 미세 조정 모델
- hidden state: 은닉 상태
- hyperparameter: 하이퍼파라미터
- learning: 학습
- load: 가져오다
- optimizer: 옵티마이저
- pad (padding): 패드 (패딩)
- parameter: 매개변수
- pretrained model: 사전훈련된 모델
- separator (* [SEP]를 부르는 이름): 분할 토큰
- sequence: 시퀀스
- silent error: 조용한 오류
- token: 토큰
- tokenizer: 토크나이저
- training: 훈련

📌 Instructions:
1. Whenever a source term from the glossary appears **in any form** (full match or partial match within a larger phrase), **replace it with the exact Korean translation** from the glossary, keeping the rest of the phrase in Korean.
   - Example: “Attention Interface” → “어텐션 인터페이스”
   - Example: “Architecture details” → “아키텍처 상세”
2. Non-glossary words should be translated naturally, respecting context and technical nuance.

Please revise the translated sentences accordingly using the terms provided in this glossary.
"""
|
translator/retriever.py
CHANGED
|
@@ -1,199 +1,199 @@
|
|
| 1 |
-
import re
|
| 2 |
-
import os
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
|
| 5 |
-
import requests
|
| 6 |
-
|
| 7 |
-
from .model import Languages, Summary, TranslationDoc
|
| 8 |
-
from .project_config import get_project_config
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def get_github_repo_files(project: str = "transformers"):
|
| 12 |
-
"""
|
| 13 |
-
Get github repo files
|
| 14 |
-
"""
|
| 15 |
-
config = get_project_config(project)
|
| 16 |
-
|
| 17 |
-
# Add GitHub token if available to avoid rate limiting (optional)
|
| 18 |
-
headers = {}
|
| 19 |
-
github_token = os.environ.get("GITHUB_TOKEN")
|
| 20 |
-
if github_token:
|
| 21 |
-
headers["Authorization"] = f"token {github_token}"
|
| 22 |
-
|
| 23 |
-
response = requests.get(config.api_url, headers=headers)
|
| 24 |
-
|
| 25 |
-
# Handle rate limit with helpful message
|
| 26 |
-
if response.status_code == 403 and "rate limit" in response.text.lower():
|
| 27 |
-
raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
|
| 28 |
-
|
| 29 |
-
data = response.json()
|
| 30 |
-
all_items = data.get("tree", [])
|
| 31 |
-
|
| 32 |
-
file_paths = [
|
| 33 |
-
item["path"]
|
| 34 |
-
for item in all_items
|
| 35 |
-
if item["type"] == "blob" and (item["path"].startswith("docs"))
|
| 36 |
-
]
|
| 37 |
-
return file_paths
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
|
| 41 |
-
"""
|
| 42 |
-
Get open PR in the github issue, filtered by title containing '[i18n-KO]'.
|
| 43 |
-
"""
|
| 44 |
-
config = get_project_config(project)
|
| 45 |
-
issue_id = config.github_issues.get(lang)
|
| 46 |
-
|
| 47 |
-
# For projects without GitHub issue tracking, still search for PRs
|
| 48 |
-
if not issue_id:
|
| 49 |
-
raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
|
| 50 |
-
|
| 51 |
-
# Require all_files parameter
|
| 52 |
-
if all_files is None:
|
| 53 |
-
raise ValueError("Repository file list must be provided")
|
| 54 |
-
|
| 55 |
-
headers = {
|
| 56 |
-
"Accept": "application/vnd.github+json",
|
| 57 |
-
}
|
| 58 |
-
|
| 59 |
-
# Add GitHub token if available to avoid rate limiting (optional)
|
| 60 |
-
github_token = os.environ.get("GITHUB_TOKEN")
|
| 61 |
-
if github_token:
|
| 62 |
-
headers["Authorization"] = f"token {github_token}"
|
| 63 |
-
|
| 64 |
-
all_open_prs = []
|
| 65 |
-
page = 1
|
| 66 |
-
per_page = 100 # Maximum allowed by GitHub API
|
| 67 |
-
|
| 68 |
-
while True:
|
| 69 |
-
repo_path = config.repo_url.replace("https://github.com/", "")
|
| 70 |
-
url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
|
| 71 |
-
response = requests.get(url, headers=headers)
|
| 72 |
-
|
| 73 |
-
if response.status_code == 403 and "rate limit" in response.text.lower():
|
| 74 |
-
raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
|
| 75 |
-
elif response.status_code != 200:
|
| 76 |
-
raise Exception(f"GitHub API error: {response.status_code} {response.text}")
|
| 77 |
-
|
| 78 |
-
page_prs = response.json()
|
| 79 |
-
if not page_prs: # No more PRs
|
| 80 |
-
break
|
| 81 |
-
|
| 82 |
-
all_open_prs.extend(page_prs)
|
| 83 |
-
page += 1
|
| 84 |
-
|
| 85 |
-
# Break if we got less than per_page results (last page)
|
| 86 |
-
if len(page_prs) < per_page:
|
| 87 |
-
break
|
| 88 |
-
|
| 89 |
-
filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
|
| 90 |
-
|
| 91 |
-
# Pattern to match filenames after "Translated" keyword
|
| 92 |
-
pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")
|
| 93 |
-
|
| 94 |
-
def find_original_file_path(filename_from_title, all_files):
|
| 95 |
-
"""Find the exact file path from repo files by matching filename"""
|
| 96 |
-
if not filename_from_title:
|
| 97 |
-
return None
|
| 98 |
-
|
| 99 |
-
# Remove .md extension for matching
|
| 100 |
-
base_name = filename_from_title.replace('.md', '')
|
| 101 |
-
|
| 102 |
-
# Look for exact matches in repo files
|
| 103 |
-
for file_path in all_files:
|
| 104 |
-
if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
|
| 105 |
-
file_base = file_path.split("/")[-1].replace('.md', '')
|
| 106 |
-
if file_base == base_name:
|
| 107 |
-
return file_path
|
| 108 |
-
|
| 109 |
-
# If no exact match, fallback to simple path
|
| 110 |
-
return f"docs/source/en/{filename_from_title}"
|
| 111 |
-
|
| 112 |
-
filenames = []
|
| 113 |
-
pr_info_list = []
|
| 114 |
-
|
| 115 |
-
for pr in filtered_prs:
|
| 116 |
-
match = pattern.search(pr["title"])
|
| 117 |
-
if match:
|
| 118 |
-
# Use group 1 (with backticks) or group 2 (without backticks)
|
| 119 |
-
filename = match.group(1) or match.group(2)
|
| 120 |
-
# Add .md extension if not present
|
| 121 |
-
if not filename.endswith('.md'):
|
| 122 |
-
filename += '.md'
|
| 123 |
-
|
| 124 |
-
# Find the correct file path by matching filename
|
| 125 |
-
correct_path = None
|
| 126 |
-
if filename:
|
| 127 |
-
# Remove .md extension for matching
|
| 128 |
-
base_name = filename.replace('.md', '')
|
| 129 |
-
|
| 130 |
-
# Look for exact matches in repo files
|
| 131 |
-
for file_path in all_files:
|
| 132 |
-
if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
|
| 133 |
-
file_base = file_path.split("/")[-1].replace('.md', '')
|
| 134 |
-
if file_base == base_name:
|
| 135 |
-
correct_path = file_path
|
| 136 |
-
break
|
| 137 |
-
|
| 138 |
-
# If no exact match, fallback to simple path
|
| 139 |
-
if not correct_path:
|
| 140 |
-
correct_path = f"docs/source/en/{filename}"
|
| 141 |
-
if correct_path:
|
| 142 |
-
filenames.append(correct_path)
|
| 143 |
-
pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
|
| 144 |
-
return filenames, pr_info_list
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
| 148 |
-
"""
|
| 149 |
-
Retrieve missing docs
|
| 150 |
-
"""
|
| 151 |
-
|
| 152 |
-
report = f"""
|
| 153 |
-
| Item | Count | Percentage |
|
| 154 |
-
|------|-------|------------|
|
| 155 |
-
| 📂 HuggingFaces docs | {summary.files_analyzed} | - |
|
| 156 |
-
| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
|
| 157 |
-
"""
|
| 158 |
-
print(report)
|
| 159 |
-
first_missing_docs = list()
|
| 160 |
-
for file in summary.first_missing_translation_files(table_size):
|
| 161 |
-
first_missing_docs.append(file.original_file)
|
| 162 |
-
|
| 163 |
-
print(first_missing_docs)
|
| 164 |
-
return report, first_missing_docs
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
|
| 168 |
-
"""
|
| 169 |
-
Generate a report for the translated docs
|
| 170 |
-
"""
|
| 171 |
-
if docs_file is None:
|
| 172 |
-
raise ValueError("Repository file list must be provided")
|
| 173 |
-
|
| 174 |
-
base_docs_path = Path("docs/source")
|
| 175 |
-
en_docs_path = Path("docs/source/en")
|
| 176 |
-
|
| 177 |
-
lang = Languages[target_lang]
|
| 178 |
-
summary = Summary(lang=lang.value)
|
| 179 |
-
|
| 180 |
-
for file in docs_file:
|
| 181 |
-
if file.endswith(".md"):
|
| 182 |
-
try:
|
| 183 |
-
file_relative_path = Path(file).relative_to(en_docs_path)
|
| 184 |
-
except ValueError:
|
| 185 |
-
continue
|
| 186 |
-
|
| 187 |
-
translated_path = os.path.join(
|
| 188 |
-
base_docs_path, lang.value, file_relative_path
|
| 189 |
-
)
|
| 190 |
-
translation_exists = translated_path in docs_file
|
| 191 |
-
|
| 192 |
-
doc = TranslationDoc(
|
| 193 |
-
translation_lang=lang.value,
|
| 194 |
-
original_file=file,
|
| 195 |
-
translation_file=translated_path,
|
| 196 |
-
translation_exists=translation_exists,
|
| 197 |
-
)
|
| 198 |
-
summary.append_file(doc)
|
| 199 |
-
return retrieve(summary, top_k)
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import requests
|
| 6 |
+
|
| 7 |
+
from .model import Languages, Summary, TranslationDoc
|
| 8 |
+
from .project_config import get_project_config
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def get_github_repo_files(project: str = "transformers"):
    """Return the paths of the files under ``docs`` in the project's repository.

    The listing is fetched from the git-trees URL stored in the project
    configuration. When ``GITHUB_TOKEN`` is set in the environment it is sent
    with the request to avoid rate limiting.

    Args:
        project: key of the project configuration to use.

    Returns:
        Paths of every ``blob`` entry of the tree whose path starts with
        ``docs``.

    Raises:
        ValueError: if ``project`` is not a known project key.
        Exception: if GitHub reports a rate limit or any other non-200
            status. Previously only the rate limit was detected and other
            failures silently produced an empty list.
    """
    config = get_project_config(project)

    # Add GitHub token if available to avoid rate limiting (optional)
    headers = {}
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(config.api_url, headers=headers, timeout=30)

    # Handle rate limit with helpful message
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
    # An error payload has no "tree" key; fail loudly instead of returning [].
    if response.status_code != 200:
        raise Exception(f"GitHub API error: {response.status_code} {response.text}")

    all_items = response.json().get("tree", [])
    return [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
    """Find the docs covered by open pull requests titled '[i18n-KO]'.

    Every open pull request of the project's repository is fetched page by
    page. Those whose title contains ``[i18n-KO]`` and matches
    ``Translated <file> to`` are mapped to the English source path of
    ``<file>``.

    Args:
        project: key of the project configuration to use.
        lang: language whose tracking issue must be registered for the
            project. It is only used for that check; the title filter is
            fixed to ``[i18n-KO]``.
        all_files: repository file paths used to resolve a file name found in
            a title to its full path. Required despite the ``None`` default.

    Returns:
        A ``(filenames, pr_info_list)`` tuple of parallel lists: the source
        path of each matched document and the URL of its pull request.

    Raises:
        ValueError: if the project is unknown, has no issue registered for
            ``lang``, or ``all_files`` is ``None``.
        Exception: if GitHub reports a rate limit or any non-200 status.
    """
    config = get_project_config(project)
    issue_id = config.github_issues.get(lang)

    # A registered tracking issue is required before searching for PRs.
    if not issue_id:
        raise ValueError(f"⚠️ No GitHub issue registered for {project}.")

    # Require all_files parameter
    if all_files is None:
        raise ValueError("Repository file list must be provided")

    headers = {
        "Accept": "application/vnd.github+json",
    }

    # Add GitHub token if available to avoid rate limiting (optional)
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    all_open_prs = []
    page = 1
    per_page = 100  # Maximum allowed by GitHub API
    # Loop-invariant, so computed once rather than on every page.
    repo_path = config.repo_url.replace("https://github.com/", "")

    while True:
        url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code == 403 and "rate limit" in response.text.lower():
            raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
        elif response.status_code != 200:
            raise Exception(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        if not page_prs:  # No more PRs
            break

        all_open_prs.extend(page_prs)
        page += 1

        # Break if we got less than per_page results (last page)
        if len(page_prs) < per_page:
            break

    filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]

    # Pattern to match filenames after "Translated" keyword
    pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")

    def find_original_file_path(filename_from_title, all_files):
        """Find the exact file path from repo files by matching filename"""
        if not filename_from_title:
            return None

        # Remove .md extension for matching
        base_name = filename_from_title.replace('.md', '')

        # Look for exact matches in repo files
        for file_path in all_files:
            if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
                file_base = file_path.split("/")[-1].replace('.md', '')
                if file_base == base_name:
                    return file_path

        # If no exact match, fallback to simple path
        return f"docs/source/en/{filename_from_title}"

    filenames = []
    pr_info_list = []

    for pr in filtered_prs:
        match = pattern.search(pr["title"])
        if not match:
            continue

        # Use group 1 (with backticks) or group 2 (without backticks)
        filename = match.group(1) or match.group(2)
        # Add .md extension if not present
        if not filename.endswith('.md'):
            filename += '.md'

        # filename is non-empty here, so the helper always yields a path
        # (an exact match from the repo, or the fallback under docs/source/en/).
        filenames.append(find_original_file_path(filename, all_files))
        pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
    return filenames, pr_info_list
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """Build the coverage table and list the first untranslated documents.

    Both the table and the list are printed to stdout before being returned.

    Args:
        summary: coverage tally to render.
        table_size: maximum number of untranslated documents to list.

    Returns:
        A ``(report, first_missing_docs)`` tuple: the Markdown table and the
        source paths of the first untranslated documents.
    """
    coverage_table = f"""
| Item | Count | Percentage |
|------|-------|------------|
| 📂 HuggingFaces docs | {summary.files_analyzed} | - |
| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
"""
    print(coverage_table)

    missing_sources = [
        doc.original_file
        for doc in summary.first_missing_translation_files(table_size)
    ]
    print(missing_sources)

    return coverage_table, missing_sources
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
    """Generate a translation coverage report for ``target_lang``.

    Every Markdown file under ``docs/source/en`` in ``docs_file`` is paired
    with the path its translation would have under
    ``docs/source/<language>``. The translation counts as existing when that
    path is itself present in ``docs_file``.

    Args:
        project: accepted for interface compatibility; not read here.
        target_lang: name of a ``Languages`` member (for example ``"ko"``).
        top_k: maximum number of untranslated documents to return.
        docs_file: repository file paths. Required despite the ``None``
            default.

    Returns:
        The ``(report, first_missing_docs)`` tuple produced by ``retrieve``.

    Raises:
        ValueError: if ``docs_file`` is ``None``.
        KeyError: if ``target_lang`` is not the name of a ``Languages`` member.
    """
    if docs_file is None:
        raise ValueError("Repository file list must be provided")

    base_docs_path = Path("docs/source")
    en_docs_path = Path("docs/source/en")

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    # Built once so each membership test below is O(1) instead of a list scan.
    known_files = set(docs_file)

    for file in docs_file:
        if not file.endswith(".md"):
            continue
        try:
            file_relative_path = Path(file).relative_to(en_docs_path)
        except ValueError:
            # Not an English source document.
            continue

        # Repository paths always use "/", so render the candidate in POSIX
        # form; an OS-specific join would never match on Windows.
        translated_path = (base_docs_path / lang.value / file_relative_path).as_posix()
        translation_exists = translated_path in known_files

        doc = TranslationDoc(
            translation_lang=lang.value,
            original_file=file,
            translation_file=translated_path,
            translation_exists=translation_exists,
        )
        summary.append_file(doc)
    return retrieve(summary, top_k)
|