Hyeonseo committed
Commit 432715c · 1 Parent(s): 1d2a0a6

Add MCP servers under external/ (docs-explorer & translation-reviewer)2

.github/workflows/main.yml CHANGED
@@ -1,82 +1,82 @@
1
- name: Deploy to HF Space
2
-
3
- on:
4
- push:
5
- branches: [mcp/docs-search-review]
6
- workflow_dispatch:
7
-
8
- jobs:
9
- deploy:
10
- runs-on: ubuntu-latest
11
-
12
- steps:
13
- - name: Checkout repository
14
- uses: actions/checkout@v3
15
- with:
16
- fetch-depth: 0
17
- lfs: true
18
- ref: mcp/docs-search-review # TODO: change to main later
19
-
20
- - name: Setup LFS & migrate images
21
- run: |
22
- git config --global user.email "[email protected]"
23
- git config --global user.name "GitHub Actions"
24
- git lfs install
25
- git lfs track "images/**"
26
- echo "images/** filter=lfs diff=lfs merge=lfs -text" >> .gitattributes
27
- git add .gitattributes
28
- git commit -m "Add images to LFS tracking" || echo "No changes"
29
- git add -A
30
- git diff --cached --quiet || git commit -m "Pre-migrate: commit all changes" || echo "No changes"
31
- git lfs migrate import --include="images/**" --include-ref=refs/heads/mcp/docs-search-review # TODO: change to main later
32
-
33
- - name: Deploy to Hugging Face Space
34
- env:
35
- HF_USERNAME: ${{ secrets.HF_USERNAME }}
36
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
37
- HF_SPACE_NAME: ${{ secrets.HF_SPACE_NAME }}
38
- run: |
39
- git remote add space https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME
40
- git push --force space mcp/docs-search-review:main # TODO: change to main later
41
-
42
- deploy_docs_explorer:
43
- runs-on: ubuntu-latest
44
-
45
- steps:
46
- - name: Checkout repository
47
- uses: actions/checkout@v3
48
- with:
49
- fetch-depth: 0
50
- ref: mcp/docs-search-review # TODO: change to main later
51
-
52
- - name: Push hf-translation-docs-explorer to HF Space
53
- env:
54
- HF_USERNAME: ${{ secrets.HF_USERNAME }}
55
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
56
- HF_SPACE_NAME_DOCS_EXPLORER: ${{ secrets.HF_SPACE_NAME_DOCS_EXPLORER }}
57
- run: |
58
- git subtree split --prefix=external/mcp-servers/hf-translation-docs-explorer -b docs-explorer-branch
59
-
60
- git remote add space-docs-explorer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_DOCS_EXPLORER
61
- git push --force space-docs-explorer docs-explorer-branch:main
62
-
63
- deploy_translation_reviewer:
64
- runs-on: ubuntu-latest
65
-
66
- steps:
67
- - name: Checkout repository
68
- uses: actions/checkout@v3
69
- with:
70
- fetch-depth: 0
71
- ref: mcp/docs-search-review # TODO: change to main later
72
-
73
- - name: Push hf-translation-reviewer to HF Space
74
- env:
75
- HF_USERNAME: ${{ secrets.HF_USERNAME }}
76
- HF_TOKEN: ${{ secrets.HF_TOKEN }}
77
- HF_SPACE_NAME_TRANSLATION_REVIEWER: ${{ secrets.HF_SPACE_NAME_TRANSLATION_REVIEWER }}
78
- run: |
79
- git subtree split --prefix=external/mcp-servers/hf-translation-reviewer -b translation-reviewer-branch
80
-
81
- git remote add space-translation-reviewer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_TRANSLATION_REVIEWER
82
- git push --force space-translation-reviewer translation-reviewer-branch:main
 
1
+ name: Deploy to HF Space
2
+
3
+ on:
4
+ push:
5
+ branches: [mcp/docs-search-review]
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ deploy:
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - name: Checkout repository
14
+ uses: actions/checkout@v3
15
+ with:
16
+ fetch-depth: 0
17
+ lfs: true
18
+ ref: mcp/docs-search-review # TODO: change to main later
19
+
20
+ - name: Setup LFS & migrate images
21
+ run: |
22
+ git config --global user.email "[email protected]"
23
+ git config --global user.name "GitHub Actions"
24
+ git lfs install
25
+ git lfs track "images/**"
26
+ echo "images/** filter=lfs diff=lfs merge=lfs -text" >> .gitattributes
27
+ git add .gitattributes
28
+ git commit -m "Add images to LFS tracking" || echo "No changes"
29
+ git add -A
30
+ git diff --cached --quiet || git commit -m "Pre-migrate: commit all changes" || echo "No changes"
31
+ git lfs migrate import --include="images/**" --include-ref=refs/heads/mcp/docs-search-review # TODO: change to main later
32
+
33
+ - name: Deploy to Hugging Face Space
34
+ env:
35
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
36
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
37
+ HF_SPACE_NAME: ${{ secrets.HF_SPACE_NAME }}
38
+ run: |
39
+ git remote add space https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME
40
+ git push --force space mcp/docs-search-review:main # TODO: change to main later
41
+
42
+ deploy_docs_explorer:
43
+ runs-on: ubuntu-latest
44
+
45
+ steps:
46
+ - name: Checkout repository
47
+ uses: actions/checkout@v3
48
+ with:
49
+ fetch-depth: 0
50
+ ref: mcp/docs-search-review # TODO: change to main later
51
+
52
+ - name: Push hf-translation-docs-explorer to HF Space
53
+ env:
54
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
55
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
56
+ HF_SPACE_NAME_DOCS_EXPLORER: ${{ secrets.HF_SPACE_NAME_DOCS_EXPLORER }}
57
+ run: |
58
+ git subtree split --prefix=external/mcp-servers/hf-translation-docs-explorer -b docs-explorer-branch
59
+
60
+ git remote add space-docs-explorer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_DOCS_EXPLORER
61
+ git push --force space-docs-explorer docs-explorer-branch:main
62
+
63
+ deploy_translation_reviewer:
64
+ runs-on: ubuntu-latest
65
+
66
+ steps:
67
+ - name: Checkout repository
68
+ uses: actions/checkout@v3
69
+ with:
70
+ fetch-depth: 0
71
+ ref: mcp/docs-search-review # TODO: change to main later
72
+
73
+ - name: Push hf-translation-reviewer to HF Space
74
+ env:
75
+ HF_USERNAME: ${{ secrets.HF_USERNAME }}
76
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
77
+ HF_SPACE_NAME_TRANSLATION_REVIEWER: ${{ secrets.HF_SPACE_NAME_TRANSLATION_REVIEWER }}
78
+ run: |
79
+ git subtree split --prefix=external/mcp-servers/hf-translation-reviewer -b translation-reviewer-branch
80
+
81
+ git remote add space-translation-reviewer https://$HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/$HF_USERNAME/$HF_SPACE_NAME_TRANSLATION_REVIEWER
82
+ git push --force space-translation-reviewer translation-reviewer-branch:main
.gitignore CHANGED
@@ -1,3 +1,3 @@
1
- .env
2
- */__pycache__/
3
- pr_success.log
 
1
+ .env
2
+ */__pycache__/
3
+ pr_success.log
README.md CHANGED
@@ -1,307 +1,307 @@
1
- ---
2
- title: i18n Agent - Contribute in Just 5 Minutes
3
- emoji: 🤗
4
- colorFrom: yellow
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 5.33.1
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
-
13
- <div align="center">
14
-
15
- # 🌐 [i18n-agent] Hugging Face i18n made easy
16
-
17
- *AI-powered translation agent for Hugging Face Transformers documentation internationalization*
18
-
19
- [![Hugging Face Spaces](https://img.shields.io/badge/🤗%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent)
20
- [![agent-demo-track](https://img.shields.io/badge/🤖-agent--demo--track-ff6b6b)](https://github.com/topics/agent-demo-track)
21
-
22
- ![KREW x Hugging Face Logo](images/title.png)
23
-
24
- **Streamline Hugging Face Transformers documentation translation with Claude AI**
25
- • **Auto-generate GitHub PRs** • **Multi-language support**
26
-
27
- > 🎯 **Created to address [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179)** - Making documentation translation more accessible and automated for the global community.
28
-
29
- [🚀 Try Live Demo](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent) • [📹 Watch Demo](#-demo-video) • [📖 Documentation](#-quick-start)
30
-
31
- </div>
32
-
33
- ---
34
-
35
- ## ✨ What is this?
36
-
37
- Transform the way you contribute to Hugging Face Transformers' global community! This AI agent automatically:
38
-
39
- - 🔍 **Discovers** missing translations in [Transformers documentation](https://huggingface.co/docs/transformers/en/index)
40
- - 🤖 **Translates** using Claude Sonnet 4 with technical precision
41
- - 📝 **Creates** GitHub pull requests ready for review on [huggingface/transformers](https://github.com/huggingface/transformers)
42
- - 💬 **Guides** you through the entire process
43
-
44
- > **Perfect for**: Contributors addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179), documentation maintainers, and international communities wanting to make transformer models accessible worldwide.
45
-
46
- ## 🎯 Addressing the Community Need
47
-
48
- This project was specifically created to solve [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179), which highlights the need for better internationalization tooling. Our agent tackles the core challenges mentioned in the issue:
49
-
50
- - **🚧 Translation Bottlenecks**: Automates the manual translation process
51
- - **📊 Consistency Issues**: Maintains uniform translation quality within each language
52
- - **⚡ Scalability Problems**: Handles batch translations efficiently
53
- - **🤝 Contributor Barriers**: Simplifies the PR creation workflow for translators
54
-
55
- ## 🎥 Demo Video
56
-
57
- [Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
58
-
59
- *Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
60
-
61
- ## 🚀 Quick Start
62
-
63
- ![i18n Agent demo screenshot](images/demo.png)
64
-
65
- ### Option 1: One-Click Demo (Recommended)
66
- [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-md-dark.svg)](https://huggingface.co/spaces/YOUR_USERNAME/i18n-agent)
67
-
68
- ✅ **No setup required!** Just click and start translating.
69
- ✅ **Prerequisites:** you'll need an `Anthropic API key` and a `GitHub token`.
70
-
71
- ### Option 2: Run Locally
72
-
73
- <details>
74
- <summary>🛠️ Local Installation Guide</summary>
75
-
76
- ```bash
77
- # Clone the repo
78
- git clone https://github.com/Hugging-Face-KREW/i18n-agent.git
79
- cd i18n-agent
80
-
81
- # Install dependencies
82
- make install
83
- source .venv/bin/activate
84
-
85
- # Set up your keys
86
- cp .env.example .env
87
- # Add your Anthropic API key and GitHub token
88
-
89
- # Launch the app
90
- python app.py
91
- ```
92
-
93
- </details>
94
-
95
- ## 🎯 How It Works
96
-
97
- This agent specifically targets the [Hugging Face Transformers documentation](https://huggingface.co/docs/transformers/en/index) and submits PRs addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179) in the [huggingface/transformers](https://github.com/huggingface/transformers) repository.
98
-
99
- ```mermaid
100
- graph LR
101
- A[🔍 Find Files] --> B[🤖 Translate] --> C[📝 Create PR]
102
-
103
- A --> A1[Scan transformers docs]
104
- A --> A2[Identify translation gaps]
105
-
106
- B --> B1[Claude AI translation]
107
- B --> B2[Preserve formatting]
108
-
109
- C --> C1[Auto-branch creation]
110
- C --> C2[Submit to huggingface/transformers]
111
- ```
112
-
113
- ### Step 1: 🔍 Discover Translation Targets
114
- - Select your target language (currently Korean; more languages will be supported)
115
- - Set how many files to process
116
- - Let AI identify the most impactful Transformers docs translations
117
-
118
- ### Step 2: 🤖 Smart Translation
119
- - Enter your Anthropic API key
120
- - Claude Sonnet 4 translates with context awareness
121
- - Technical terms and code blocks preserved automatically
122
-
123
- ### Step 3: 📝 Automated PR Creation
124
- - Configure GitHub credentials
125
- - System creates properly formatted pull requests for [huggingface/transformers](https://github.com/huggingface/transformers)
126
- - Optional: Use reference PRs for consistency
127
-
128
- ## 🌍 Supported Languages
129
-
130
- <div align="center">
131
-
132
- | Language | Code | Status |
133
- |----------|------|--------|
134
- | 🇰🇷 Korean | `ko` | ✅ Fully Supported |
135
-
136
- *And more languages coming soon...*
137
-
138
- </div>
139
-
140
- ## 🏗️ Architecture
141
-
142
- <div align="center">
143
- <a href="images/workflow.png" target="_blank">
144
- <img src="images/workflow.png" alt="KREW x Hugging Face Logo" width="400">
145
- </a>
146
- </div>
147
-
148
- <details>
149
-
150
- <summary>📊 System Design Overview</summary>
151
-
152
- **Frontend Layer**
153
- - Gradio web interface with modern styling
154
- - Real-time chat & quick controls with AI agent
155
- - Progress tracking (not yet implemented)
156
-
157
- **AI Processing Layer**
158
- - File discovery with intelligent prioritization for Transformers docs
159
- - Claude Sonnet 4 for context-aware translation
160
- - LangChain integration for PR research
161
-
162
- **Integration Layer**
163
- - GitHub API for automated PR creation to [huggingface/transformers](https://github.com/huggingface/transformers)
164
- - Branch management and commit structuring
165
- - Template matching from reference PRs
166
-
167
- </details>
168
-
169
- ## ⚙️ Configuration
170
-
171
- ### For Spaces Deployment
172
- Prepare these secrets:
173
-
174
- ```bash
175
- ANTHROPIC_API_KEY=your_claude_api_key
176
- GITHUB_TOKEN=your_github_token
177
- ```
178
-
179
- ### For Local Development
180
- Create `.env` file:
181
-
182
- ```bash
183
- ANTHROPIC_API_KEY=<your api key>
184
-
185
- # GitHub PR Agent Configuration
186
- GITHUB_TOKEN=<your github token>
187
- GITHUB_OWNER=<your github username>
188
- GITHUB_REPO=<your repository name>
189
- REFERENCE_PR_URL=<reference pr url for style analysis>
190
- ```
191
-
192
- ## 🤝 Contributing
193
-
194
- <div align="center">
195
-
196
- **Love this project? Here's how you can help:**
197
-
198
- [![Fork](https://img.shields.io/github/forks/Hugging-Face-KREW/i18n-agent?style=social)](https://github.com/Hugging-Face-KREW/i18n-agent.git)
199
- [![Star](https://img.shields.io/github/stars/Hugging-Face-KREW/i18n-agent?style=social)](https://github.com/Hugging-Face-KREW/i18n-agent.git)
200
- [![Issues](https://img.shields.io/github/issues/Hugging-Face-KREW/i18n-agent)](https://github.com/Hugging-Face-KREW/i18n-agent.git)
201
-
202
- </div>
203
-
204
- ### 👥 Contributors
205
-
206
- 🤗 [email protected] / @harheem
207
- 🤗 [email protected] / @Jwaminju
208
-
209
- ## 💡 Use Cases
210
-
211
- > **🌟 Real-world scenarios where this agent shines:**
212
-
213
- - **📚 Documentation Teams**: Batch translate Transformers documentation updates
214
- - **🌍 Community Contributors**: Help make Transformers accessible in your language
215
- - **🏢 Organizations**: Streamline i18n workflows for Transformers library
216
- - **👨‍💻 Developers**: Contribute Transformers translations without manual GitHub workflow
217
- - **🎯 Issue #20179 Contributors**: Directly address the internationalization challenges raised by the community
218
-
219
- ## 🛠️ Tech Stack
220
-
221
- <div align="center">
222
-
223
- ![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
224
- ![Gradio](https://img.shields.io/badge/gradio-FF6B35?style=for-the-badge&logo=gradio&logoColor=white)
225
- ![Anthropic](https://img.shields.io/badge/Claude-191919?style=for-the-badge&logo=anthropic&logoColor=white)
226
- ![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)
227
- ![LangChain](https://img.shields.io/badge/LangChain-121212?style=for-the-badge&logo=chainlink&logoColor=white)
228
-
229
- </div>
230
-
231
- ## ❓ FAQ
232
-
233
- <details>
234
- <summary><strong>Q: How does this relate to Issue #20179?</strong></summary>
235
- <br>
236
- This agent directly addresses the pain points raised in <a href="https://github.com/huggingface/transformers/issues/20179">Issue #20179</a> by automating the translation workflow, reducing manual overhead, and making it easier for contributors to submit high-quality translations.
237
- </details>
238
-
239
- <details>
240
- <summary><strong>Q: How accurate are the translations?</strong></summary>
241
- <br>
242
- The agent uses Claude Sonnet 4, which provides high-quality translations with technical context awareness. It preserves code blocks, maintains formatting, and follows established translation patterns.
243
- </details>
244
-
245
- <details>
246
- <summary><strong>Q: What permissions do I need for GitHub integration?</strong></summary>
247
- <br>
248
- Your GitHub token needs repository read/write permissions and the ability to create branches and pull requests on the target repository.
249
- </details>
250
-
251
- <details>
252
- <summary><strong>Q: Can I customize the translation style?</strong></summary>
253
- <br>
254
- Yes! You can provide reference PR URLs to match existing translation patterns and maintain consistency with community standards.
255
- </details>
256
-
257
- ## 🐛 Troubleshooting
258
-
259
- ### Common Issues
260
-
261
- <details>
262
- <summary><strong>API Key Issues</strong></summary>
263
-
264
- - Ensure your Anthropic API key is valid and has sufficient credits
265
- - Check that your GitHub token has the necessary repository permissions
266
-
267
- </details>
268
-
269
- <details>
270
- <summary><strong>Translation Quality</strong></summary>
271
-
272
- - The system uses Claude Sonnet 4 for high-quality translations
273
- - Formatting and markdown structure are preserved
274
- - Restart the translation if you run into a formatting issue
275
-
276
- </details>
277
-
278
- <details>
279
- <summary><strong>GitHub PR Creation</strong></summary>
280
-
281
- - Verify repository permissions and branch protection rules
282
- - Check that the reference PR URL is accessible and valid
283
-
284
- </details>
285
-
286
-
287
- ## 🙏 Acknowledgments
288
-
289
- Special thanks to the amazing communities that make this possible:
290
-
291
- - **🤗 Hugging Face** - For building the Transformers library and comprehensive documentation
292
- - **🎭 Anthropic** - For Claude's incredible language capabilities
293
- - **👥 Hugging Face KREW Community** - For championing Korean AI translation
294
- - **🎨 Gradio** - For making beautiful AI interfaces simple
295
- - **🌍 Community Contributors** - For raising awareness through [Issue #20179](https://github.com/huggingface/transformers/issues/20179)
296
-
297
- ---
298
-
299
- <div align="center">
300
-
301
- **Made with ❤️ for global accessibility of Hugging Face Transformers documentation.**
302
-
303
- **🎯 Solving [Issue #20179](https://github.com/huggingface/transformers/issues/20179) one translation at a time.**
304
-
305
- [⭐ Star this repo](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [🐛 Report Bug](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [💡 Request Feature](https://github.com/Hugging-Face-KREW/i18n-agent.git)
306
-
307
- </div>
 
1
+ ---
2
+ title: i18n Agent - Contribute in Just 5 Minutes
3
+ emoji: 🤗
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.33.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+
13
+ <div align="center">
14
+
15
+ # 🌐 [i18n-agent] Hugging Face i18n made easy
16
+
17
+ *AI-powered translation agent for Hugging Face Transformers documentation internationalization*
18
+
19
+ [![Hugging Face Spaces](https://img.shields.io/badge/🤗%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent)
20
+ [![agent-demo-track](https://img.shields.io/badge/🤖-agent--demo--track-ff6b6b)](https://github.com/topics/agent-demo-track)
21
+
22
+ ![KREW x Hugging Face Logo](images/title.png)
23
+
24
+ **Streamline Hugging Face Transformers documentation translation with Claude AI**
25
+ • **Auto-generate GitHub PRs** • **Multi-language support**
26
+
27
+ > 🎯 **Created to address [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179)** - Making documentation translation more accessible and automated for the global community.
28
+
29
+ [🚀 Try Live Demo](https://huggingface.co/spaces/Agents-MCP-Hackathon/hf-transformers-docs-i18n-agent) • [📹 Watch Demo](#-demo-video) • [📖 Documentation](#-quick-start)
30
+
31
+ </div>
32
+
33
+ ---
34
+
35
+ ## ✨ What is this?
36
+
37
+ Transform the way you contribute to Hugging Face Transformers' global community! This AI agent automatically:
38
+
39
+ - 🔍 **Discovers** missing translations in [Transformers documentation](https://huggingface.co/docs/transformers/en/index)
40
+ - 🤖 **Translates** using Claude Sonnet 4 with technical precision
41
+ - 📝 **Creates** GitHub pull requests ready for review on [huggingface/transformers](https://github.com/huggingface/transformers)
42
+ - 💬 **Guides** you through the entire process
43
+
44
+ > **Perfect for**: Contributors addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179), documentation maintainers, and international communities wanting to make transformer models accessible worldwide.
45
+
46
+ ## 🎯 Addressing the Community Need
47
+
48
+ This project was specifically created to solve [Hugging Face Transformers Issue #20179](https://github.com/huggingface/transformers/issues/20179), which highlights the need for better internationalization tooling. Our agent tackles the core challenges mentioned in the issue:
49
+
50
+ - **🚧 Translation Bottlenecks**: Automates the manual translation process
51
+ - **📊 Consistency Issues**: Maintains uniform translation quality within each language
52
+ - **⚡ Scalability Problems**: Handles batch translations efficiently
53
+ - **🤝 Contributor Barriers**: Simplifies the PR creation workflow for translators
54
+
55
+ ## 🎥 Demo Video
56
+
57
+ [Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
58
+
59
+ *Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
60
+
61
+ ## 🚀 Quick Start
62
+
63
+ ![i18n Agent demo screenshot](images/demo.png)
64
+
65
+ ### Option 1: One-Click Demo (Recommended)
66
+ [![Open in Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-md-dark.svg)](https://huggingface.co/spaces/YOUR_USERNAME/i18n-agent)
67
+
68
+ ✅ **No setup required!** Just click and start translating.
69
+ ✅ **Prerequisites:** you'll need an `Anthropic API key` and a `GitHub token`.
70
+
71
+ ### Option 2: Run Locally
72
+
73
+ <details>
74
+ <summary>🛠️ Local Installation Guide</summary>
75
+
76
+ ```bash
77
+ # Clone the repo
78
+ git clone https://github.com/Hugging-Face-KREW/i18n-agent.git
79
+ cd i18n-agent
80
+
81
+ # Install dependencies
82
+ make install
83
+ source .venv/bin/activate
84
+
85
+ # Set up your keys
86
+ cp .env.example .env
87
+ # Add your Anthropic API key and GitHub token
88
+
89
+ # Launch the app
90
+ python app.py
91
+ ```
92
+
93
+ </details>
94
+
95
+ ## 🎯 How It Works
96
+
97
+ This agent specifically targets the [Hugging Face Transformers documentation](https://huggingface.co/docs/transformers/en/index) and submits PRs addressing [Issue #20179](https://github.com/huggingface/transformers/issues/20179) in the [huggingface/transformers](https://github.com/huggingface/transformers) repository.
98
+
99
+ ```mermaid
100
+ graph LR
101
+ A[🔍 Find Files] --> B[🤖 Translate] --> C[📝 Create PR]
102
+
103
+ A --> A1[Scan transformers docs]
104
+ A --> A2[Identify translation gaps]
105
+
106
+ B --> B1[Claude AI translation]
107
+ B --> B2[Preserve formatting]
108
+
109
+ C --> C1[Auto-branch creation]
110
+ C --> C2[Submit to huggingface/transformers]
111
+ ```
112
+
113
+ ### Step 1: 🔍 Discover Translation Targets
114
+ - Select your target language (currently Korean; more languages will be supported)
115
+ - Set how many files to process
116
+ - Let AI identify the most impactful Transformers docs translations
117
+
118
+ ### Step 2: 🤖 Smart Translation
119
+ - Enter your Anthropic API key
120
+ - Claude Sonnet 4 translates with context awareness
121
+ - Technical terms and code blocks preserved automatically
122
+
123
+ ### Step 3: 📝 Automated PR Creation
124
+ - Configure GitHub credentials
125
+ - System creates properly formatted pull requests for [huggingface/transformers](https://github.com/huggingface/transformers)
126
+ - Optional: Use reference PRs for consistency
127
+
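The same three-step flow can also be driven from a script. A minimal sketch, based on how `agent/handler.py` in this commit calls the workflow helpers (the argument shapes follow those call sites; the GitHub values are placeholders you would fill in yourself):

```python
# Minimal sketch of the find -> translate -> PR flow, mirroring agent/handler.py.
# Assumes ANTHROPIC_API_KEY and GITHUB_TOKEN are already set in the environment.
from agent.workflow import (
    report_translation_target_files,
    translate_docs_interactive,
    generate_github_pr,
)

# Step 1: find files that still lack a Korean translation in the Transformers docs
status_report, files_list = report_translation_target_files("transformers", "ko", 10)
first_file = files_list[0][0]  # each entry starts with the file path

# Step 2: translate the first candidate (positional args as used in handler.py)
status, translated = translate_docs_interactive("ko", [[first_file]], "", "transformers", False)

# Step 3: open a PR from your fork against huggingface/transformers
github_config = {
    "token": "<github token>",
    "owner": "<your-username>",
    "repo_name": "transformers",
    "reference_pr_url": "",
}
en_title = first_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
print(
    generate_github_pr(
        target_language="ko",
        filepath=first_file,
        translated_content=translated,
        github_config=github_config,
        en_title=en_title,
        project="transformers",
    )
)
```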
128
+ ## 🌍 Supported Languages
129
+
130
+ <div align="center">
131
+
132
+ | Language | Code | Status |
133
+ |----------|------|--------|
134
+ | 🇰🇷 Korean | `ko` | ✅ Fully Supported |
135
+
136
+ *And more languages coming soon...*
137
+
138
+ </div>
139
+
140
+ ## 🏗️ Architecture
141
+
142
+ <div align="center">
143
+ <a href="images/workflow.png" target="_blank">
144
+ <img src="images/workflow.png" alt="KREW x Hugging Face Logo" width="400">
145
+ </a>
146
+ </div>
147
+
148
+ <details>
149
+
150
+ <summary>📊 System Design Overview</summary>
151
+
152
+ **Frontend Layer**
153
+ - Gradio web interface with modern styling
154
+ - Real-time chat & quick controls with AI agent
155
+ - Progress tracking (not yet implemented)
156
+
157
+ **AI Processing Layer**
158
+ - File discovery with intelligent prioritization for Transformers docs
159
+ - Claude Sonnet 4 for context-aware translation
160
+ - LangChain integration for PR research
161
+
162
+ **Integration Layer**
163
+ - GitHub API for automated PR creation to [huggingface/transformers](https://github.com/huggingface/transformers)
164
+ - Branch management and commit structuring
165
+ - Template matching from reference PRs
166
+
167
+ </details>
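As a rough illustration of how the frontend layer hooks into these handlers, here is a stripped-down Gradio sketch; the real `app.py` adds tabs, quick controls, and the configuration panel, so treat this as a shape rather than the actual layout. Output shapes follow `send_message` in `agent/handler.py`.

```python
# Stripped-down frontend sketch; not the actual app.py.
import gradio as gr

from agent.handler import get_welcome_message, send_message, update_status

with gr.Blocks(title="Hugging Face i18n Agent") as demo:
    status = gr.HTML(update_status())  # step / project / progress panel
    chatbot = gr.Chatbot(value=[[None, get_welcome_message()]], height=400)
    msg = gr.Textbox(placeholder="Type 'help', 'find files', 'translate' ...")

    # send_message returns (history, cleared_input, status_html)
    msg.submit(send_message, inputs=[msg, chatbot], outputs=[chatbot, msg, status])

if __name__ == "__main__":
    demo.launch()
```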
168
+
169
+ ## ⚙️ Configuration
170
+
171
+ ### For Spaces Deployment
172
+ Prepare these secrets:
173
+
174
+ ```bash
175
+ ANTHROPIC_API_KEY=your_claude_api_key
176
+ GITHUB_TOKEN=your_github_token
177
+ ```
178
+
179
+ ### For Local Development
180
+ Create `.env` file:
181
+
182
+ ```bash
183
+ ANTHROPIC_API_KEY=<your api key>
184
+
185
+ # GitHub PR Agent Configuration
186
+ GITHUB_TOKEN=<your github token>
187
+ GITHUB_OWNER=<your github username>
188
+ GITHUB_REPO=<your repository name>
189
+ REFERENCE_PR_URL=<reference pr url for style analysis>
190
+ ```
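A minimal sketch of reading these values at runtime, assuming the `python-dotenv` package (the app itself may load its configuration differently):

```python
# Load the .env values above into the process environment.
# Assumes `pip install python-dotenv`.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

anthropic_key = os.environ["ANTHROPIC_API_KEY"]
github_token = os.environ["GITHUB_TOKEN"]
github_owner = os.getenv("GITHUB_OWNER", "")
github_repo = os.getenv("GITHUB_REPO", "")
reference_pr_url = os.getenv("REFERENCE_PR_URL", "")

print(f"Configured fork: {github_owner}/{github_repo}")
```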
191
+
192
+ ## 🤝 Contributing
193
+
194
+ <div align="center">
195
+
196
+ **Love this project? Here's how you can help:**
197
+
198
+ [![Fork](https://img.shields.io/github/forks/Hugging-Face-KREW/i18n-agent?style=social)](https://github.com/Hugging-Face-KREW/i18n-agent.git)
199
+ [![Star](https://img.shields.io/github/stars/Hugging-Face-KREW/i18n-agent?style=social)](https://github.com/Hugging-Face-KREW/i18n-agent.git)
200
+ [![Issues](https://img.shields.io/github/issues/Hugging-Face-KREW/i18n-agent)](https://github.com/Hugging-Face-KREW/i18n-agent.git)
201
+
202
+ </div>
203
+
204
+ ### 👥 Contributors
205
+
206
+ 🤗 [email protected] / @harheem
207
+ 🤗 [email protected] / @Jwaminju
208
+
209
+ ## 💡 Use Cases
210
+
211
+ > **🌟 Real-world scenarios where this agent shines:**
212
+
213
+ - **📚 Documentation Teams**: Batch translate Transformers documentation updates
214
+ - **🌍 Community Contributors**: Help make Transformers accessible in your language
215
+ - **🏢 Organizations**: Streamline i18n workflows for Transformers library
216
+ - **👨‍💻 Developers**: Contribute Transformers translations without manual GitHub workflow
217
+ - **🎯 Issue #20179 Contributors**: Directly address the internationalization challenges raised by the community
218
+
219
+ ## 🛠️ Tech Stack
220
+
221
+ <div align="center">
222
+
223
+ ![Python](https://img.shields.io/badge/python-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54)
224
+ ![Gradio](https://img.shields.io/badge/gradio-FF6B35?style=for-the-badge&logo=gradio&logoColor=white)
225
+ ![Anthropic](https://img.shields.io/badge/Claude-191919?style=for-the-badge&logo=anthropic&logoColor=white)
226
+ ![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white)
227
+ ![LangChain](https://img.shields.io/badge/LangChain-121212?style=for-the-badge&logo=chainlink&logoColor=white)
228
+
229
+ </div>
230
+
231
+ ## ❓ FAQ
232
+
233
+ <details>
234
+ <summary><strong>Q: How does this relate to Issue #20179?</strong></summary>
235
+ <br>
236
+ This agent directly addresses the pain points raised in <a href="https://github.com/huggingface/transformers/issues/20179">Issue #20179</a> by automating the translation workflow, reducing manual overhead, and making it easier for contributors to submit high-quality translations.
237
+ </details>
238
+
239
+ <details>
240
+ <summary><strong>Q: How accurate are the translations?</strong></summary>
241
+ <br>
242
+ The agent uses Claude Sonnet 4, which provides high-quality translations with technical context awareness. It preserves code blocks, maintains formatting, and follows established translation patterns.
243
+ </details>
244
+
245
+ <details>
246
+ <summary><strong>Q: What permissions do I need for GitHub integration?</strong></summary>
247
+ <br>
248
+ Your GitHub token needs repository read/write permissions and the ability to create branches and pull requests on the target repository.
249
+ </details>
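If you want to sanity-check a classic personal access token before running the agent, a small sketch using the public GitHub API is shown below. Classic tokens report their granted scopes in the `X-OAuth-Scopes` response header; fine-grained tokens do not, so treat this as a rough check only. It assumes the `requests` package.

```python
# Rough check that a classic GitHub token is valid and broad enough.
import os

import requests

token = os.environ["GITHUB_TOKEN"]
resp = requests.get(
    "https://api.github.com/user",
    headers={"Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json"},
    timeout=10,
)
resp.raise_for_status()

scopes = {s.strip() for s in resp.headers.get("X-OAuth-Scopes", "").split(",") if s.strip()}
print("Authenticated as:", resp.json()["login"])
print("Reported scopes:", scopes or "(none reported)")
if not scopes & {"repo", "public_repo"}:
    print("Warning: token may not be able to push branches or open pull requests.")
```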
250
+
251
+ <details>
252
+ <summary><strong>Q: Can I customize the translation style?</strong></summary>
253
+ <br>
254
+ Yes! You can provide reference PR URLs to match existing translation patterns and maintain consistency with community standards.
255
+ </details>
256
+
257
+ ## 🐛 Troubleshooting
258
+
259
+ ### Common Issues
260
+
261
+ <details>
262
+ <summary><strong>API Key Issues</strong></summary>
263
+
264
+ - Ensure your Anthropic API key is valid and has sufficient credits
265
+ - Check that your GitHub token has the necessary repository permissions
266
+
267
+ </details>
268
+
269
+ <details>
270
+ <summary><strong>Translation Quality</strong></summary>
271
+
272
+ - The system uses Claude Sonnet 4 for high-quality translations
273
+ - Formatting and markdown structure are preserved
274
+ - Restart the translation if you run into a formatting issue
275
+
276
+ </details>
277
+
278
+ <details>
279
+ <summary><strong>GitHub PR Creation</strong></summary>
280
+
281
+ - Verify repository permissions and branch protection rules
282
+ - Check that the reference PR URL is accessible and valid
283
+
284
+ </details>
285
+
286
+
287
+ ## 🙏 Acknowledgments
288
+
289
+ Special thanks to the amazing communities that make this possible:
290
+
291
+ - **🤗 Hugging Face** - For building the Transformers library and comprehensive documentation
292
+ - **🎭 Anthropic** - For Claude's incredible language capabilities
293
+ - **👥 Hugging Face KREW Community** - For championing Korean AI translation
294
+ - **🎨 Gradio** - For making beautiful AI interfaces simple
295
+ - **🌍 Community Contributors** - For raising awareness through [Issue #20179](https://github.com/huggingface/transformers/issues/20179)
296
+
297
+ ---
298
+
299
+ <div align="center">
300
+
301
+ **Made with ❤️ for global accessibility of Hugging Face Transformers documentation.**
302
+
303
+ **🎯 Solving [Issue #20179](https://github.com/huggingface/transformers/issues/20179) one translation at a time.**
304
+
305
+ [⭐ Star this repo](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [🐛 Report Bug](https://github.com/Hugging-Face-KREW/i18n-agent.git) • [💡 Request Feature](https://github.com/Hugging-Face-KREW/i18n-agent.git)
306
+
307
+ </div>
agent/handler.py CHANGED
@@ -1,639 +1,639 @@
1
- """Module for gradio chat-based translation agent interface."""
2
-
3
- import os
4
- import re
5
- from pathlib import Path
6
-
7
- import gradio as gr
8
-
9
- from agent.workflow import (
10
- report_translation_target_files,
11
- translate_docs_interactive,
12
- generate_github_pr,
13
- )
14
- from pr_generator.searcher import find_reference_pr_simple_stream
15
- from translator.content import get_full_prompt, get_content, preprocess_content
16
- from translator.project_config import get_available_projects, get_project_config
17
-
18
-
19
- # State management
20
- class ChatState:
21
- def __init__(self):
22
- self.step = "welcome" # welcome -> find_files -> translate -> create_github_pr
23
-
24
- # Transient state (reset on restart)
25
- self.selected_project = "transformers"
26
- self.target_language = "ko"
27
- self.k_files = 10
28
- self.files_to_translate = []
29
- self.additional_instruction = ""
30
- self.current_file_content = {"translated": ""}
31
- self.pr_result = None
32
-
33
- # Persistent settings (preserved across restarts)
34
- self.persistent_settings = {
35
- "anthropic_api_key": "",
36
- "aws_bearer_token_bedrock": "",
37
- "github_config": {
38
- "token": "",
39
- "owner": "",
40
- "repo_name": "",
41
- "reference_pr_url": "",
42
- }
43
- }
44
-
45
- def reset_transient_state(self):
46
- """Reset only the workflow state, keep persistent settings"""
47
- self.step = "welcome"
48
- self.selected_project = "transformers"
49
- self.target_language = "ko"
50
- self.k_files = 10
51
- self.files_to_translate = []
52
- self.additional_instruction = ""
53
- self.current_file_content = {"translated": ""}
54
- self.pr_result = None
55
-
56
- @property
57
- def github_config(self):
58
- return self.persistent_settings["github_config"]
59
-
60
-
61
- state = ChatState()
62
-
63
-
64
- def _extract_content_for_display(content: str) -> str:
65
- """Extract text from document for display."""
66
- # Remove Copyright header
67
- to_translate = re.sub(r"<!--.*?-->", "", content, count=1, flags=re.DOTALL)
68
- to_translate = to_translate.strip()
69
- ## remove code blocks from text
70
- to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
71
- ## remove markdown tables from text
72
- to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
73
- ## remove empty lines from text
74
- to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
75
-
76
- return to_translate
77
-
78
-
79
- def get_welcome_message():
80
- """Initial welcome message with project selection"""
81
- return """**👋 Welcome to 🌐 Hugging Face i18n Translation Agent!**
82
-
83
- I'll help you find files that need translation and translate them in a streamlined workflow.
84
-
85
- **🎯 First, select which project you want to translate:**
86
-
87
- Use the **`Quick Controls`** on the right to select a project, or **ask me `what`, `how`, or `help`** to get started.
88
- """
89
-
90
-
91
- def process_file_search_handler(project: str, lang: str, k: int, history: list) -> tuple:
92
- """Process file search request and update Gradio UI components."""
93
- global state
94
- state.selected_project = project
95
- state.target_language = lang
96
- state.k_files = k
97
- state.step = "find_files"
98
-
99
- try:
100
- status_report, files_list = report_translation_target_files(project, lang, k)
101
- except Exception as e:
102
- if "rate limit" in str(e).lower():
103
- response = f"""❌ **GitHub API Rate Limit Exceeded**
104
-
105
- {str(e)}
106
-
107
- **💡 To fix this:**
108
- 1. Set GitHub Token in Configuration panel above
109
- 2. Click "💾 Save Configuration"
110
- 3. Try "Find Files" again"""
111
- history.append(["File search request", response])
112
- return history, "", update_status(), gr.Tabs(selected=0), gr.update(choices=[]), gr.update(visible=False)
113
- else:
114
- raise # Re-raise non-rate-limit errors
115
- state.files_to_translate = (
116
- [file[0] for file in files_list]
117
- if files_list
118
- else []
119
- )
120
-
121
- response = f"""**✅ File search completed!**
122
-
123
- **Status Report:**
124
- {status_report}
125
-
126
- **📁 Found first {len(state.files_to_translate)} files to translate:**
127
- """
128
-
129
- if state.files_to_translate:
130
- config = get_project_config(state.selected_project)
131
- for i, file in enumerate(state.files_to_translate, 1):
132
- file_link = f"{config.repo_url}/blob/main/{file}"
133
- response += f"\n{i}. [`{file}`]({file_link})"
134
-
135
- # if len(state.files_to_translate) > 5:
136
- # response += f"\n... and {len(state.files_to_translate) - 5} more files"
137
-
138
- response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
139
- else:
140
- response += "\nNo files found that need translation."
141
-
142
- # Add to history
143
- history.append(["Please find files that need translation", response])
144
- cleared_input = ""
145
-
146
- # Also return the list of files to use as dropdown choices
147
- return (
148
- history,
149
- cleared_input,
150
- update_status(),
151
- gr.Tabs(), # Don't change tab
152
- update_dropdown_choices(state.files_to_translate),
153
- )
154
-
155
-
156
- def update_dropdown_choices(file_list):
157
- return gr.update(choices=file_list, value=None)
158
-
159
-
160
- def confirm_and_go_translate_handler(history):
161
- """Confirm selection and go to translate tab"""
162
- global state
163
-
164
- response = f"✅ **Selection confirmed!**\n\n🎯 **Project:** {state.selected_project}\n🌍 **Language:** {state.target_language}\n\n**➡️ Go to Tab 2 to start translation.**"
165
- history.append(["Confirm selection", response])
166
- return history, "", update_status(), gr.Tabs(selected=1)
167
-
168
-
169
- def confirm_translation_and_go_upload_handler(history):
170
- """Confirm translation and go to upload PR tab"""
171
- global state
172
-
173
- if not state.current_file_content.get("translated"):
174
- response = "❌ No translation available. Please complete translation first."
175
- history.append(["Upload PR request", response])
176
- return history, "", update_status(), gr.Tabs()
177
-
178
- response = f"✅ **Translation confirmed!**\n\n📄 **File:** `{state.files_to_translate[0] if state.files_to_translate else 'Unknown'}`\n\n**➡️ Go to Tab 3 to upload PR.**"
179
- history.append(["Upload PR request", response])
180
- return history, "", update_status(), gr.Tabs(selected=2)
181
-
182
-
183
- def start_translation_process(force_retranslate=False):
184
- """Start the translation process for the first file"""
185
- if not state.files_to_translate:
186
- return "❌ No files available for translation.", ""
187
-
188
- current_file = state.files_to_translate[0]
189
-
190
- # Call translation function (simplified for demo)
191
- try:
192
- status, translated = translate_docs_interactive(
193
- state.target_language, [[current_file]], state.additional_instruction, state.selected_project, force_retranslate
194
- )
195
-
196
- state.current_file_content = {"translated": translated}
197
- path = (
198
- Path(__file__).resolve().parent.parent
199
- / f"translation_result/{current_file}"
200
- )
201
- p = Path(path)
202
- p.parent.mkdir(parents=True, exist_ok=True)
203
- p.write_text(translated, encoding="utf-8")
204
-
205
- config = get_project_config(state.selected_project)
206
- original_file_link = f"{config.repo_url}/blob/main/{current_file}"
207
- print("Compeleted translation:\n")
208
- print(translated)
209
- print("----------------------------")
210
-
211
- # Different response format for existing vs new translation
212
- if isinstance(status, str) and "Existing translation loaded" in status:
213
- response = f"{status}\n**📄 Original Content Link:** {original_file_link}\n\n**🌐 Translated Content:**"
214
- else:
215
- response = (
216
- f"""🔄 Translation for: `{current_file}`\n"""
217
- f"**📄 Original Content Link:** {original_file_link}\n\n"
218
- f"{status}\n\n"
219
- "**🌐 Translated Content:**"
220
- )
221
- return response, translated
222
-
223
-
224
- except Exception as e:
225
- response = f"❌ Translation failed: {str(e)}"
226
- response += "\n**➡️ Please try from the beginning.**"
227
- return response, ""
228
-
229
-
230
- def handle_general_message(message):
231
- """Handle general messages"""
232
- message_lower = message.lower()
233
-
234
- if any(word in message_lower for word in ["help", "what", "how"]):
235
- return """**🤖 I'm your Hugging Face i18n Translation Agent!**
236
-
237
- I can help you:
238
- 1. **🔍 Find files** that need translation
239
- 2. **🌐 Translate documents** using AI
240
- 3. **📋 Review translations** for quality
241
- 4. **🚀 Create GitHub PR** for translation
242
-
243
- Currently available actions with quick controls:
244
- - "find files" - Search for files needing translation
245
- - "translate" - Start translation process
246
- - "review" - Review current translation
247
- - "github" - Create GitHub Pull Request
248
- - "restart" - Start over"""
249
-
250
- elif "restart" in message_lower:
251
- global state
252
- state = ChatState()
253
- return get_welcome_message()
254
-
255
- else:
256
- return """I understand you want to work on translations!
257
-
258
- **Two ways to get started:**
259
-
260
- 1. **🔍 Find Files first** - Use Tab 1 to discover files that need translation
261
- 2. **🚀 Direct Translation** - Go to Tab 2 and enter a file path directly (e.g., `docs/source/en/model_doc/bert.md`)
262
-
263
- Make sure to configure your API keys in the Configuration panel above.
264
- """
265
-
266
-
267
- # Main handler
268
- def handle_user_message(message, history):
269
- """Handle user messages and provide appropriate responses"""
270
- global state
271
-
272
- if not message.strip():
273
- return history, ""
274
-
275
- elif state.step == "find_files" and any(
276
- word in message.lower()
277
- for word in ["yes", "proceed", "start", "translate", "translation"]
278
- ):
279
- # User wants to start translation
280
- if state.files_to_translate:
281
- state.step = "translate"
282
- response, translated = start_translation_process()
283
- history.append([message, response])
284
- history.append(["", translated])
285
- return history, ""
286
- else:
287
- response = (
288
- "❌ No files available for translation. Please search for files first."
289
- )
290
- # Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
291
- else:
292
- # General response
293
- response = handle_general_message(message)
294
-
295
- history.append([message, response])
296
- return history, ""
297
-
298
-
299
- def update_status():
300
- if state.step == "welcome":
301
- return f"""
302
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
303
- <div><strong>🔄 Step:</strong> Welcome</div>
304
- <div><strong>🎯 Project:</strong> {state.selected_project}</div>
305
- <div><strong>📁 Files:</strong> 0</div>
306
- <div><strong>🌍 Language:</strong> {state.target_language}</div>
307
- </div>
308
- """
309
-
310
- step_map = {
311
- "welcome": "Welcome",
312
- "find_files": "Finding Files",
313
- "translate": "Translating",
314
- "review": "Reviewing",
315
- "create_github_pr": "Creating PR",
316
- }
317
-
318
- progress_map = {
319
- "welcome": "Ready to start",
320
- "find_files": "Files found",
321
- "translate": f"{len(state.files_to_translate)} remaining",
322
- "review": "Review complete",
323
- "create_github_pr": "PR generation in progress",
324
- }
325
-
326
- # Check GitHub configuration status
327
- github_status = "❌ Not configured"
328
- if all(
329
- [
330
- state.github_config["token"],
331
- state.github_config["owner"],
332
- state.github_config["repo_name"],
333
- ]
334
- ):
335
- github_status = (
336
- f"✅ {state.github_config['owner']}/{state.github_config['repo_name']}"
337
- )
338
-
339
- status_html = f"""
340
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
341
- <div><strong>🔄 Step:</strong> {step_map.get(state.step, state.step)}</div>
342
- <div><strong>🎯 Project:</strong> {state.selected_project}</div>
343
- <div><strong>📁 Files:</strong> {len(state.files_to_translate)}</div>
344
- <div><strong>🌍 Language:</strong> {state.target_language}</div>
345
- <div><strong>⏳ Progress:</strong> {progress_map.get(state.step, 'In progress')}</div>
346
- <div><strong>🔧 GitHub:</strong> {github_status}</div>
347
- </div>
348
- """
349
-
350
- return status_html
351
-
352
-
353
- # Event handlers
354
-
355
-
356
- def sync_language_displays(lang):
357
- return lang
358
-
359
-
360
- def update_project_selection(project, history):
361
- """Update state when project is selected"""
362
- global state
363
- state.selected_project = project
364
- response = f"Selection confirmed: 🎯 Project → **{project}**"
365
- history.append(["Project selection", response])
366
- return history, "", update_status()
367
-
368
-
369
- def update_language_selection(lang, history):
370
- """Update state when language is selected"""
371
- global state
372
- state.target_language = lang
373
- response = f"Selection confirmed: 🌍 Language → **{lang}**"
374
- history.append(["Language selection", response])
375
- return history, "", update_status(), lang
376
-
377
-
378
- def update_persistent_config(api_provider, anthropic_key, aws_bearer_token_bedrock, github_token, github_owner, github_repo, reference_pr_url, history):
379
- """Update persistent configuration settings."""
380
- global state
381
-
382
- # Update API keys based on provider selection
383
- if api_provider == "Anthropic":
384
- state.persistent_settings["anthropic_api_key"] = anthropic_key
385
- os.environ["ANTHROPIC_API_KEY"] = anthropic_key
386
- # Clear AWS Bedrock token if Anthropic is selected
387
- state.persistent_settings["aws_bearer_token_bedrock"] = ""
388
- os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
389
- elif api_provider == "AWS Bedrock":
390
- state.persistent_settings["aws_bearer_token_bedrock"] = aws_bearer_token_bedrock
391
- os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
392
- # Clear Anthropic key if AWS Bedrock is selected
393
- state.persistent_settings["anthropic_api_key"] = ""
394
- os.environ.pop("ANTHROPIC_API_KEY", None)
395
- else:
396
- # If no provider is selected or unknown, clear both
397
- state.persistent_settings["anthropic_api_key"] = ""
398
- os.environ.pop("ANTHROPIC_API_KEY", None)
399
- state.persistent_settings["aws_bearer_token_bedrock"] = ""
400
- os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
401
-
402
- if github_token:
403
- os.environ["GITHUB_TOKEN"] = github_token
404
-
405
- # Get default reference PR URL from project config if not provided
406
- if not reference_pr_url and state.selected_project:
407
- try:
408
- config = get_project_config(state.selected_project)
409
- reference_pr_url = config.reference_pr_url
410
- except:
411
- pass
412
-
413
- # Save GitHub configuration to persistent settings
414
- state.persistent_settings["github_config"].update({
415
- "token": github_token or "",
416
- "owner": github_owner or "",
417
- "repo_name": github_repo or "",
418
- "reference_pr_url": reference_pr_url or "",
419
- })
420
-
421
- # Build response message based on what was configured
422
- response = "✅ Configuration saved!"
423
- if github_owner and github_repo:
424
- response += f" GitHub: {github_owner}/{github_repo}"
425
-
426
- if api_provider == "Anthropic" and anthropic_key:
427
- response += " Anthropic API key updated."
428
- elif api_provider == "AWS Bedrock" and aws_bearer_token_bedrock:
429
- response += " AWS Bedrock Bearer Token updated."
430
-
431
- history.append(["Configuration update", response])
432
- return history, "", update_status()
433
-
434
-
435
- def update_github_config(token, owner, repo, reference_pr_url):
436
- """Legacy function for backward compatibility."""
437
- return update_persistent_config("", "", "", token, owner, repo, reference_pr_url, [])  # map legacy args onto the new signature
438
-
439
-
440
- def update_prompt_preview(language, file_path, additional_instruction):
441
- """Update prompt preview based on current settings"""
442
- if not file_path.strip():
443
- return "Select a file to see the prompt preview..."
444
-
445
- try:
446
- # Get language name
447
- if language == "ko":
448
- translation_lang = "Korean"
449
- else:
450
- translation_lang = language
451
-
452
- # Get sample content (first 500 characters)
453
- content = get_content(file_path, state.selected_project)
454
- to_translate = preprocess_content(content)
455
-
456
- # Truncate for preview
457
- sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")
458
-
459
- # Generate prompt
460
- prompt = get_full_prompt(translation_lang, sample_content, additional_instruction)
461
-
462
- return prompt
463
- except Exception as e:
464
- error_str = str(e)
465
- if "Failed to retrieve content from the URL" in error_str:
466
- return f"❌ **File not found:** `{file_path}`\n\n💡 **Please check:**\n1. Is this file in the **{state.selected_project}** project?\n2. Use \"🔍 Find Files to Translate\" to see available files\n3. Verify the file path is correct"
467
- return f"Error generating prompt preview: {error_str}"
468
-
469
-
470
- def send_message(message, history):
471
- new_history, cleared_input = handle_user_message(message, history)
472
- return new_history, cleared_input, update_status()
473
-
474
-
475
- # Button handlers with tab switching
476
- def start_translate_handler(history, file_to_translate, additional_instruction="", force_retranslate=False):
477
- # Use persistent anthropic key
478
- anthropic_key = state.persistent_settings["anthropic_api_key"]
479
- aws_bearer_token_bedrock = state.persistent_settings["aws_bearer_token_bedrock"]
480
-
481
- if not anthropic_key and not aws_bearer_token_bedrock:
482
- response = "❌ Please set either Anthropic API key or AWS Bearer Token for Bedrock in Configuration panel first."
483
- history.append(["Translation request", response])
484
- return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
485
-
486
- # Set the active API key to environment variable for translator.content.py
487
- if anthropic_key:
488
- os.environ["ANTHROPIC_API_KEY"] = anthropic_key
489
- os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None) # Ensure only one is active
490
- elif aws_bearer_token_bedrock:
491
- os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
492
- os.environ.pop("ANTHROPIC_API_KEY", None) # Ensure only one is active
493
-
494
- # Check if file path is provided
495
- if not file_to_translate or not file_to_translate.strip():
496
- response = "❌ Please select a file from the dropdown or enter a file path to translate."
497
- history.append(["Translation request", response])
498
- return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
499
-
500
- state.additional_instruction = additional_instruction
501
- state.files_to_translate = [file_to_translate]
502
- state.step = "translate"
503
-
504
- # Start translation directly
505
- if force_retranslate:
506
- history.append(["Translation request", "🔄 **Force retranslation started...**"])
507
- response, translated = start_translation_process(force_retranslate)
508
- history.append(["", response])
509
- if translated:
510
- history.append(["", translated])
511
-
512
- # Update button text and show confirm button after translation
513
- start_btn_text = "🔄 Retranslation" if state.current_file_content["translated"] else "🚀 Start Translation"
514
- confirm_btn_visible = bool(state.current_file_content["translated"])
515
-
516
- return history, "", update_status(), gr.Tabs(), gr.update(value=start_btn_text), gr.update(visible=confirm_btn_visible)
517
-
518
-
519
- def approve_handler(history, owner, repo, reference_pr_url):
520
- """Handles the request to generate a GitHub PR."""
521
- global state
522
- state.step = "create_github_pr"
523
-
524
- # Check all required GitHub configuration at once
525
- github_config = state.persistent_settings["github_config"]
526
- missing_config = []
527
-
528
- if not github_config.get("token"):
529
- missing_config.append("GitHub Token")
530
- if not owner:
531
- missing_config.append("GitHub Owner")
532
- if not repo:
533
- missing_config.append("Repository Name")
534
-
535
- if missing_config:
536
- config = get_project_config(state.selected_project)
537
- repo_name = config.repo_url.split('/')[-1] # Extract repo name from URL
538
- response = f"❌ Please set the following in Configuration panel first: {', '.join(missing_config)}\n\n💡 **Note:** GitHub Owner/Repository should be your fork of [`{repo_name}`]({config.repo_url}) (e.g., Owner: `your-username`, Repository: `{repo_name}`)"
539
- history.append(["GitHub PR creation request", response])
540
- return history, "", update_status()
541
-
542
- # Update reference PR URL (can be set per PR)
543
- if reference_pr_url:
544
- state.persistent_settings["github_config"]["reference_pr_url"] = reference_pr_url
545
-
546
- # Use persistent settings
547
- github_config = state.persistent_settings["github_config"]
548
-
549
- # Initialize response variable
550
- response = ""
551
-
552
- # If reference PR is not provided, use the agent to find one
553
- if not github_config.get("reference_pr_url"):
554
- response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
555
- try:
556
- # This part is simplified to avoid streaming logic in a non-generator function
557
- stream_gen = find_reference_pr_simple_stream(
558
- target_language=state.target_language,
559
- context="documentation translation",
560
- )
561
- # We will just get the final result from the generator
562
- final_result = None
563
- try:
564
- while True:
565
- # We are not interested in the streamed messages here, just the final result.
566
- next(stream_gen)
567
- except StopIteration as e:
568
- final_result = e.value
569
-
570
- if final_result and final_result.get("status") == "success":
571
- result_text = final_result.get("result", "")
572
- match = re.search(r"https://github.com/[^\s]+", result_text)
573
- if match:
574
- found_url = match.group(0)
575
- state.github_config["reference_pr_url"] = found_url
576
- response += f"\n✅ **Agent found a reference PR:** {found_url}"
577
- else:
578
- raise ValueError(
579
- "Could not extract a valid PR URL from agent's response."
580
- )
581
- else:
582
- error_message = final_result.get("message") or final_result.get(
583
- "result", "Unknown error"
584
- )
585
- raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")
586
- except Exception as e:
587
- response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
588
- history.append(["Agent searching for PR", response])
589
- return history, "", update_status()
590
-
591
- # Proceed with PR generation
592
- if state.files_to_translate and state.current_file_content.get("translated"):
593
- current_file = state.files_to_translate[0]
594
- translated_content = state.current_file_content["translated"]
595
- response += "\n\n🚀 **Generating GitHub PR...**"
596
-
597
- # Extract title from file for toctree mapping
598
- file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
599
- print(file_name)
600
-
601
- pr_response = generate_github_pr(
602
- target_language=state.target_language,
603
- filepath=current_file,
604
- translated_content=translated_content,
605
- github_config=state.github_config,
606
- en_title=file_name,
607
- project=state.selected_project,
608
- )
609
- response += f"\n{pr_response}"
610
- else:
611
- response = "❌ No translated file available. Please complete the translation process first."
612
-
613
- history.append(["GitHub PR creation request", response])
614
- return history, "", update_status()
615
-
616
-
617
- def restart_handler(history):
618
- """Resets the workflow state but preserves persistent settings."""
619
- global state
620
- # Backup persistent settings
621
- backup_settings = state.persistent_settings.copy()
622
-
623
- # Reset state
624
- state = ChatState()
625
-
626
- # Restore persistent settings
627
- state.persistent_settings = backup_settings
628
-
629
- # Restore environment variables
630
- if backup_settings["anthropic_api_key"]:
631
- os.environ["ANTHROPIC_API_KEY"] = backup_settings["anthropic_api_key"]
632
- if backup_settings["aws_bearer_token_bedrock"]:
633
- os.environ["AWS_BEARER_TOKEN_BEDROCK"] = backup_settings["aws_bearer_token_bedrock"]
634
- if backup_settings["github_config"]["token"]:
635
- os.environ["GITHUB_TOKEN"] = backup_settings["github_config"]["token"]
636
-
637
- welcome_msg = get_welcome_message()
638
- new_hist = [[None, welcome_msg]]
639
- return new_hist, "", update_status(), gr.Tabs(selected=0)
 
1
+ """Module for gradio chat-based translation agent interface."""
2
+
3
+ import os
4
+ import re
5
+ from pathlib import Path
6
+
7
+ import gradio as gr
8
+
9
+ from agent.workflow import (
10
+ report_translation_target_files,
11
+ translate_docs_interactive,
12
+ generate_github_pr,
13
+ )
14
+ from pr_generator.searcher import find_reference_pr_simple_stream
15
+ from translator.content import get_full_prompt, get_content, preprocess_content
16
+ from translator.project_config import get_available_projects, get_project_config
17
+
18
+
19
+ # State management
20
+ class ChatState:
21
+ def __init__(self):
22
+ self.step = "welcome" # welcome -> find_files -> translate -> create_github_pr
23
+
24
+ # Transient state (reset on restart)
25
+ self.selected_project = "transformers"
26
+ self.target_language = "ko"
27
+ self.k_files = 10
28
+ self.files_to_translate = []
29
+ self.additional_instruction = ""
30
+ self.current_file_content = {"translated": ""}
31
+ self.pr_result = None
32
+
33
+ # Persistent settings (preserved across restarts)
34
+ self.persistent_settings = {
35
+ "anthropic_api_key": "",
36
+ "aws_bearer_token_bedrock": "",
37
+ "github_config": {
38
+ "token": "",
39
+ "owner": "",
40
+ "repo_name": "",
41
+ "reference_pr_url": "",
42
+ }
43
+ }
44
+
45
+ def reset_transient_state(self):
46
+ """Reset only the workflow state, keep persistent settings"""
47
+ self.step = "welcome"
48
+ self.selected_project = "transformers"
49
+ self.target_language = "ko"
50
+ self.k_files = 10
51
+ self.files_to_translate = []
52
+ self.additional_instruction = ""
53
+ self.current_file_content = {"translated": ""}
54
+ self.pr_result = None
55
+
56
+ @property
57
+ def github_config(self):
58
+ return self.persistent_settings["github_config"]
59
+
60
+
61
+ state = ChatState()
62
+
63
+
64
+ def _extract_content_for_display(content: str) -> str:
65
+ """Extract text from document for display."""
66
+ # Remove Copyright header
67
+ to_translate = re.sub(r"<!--.*?-->", "", content, count=1, flags=re.DOTALL)
68
+ to_translate = to_translate.strip()
69
+ ## remove code blocks from text
70
+ to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
71
+ ## remove markdown tables from text
72
+ to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
73
+ ## remove empty lines from text
74
+ to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
75
+
76
+ return to_translate
77
+
78
+
79
+ def get_welcome_message():
80
+ """Initial welcome message with project selection"""
81
+ return """**👋 Welcome to 🌐 Hugging Face i18n Translation Agent!**
82
+
83
+ I'll help you find files that need translation and translate them in a streamlined workflow.
84
+
85
+ **🎯 First, select which project you want to translate:**
86
+
87
+ Use the **`Quick Controls`** on the right to select a project, or **ask me `what`, `how`, or `help`** to get started.
88
+ """
89
+
90
+
91
+ def process_file_search_handler(project: str, lang: str, k: int, history: list) -> tuple:
92
+ """Process file search request and update Gradio UI components."""
93
+ global state
94
+ state.selected_project = project
95
+ state.target_language = lang
96
+ state.k_files = k
97
+ state.step = "find_files"
98
+
99
+ try:
100
+ status_report, files_list = report_translation_target_files(project, lang, k)
101
+ except Exception as e:
102
+ if "rate limit" in str(e).lower():
103
+ response = f"""❌ **GitHub API Rate Limit Exceeded**
104
+
105
+ {str(e)}
106
+
107
+ **💡 To fix this:**
108
+ 1. Set GitHub Token in Configuration panel above
109
+ 2. Click "💾 Save Configuration"
110
+ 3. Try "Find Files" again"""
111
+ history.append(["File search request", response])
112
+ return history, "", update_status(), gr.Tabs(selected=0), gr.update(choices=[]), gr.update(visible=False)
113
+ else:
114
+ raise # Re-raise non-rate-limit errors
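+ # report_translation_target_files returns rows shaped like [filepath]; unwrap them to plain path strings.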
115
+ state.files_to_translate = (
116
+ [file[0] for file in files_list]
117
+ if files_list
118
+ else []
119
+ )
120
+
121
+ response = f"""**✅ File search completed!**
122
+
123
+ **Status Report:**
124
+ {status_report}
125
+
126
+ **📁 Found the first {len(state.files_to_translate)} files to translate:**
127
+ """
128
+
129
+ if state.files_to_translate:
130
+ config = get_project_config(state.selected_project)
131
+ for i, file in enumerate(state.files_to_translate, 1):
132
+ file_link = f"{config.repo_url}/blob/main/{file}"
133
+ response += f"\n{i}. [`{file}`]({file_link})"
134
+
135
+ # if len(state.files_to_translate) > 5:
136
+ # response += f"\n... and {len(state.files_to_translate) - 5} more files"
137
+
138
+ response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
139
+ else:
140
+ response += "\nNo files found that need translation."
141
+
142
+ # Add to history
143
+ history.append(["Please find files that need translation", response])
144
+ cleared_input = ""
145
+
146
+ # Also return the file list to use as dropdown choices
147
+ return (
148
+ history,
149
+ cleared_input,
150
+ update_status(),
151
+ gr.Tabs(), # Don't change tab
152
+ update_dropdown_choices(state.files_to_translate),
153
+ )
154
+
155
+
156
+ def update_dropdown_choices(file_list):
157
+ return gr.update(choices=file_list, value=None)
158
+
159
+
160
+ def confirm_and_go_translate_handler(history):
161
+ """Confirm selection and go to translate tab"""
162
+ global state
163
+
164
+ response = f"✅ **Selection confirmed!**\n\n🎯 **Project:** {state.selected_project}\n🌍 **Language:** {state.target_language}\n\n**➡️ Go to Tab 2 to start translation.**"
165
+ history.append(["Confirm selection", response])
166
+ return history, "", update_status(), gr.Tabs(selected=1)
167
+
168
+
169
+ def confirm_translation_and_go_upload_handler(history):
170
+ """Confirm translation and go to upload PR tab"""
171
+ global state
172
+
173
+ if not state.current_file_content.get("translated"):
174
+ response = "❌ No translation available. Please complete translation first."
175
+ history.append(["Upload PR request", response])
176
+ return history, "", update_status(), gr.Tabs()
177
+
178
+ response = f"✅ **Translation confirmed!**\n\n📄 **File:** `{state.files_to_translate[0] if state.files_to_translate else 'Unknown'}`\n\n**➡️ Go to Tab 3 to upload PR.**"
179
+ history.append(["Upload PR request", response])
180
+ return history, "", update_status(), gr.Tabs(selected=2)
181
+
182
+
183
+ def start_translation_process(force_retranslate=False):
184
+ """Start the translation process for the first file"""
185
+ if not state.files_to_translate:
186
+ return "❌ No files available for translation.", ""
187
+
188
+ current_file = state.files_to_translate[0]
189
+
190
+ # Call translation function (simplified for demo)
191
+ try:
192
+ status, translated = translate_docs_interactive(
193
+ state.target_language, [[current_file]], state.additional_instruction, state.selected_project, force_retranslate
194
+ )
195
+
196
+ state.current_file_content = {"translated": translated}
197
+ path = (
198
+ Path(__file__).resolve().parent.parent
199
+ / f"translation_result/{current_file}"
200
+ )
201
+ p = Path(path)
202
+ p.parent.mkdir(parents=True, exist_ok=True)
203
+ p.write_text(translated, encoding="utf-8")
204
+
205
+ config = get_project_config(state.selected_project)
206
+ original_file_link = f"{config.repo_url}/blob/main/{current_file}"
207
+ print("Compeleted translation:\n")
208
+ print(translated)
209
+ print("----------------------------")
210
+
211
+ # Different response format for existing vs new translation
212
+ if isinstance(status, str) and "Existing translation loaded" in status:
213
+ response = f"{status}\n**📄 Original Content Link:** {original_file_link}\n\n**🌐 Translated Content:**"
214
+ else:
215
+ response = (
216
+ f"""🔄 Translation for: `{current_file}`\n"""
217
+ f"**📄 Original Content Link:** {original_file_link}\n\n"
218
+ f"{status}\n\n"
219
+ "**🌐 Translated Content:**"
220
+ )
221
+ return response, translated
222
+
223
+
224
+ except Exception as e:
225
+ response = f"❌ Translation failed: {str(e)}"
226
+ response += "\n**➡️ Please try from the beginning.**"
227
+ return response, ""
228
+
229
+
230
+ def handle_general_message(message):
231
+ """Handle general messages"""
232
+ message_lower = message.lower()
233
+
234
+ if any(word in message_lower for word in ["help", "what", "how"]):
235
+ return """**🤖 I'm your Hugging Face i18n Translation Agent!**
236
+
237
+ I can help you:
238
+ 1. **🔍 Find files** that need translation
239
+ 2. **🌐 Translate documents** using AI
240
+ 3. **📋 Review translations** for quality
241
+ 4. **🚀 Create GitHub PR** for translation
242
+
243
+ Currently available actions with quick controls:
244
+ - "find files" - Search for files needing translation
245
+ - "translate" - Start translation process
246
+ - "review" - Review current translation
247
+ - "github" - Create GitHub Pull Request
248
+ - "restart" - Start over"""
249
+
250
+ elif "restart" in message_lower:
251
+ global state
252
+ state = ChatState()
253
+ return get_welcome_message()
254
+
255
+ else:
256
+ return """I understand you want to work on translations!
257
+
258
+ **Two ways to get started:**
259
+
260
+ 1. **🔍 Find Files first** - Use Tab 1 to discover files that need translation
261
+ 2. **🚀 Direct Translation** - Go to Tab 2 and enter a file path directly (e.g., `docs/source/en/model_doc/bert.md`)
262
+
263
+ Make sure to configure your API keys in the Configuration panel above.
264
+ """
265
+
266
+
267
+ # Main handler
268
+ def handle_user_message(message, history):
269
+ """Handle user messages and provide appropriate responses"""
270
+ global state
271
+
272
+ if not message.strip():
273
+ return history, ""
274
+
275
+ elif state.step == "find_files" and any(
276
+ word in message.lower()
277
+ for word in ["yes", "proceed", "start", "translate", "translation"]
278
+ ):
279
+ # User wants to start translation
280
+ if state.files_to_translate:
281
+ state.step = "translate"
282
+ response, translated = start_translation_process()
283
+ history.append([message, response])
284
+ history.append(["", translated])
285
+ return history, ""
286
+ else:
287
+ response = (
288
+ "❌ No files available for translation. Please search for files first."
289
+ )
290
+ # Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
291
+ else:
292
+ # General response
293
+ response = handle_general_message(message)
294
+
295
+ history.append([message, response])
296
+ return history, ""
297
+
298
+
299
+ def update_status():
300
+ if state.step == "welcome":
301
+ return f"""
302
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
303
+ <div><strong>🔄 Step:</strong> Welcome</div>
304
+ <div><strong>🎯 Project:</strong> {state.selected_project}</div>
305
+ <div><strong>📁 Files:</strong> 0</div>
306
+ <div><strong>🌍 Language:</strong> {state.target_language}</div>
307
+ </div>
308
+ """
309
+
310
+ step_map = {
311
+ "welcome": "Welcome",
312
+ "find_files": "Finding Files",
313
+ "translate": "Translating",
314
+ "review": "Reviewing",
315
+ "create_github_pr": "Creating PR",
316
+ }
317
+
318
+ progress_map = {
319
+ "welcome": "Ready to start",
320
+ "find_files": "Files found",
321
+ "translate": f"{len(state.files_to_translate)} remaining",
322
+ "review": "Review complete",
323
+ "create_github_pr": "PR generation in progress",
324
+ }
325
+
326
+ # Check GitHub configuration status
327
+ github_status = "❌ Not configured"
328
+ if all(
329
+ [
330
+ state.github_config["token"],
331
+ state.github_config["owner"],
332
+ state.github_config["repo_name"],
333
+ ]
334
+ ):
335
+ github_status = (
336
+ f"✅ {state.github_config['owner']}/{state.github_config['repo_name']}"
337
+ )
338
+
339
+ status_html = f"""
340
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px; padding: 10px; background: rgba(0, 0, 0, 0.25); border-radius: 8px;">
341
+ <div><strong>🔄 Step:</strong> {step_map.get(state.step, state.step)}</div>
342
+ <div><strong>🎯 Project:</strong> {state.selected_project}</div>
343
+ <div><strong>📁 Files:</strong> {len(state.files_to_translate)}</div>
344
+ <div><strong>🌍 Language:</strong> {state.target_language}</div>
345
+ <div><strong>⏳ Progress:</strong> {progress_map.get(state.step, 'In progress')}</div>
346
+ <div><strong>🔧 GitHub:</strong> {github_status}</div>
347
+ </div>
348
+ """
349
+
350
+ return status_html
351
+
352
+
353
+ # Event handlers
354
+
355
+
356
+ def sync_language_displays(lang):
357
+ return lang
358
+
359
+
360
+ def update_project_selection(project, history):
361
+ """Update state when project is selected"""
362
+ global state
363
+ state.selected_project = project
364
+ response = f"Selection confirmed: 🎯 Project → **{project}**"
365
+ history.append(["Project selection", response])
366
+ return history, "", update_status()
367
+
368
+
369
+ def update_language_selection(lang, history):
370
+ """Update state when language is selected"""
371
+ global state
372
+ state.target_language = lang
373
+ response = f"Selection confirmed: 🌍 Language → **{lang}**"
374
+ history.append(["Language selection", response])
375
+ return history, "", update_status(), lang
376
+
377
+
378
+ def update_persistent_config(api_provider, anthropic_key, aws_bearer_token_bedrock, github_token, github_owner, github_repo, reference_pr_url, history):
379
+ """Update persistent configuration settings."""
380
+ global state
381
+
382
+ # Update API keys based on provider selection
383
+ if api_provider == "Anthropic":
384
+ state.persistent_settings["anthropic_api_key"] = anthropic_key
385
+ os.environ["ANTHROPIC_API_KEY"] = anthropic_key
386
+ # Clear AWS Bedrock token if Anthropic is selected
387
+ state.persistent_settings["aws_bearer_token_bedrock"] = ""
388
+ os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
389
+ elif api_provider == "AWS Bedrock":
390
+ state.persistent_settings["aws_bearer_token_bedrock"] = aws_bearer_token_bedrock
391
+ os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
392
+ # Clear Anthropic key if AWS Bedrock is selected
393
+ state.persistent_settings["anthropic_api_key"] = ""
394
+ os.environ.pop("ANTHROPIC_API_KEY", None)
395
+ else:
396
+ # If no provider is selected or unknown, clear both
397
+ state.persistent_settings["anthropic_api_key"] = ""
398
+ os.environ.pop("ANTHROPIC_API_KEY", None)
399
+ state.persistent_settings["aws_bearer_token_bedrock"] = ""
400
+ os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None)
401
+
402
+ if github_token:
403
+ os.environ["GITHUB_TOKEN"] = github_token
404
+
405
+ # Get default reference PR URL from project config if not provided
406
+ if not reference_pr_url and state.selected_project:
407
+ try:
408
+ config = get_project_config(state.selected_project)
409
+ reference_pr_url = config.reference_pr_url
410
+ except Exception:
411
+ pass
412
+
413
+ # Save GitHub configuration to persistent settings
414
+ state.persistent_settings["github_config"].update({
415
+ "token": github_token or "",
416
+ "owner": github_owner or "",
417
+ "repo_name": github_repo or "",
418
+ "reference_pr_url": reference_pr_url or "",
419
+ })
420
+
421
+ # Build response message based on what was configured
422
+ response = "✅ Configuration saved!"
423
+ if github_owner and github_repo:
424
+ response += f" GitHub: {github_owner}/{github_repo}"
425
+
426
+ if api_provider == "Anthropic" and anthropic_key:
427
+ response += " Anthropic API key updated."
428
+ elif api_provider == "AWS Bedrock" and aws_bearer_token_bedrock:
429
+ response += " AWS Bedrock Bearer Token updated."
430
+
431
+ history.append(["Configuration update", response])
432
+ return history, "", update_status()
433
+
434
+
435
+ def update_github_config(token, owner, repo, reference_pr_url):
436
+ """Legacy function for backward compatibility."""
437
+ return update_persistent_config("", token, owner, repo, reference_pr_url)
438
+
439
+
440
+ def update_prompt_preview(language, file_path, additional_instruction):
441
+ """Update prompt preview based on current settings"""
442
+ if not file_path.strip():
443
+ return "Select a file to see the prompt preview..."
444
+
445
+ try:
446
+ # Get language name
447
+ if language == "ko":
448
+ translation_lang = "Korean"
449
+ else:
450
+ translation_lang = language
451
+
452
+ # Get sample content (first 500 characters)
453
+ content = get_content(file_path, state.selected_project)
454
+ to_translate = preprocess_content(content)
455
+
456
+ # Truncate for preview
457
+ sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")
458
+
459
+ # Generate prompt
460
+ prompt = get_full_prompt(translation_lang, sample_content, additional_instruction)
461
+
462
+ return prompt
463
+ except Exception as e:
464
+ error_str = str(e)
465
+ if "Failed to retrieve content from the URL" in error_str:
466
+ return f"❌ **File not found:** `{file_path}`\n\n💡 **Please check:**\n1. Is this file in the **{state.selected_project}** project?\n2. Use \"🔍 Find Files to Translate\" to see available files\n3. Verify the file path is correct"
467
+ return f"Error generating prompt preview: {error_str}"
468
+
469
+
470
+ def send_message(message, history):
471
+ new_history, cleared_input = handle_user_message(message, history)
472
+ return new_history, cleared_input, update_status()
473
+
474
+
475
+ # Button handlers with tab switching
476
+ def start_translate_handler(history, file_to_translate, additional_instruction="", force_retranslate=False):
477
+ # Use persistent anthropic key
478
+ anthropic_key = state.persistent_settings["anthropic_api_key"]
479
+ aws_bearer_token_bedrock = state.persistent_settings["aws_bearer_token_bedrock"]
480
+
481
+ if not anthropic_key and not aws_bearer_token_bedrock:
482
+ response = "❌ Please set either Anthropic API key or AWS Bearer Token for Bedrock in Configuration panel first."
483
+ history.append(["Translation request", response])
484
+ return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
485
+
486
+ # Set the active API key to environment variable for translator.content.py
487
+ if anthropic_key:
488
+ os.environ["ANTHROPIC_API_KEY"] = anthropic_key
489
+ os.environ.pop("AWS_BEARER_TOKEN_BEDROCK", None) # Ensure only one is active
490
+ elif aws_bearer_token_bedrock:
491
+ os.environ["AWS_BEARER_TOKEN_BEDROCK"] = aws_bearer_token_bedrock
492
+ os.environ.pop("ANTHROPIC_API_KEY", None) # Ensure only one is active
493
+
494
+ # Check if file path is provided
495
+ if not file_to_translate or not file_to_translate.strip():
496
+ response = "❌ Please select a file from the dropdown or enter a file path to translate."
497
+ history.append(["Translation request", response])
498
+ return history, "", update_status(), gr.Tabs(), gr.update(), gr.update()
499
+
500
+ state.additional_instruction = additional_instruction
501
+ state.files_to_translate = [file_to_translate]
502
+ state.step = "translate"
503
+
504
+ # Start translation directly
505
+ if force_retranslate:
506
+ history.append(["Translation request", "🔄 **Force retranslation started...**"])
507
+ response, translated = start_translation_process(force_retranslate)
508
+ history.append(["", response])
509
+ if translated:
510
+ history.append(["", translated])
511
+
512
+ # Update button text and show confirm button after translation
513
+ start_btn_text = "🔄 Retranslation" if state.current_file_content["translated"] else "🚀 Start Translation"
514
+ confirm_btn_visible = bool(state.current_file_content["translated"])
515
+
516
+ return history, "", update_status(), gr.Tabs(), gr.update(value=start_btn_text), gr.update(visible=confirm_btn_visible)
517
+
518
+
519
+ def approve_handler(history, owner, repo, reference_pr_url):
520
+ """Handles the request to generate a GitHub PR."""
521
+ global state
522
+ state.step = "create_github_pr"
523
+
524
+ # Check all required GitHub configuration at once
525
+ github_config = state.persistent_settings["github_config"]
526
+ missing_config = []
527
+
528
+ if not github_config.get("token"):
529
+ missing_config.append("GitHub Token")
530
+ if not owner:
531
+ missing_config.append("GitHub Owner")
532
+ if not repo:
533
+ missing_config.append("Repository Name")
534
+
535
+ if missing_config:
536
+ config = get_project_config(state.selected_project)
537
+ repo_name = config.repo_url.split('/')[-1] # Extract repo name from URL
538
+ response = f"❌ Please set the following in Configuration panel first: {', '.join(missing_config)}\n\n💡 **Note:** GitHub Owner/Repository should be your fork of [`{repo_name}`]({config.repo_url}) (e.g., Owner: `your-username`, Repository: `{repo_name}`)"
539
+ history.append(["GitHub PR creation request", response])
540
+ return history, "", update_status()
541
+
542
+ # Update reference PR URL (can be set per PR)
543
+ if reference_pr_url:
544
+ state.persistent_settings["github_config"]["reference_pr_url"] = reference_pr_url
545
+
546
+ # Use persistent settings
547
+ github_config = state.persistent_settings["github_config"]
548
+
549
+ # Initialize response variable
550
+ response = ""
551
+
552
+ # If reference PR is not provided, use the agent to find one
553
+ if not github_config.get("reference_pr_url"):
554
+ response = "🤖 **Reference PR URL not found. The agent will now search for a suitable one...**"
555
+ try:
556
+ # This part is simplified to avoid streaming logic in a non-generator function
557
+ stream_gen = find_reference_pr_simple_stream(
558
+ target_language=state.target_language,
559
+ context="documentation translation",
560
+ )
561
+ # We will just get the final result from the generator
562
+ final_result = None
563
+ try:
564
+ while True:
565
+ # We are not interested in the streamed messages here, just the final result.
566
+ next(stream_gen)
567
+ except StopIteration as e:
568
+ final_result = e.value
569
+
570
+ if final_result and final_result.get("status") == "success":
571
+ result_text = final_result.get("result", "")
572
+ match = re.search(r"https://github.com/[^\s]+", result_text)
573
+ if match:
574
+ found_url = match.group(0)
575
+ state.github_config["reference_pr_url"] = found_url
576
+ response += f"\n✅ **Agent found a reference PR:** {found_url}"
577
+ else:
578
+ raise ValueError(
579
+ "Could not extract a valid PR URL from agent's response."
580
+ )
581
+ else:
582
+ error_message = final_result.get("message") or final_result.get(
583
+ "result", "Unknown error"
584
+ )
585
+ raise ValueError(f"Agent failed to find a PR. Reason: {error_message}")
586
+ except Exception as e:
587
+ response += f"\n❌ **Agent failed to find a reference PR.**\nReason: {e}\n\nPlease provide a reference PR URL manually in Tab 3 and try again."
588
+ history.append(["Agent searching for PR", response])
589
+ return history, "", update_status()
590
+
591
+ # Proceed with PR generation
592
+ if state.files_to_translate and state.current_file_content.get("translated"):
593
+ current_file = state.files_to_translate[0]
594
+ translated_content = state.current_file_content["translated"]
595
+ response += "\n\n🚀 **Generating GitHub PR...**"
596
+
597
+ # Extract title from file for toctree mapping
598
+ file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
599
+ print(file_name)
600
+
601
+ pr_response = generate_github_pr(
602
+ target_language=state.target_language,
603
+ filepath=current_file,
604
+ translated_content=translated_content,
605
+ github_config=state.github_config,
606
+ en_title=file_name,
607
+ project=state.selected_project,
608
+ )
609
+ response += f"\n{pr_response}"
610
+ else:
611
+ response = "❌ No translated file available. Please complete the translation process first."
612
+
613
+ history.append(["GitHub PR creation request", response])
614
+ return history, "", update_status()
615
+
616
+
617
+ def restart_handler(history):
618
+ """Resets the workflow state but preserves persistent settings."""
619
+ global state
620
+ # Backup persistent settings
621
+ backup_settings = state.persistent_settings.copy()
622
+
623
+ # Reset state
624
+ state = ChatState()
625
+
626
+ # Restore persistent settings
627
+ state.persistent_settings = backup_settings
628
+
629
+ # Restore environment variables
630
+ if backup_settings["anthropic_api_key"]:
631
+ os.environ["ANTHROPIC_API_KEY"] = backup_settings["anthropic_api_key"]
632
+ if backup_settings["aws_bearer_token_bedrock"]:
633
+ os.environ["AWS_BEARER_TOKEN_BEDROCK"] = backup_settings["aws_bearer_token_bedrock"]
634
+ if backup_settings["github_config"]["token"]:
635
+ os.environ["GITHUB_TOKEN"] = backup_settings["github_config"]["token"]
636
+
637
+ welcome_msg = get_welcome_message()
638
+ new_hist = [[None, welcome_msg]]
639
+ return new_hist, "", update_status(), gr.Tabs(selected=0)
agent/toctree_handler.py CHANGED
@@ -1,419 +1,419 @@
1
- import yaml
2
- import requests
3
- from typing import Dict, List, Any
4
- import os
5
-
6
- class TocTreeHandler:
7
- def __init__(self, project: str = "transformers"):
8
- from translator.project_config import get_project_config
9
- self.project = project
10
- self.project_config = get_project_config(project)
11
-
12
- # Extract repository path from config
13
- repo_path = self.project_config.repo_url.replace("https://github.com/", "")
14
-
15
- # Build project-specific URLs
16
- self.en_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/en/_toctree.yml"
17
- self.ko_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/ko/_toctree.yml"
18
- self.local_docs_path = "docs/source/ko"
19
-
20
- def fetch_toctree(self, url: str) -> Dict[str, Any]:
21
- """Fetch and parse YAML from URL"""
22
- response = requests.get(url)
23
- response.raise_for_status()
24
- return yaml.safe_load(response.text)
25
-
26
- def get_en_toctree(self) -> Dict[str, Any]:
27
- """Get English toctree structure"""
28
- return self.fetch_toctree(self.en_toctree_url)
29
-
30
- def get_ko_toctree(self) -> Dict[str, Any]:
31
- """Get Korean toctree structure"""
32
- return self.fetch_toctree(self.ko_toctree_url)
33
-
34
- def extract_title_mappings(self, en_data: List[Dict], ko_data: List[Dict]) -> Dict[str, str]:
35
- """Extract title mappings between English and Korean"""
36
- mappings = {}
37
-
38
- def process_section(en_section: Dict, ko_section: Dict):
39
- if 'local' in en_section and 'local' in ko_section:
40
- if en_section['local'] == ko_section['local']:
41
- en_title = en_section.get('title', '')
42
- ko_title = ko_section.get('title', '')
43
- if en_title and ko_title:
44
- mappings[en_title] = ko_title
45
-
46
- if 'sections' in en_section and 'sections' in ko_section:
47
- en_sections = en_section['sections']
48
- ko_sections = ko_section['sections']
49
-
50
- for i, en_sub in enumerate(en_sections):
51
- if i < len(ko_sections):
52
- process_section(en_sub, ko_sections[i])
53
-
54
- for i, en_item in enumerate(en_data):
55
- if i < len(ko_data):
56
- process_section(en_item, ko_data[i])
57
-
58
- return mappings
59
-
60
- def translate_title(self, en_title: str) -> str:
61
- """Translate English title to Korean using LLM"""
62
- try:
63
- from translator.content import llm_translate
64
-
65
- prompt = f"""Translate the following English documentation title to Korean. Return only the translated title, nothing else.
66
-
67
- English title: {en_title}
68
-
69
- Korean title:"""
70
-
71
- callback_result, translated_title = llm_translate(prompt)
72
- return translated_title.strip()
73
- except Exception as e:
74
- print(f"Error translating title '{en_title}': {e}")
75
- return en_title
76
-
77
- def create_local_toctree(self, en_title: str, local_file_path: str) -> Dict[str, str]:
78
- """Create local toctree entry with Korean title and local path"""
79
- try:
80
- # First try to get Korean title from existing mappings
81
- en_data = self.get_en_toctree()
82
- ko_data = self.get_ko_toctree()
83
-
84
- title_mappings = self.extract_title_mappings(en_data, ko_data)
85
- ko_title = title_mappings.get(en_title)
86
-
87
- # If no existing mapping, translate the title
88
- if not ko_title:
89
- ko_title = self.translate_title(en_title)
90
-
91
- return {
92
- 'local': local_file_path,
93
- 'title': ko_title
94
- }
95
- except Exception as e:
96
- print(f"Error creating local toctree: {e}")
97
- return {
98
- 'local': local_file_path,
99
- 'title': en_title
100
- }
101
-
102
- def find_and_update_translation_entry(self, ko_toctree_data, target_local: str, english_title: str, korean_title: str):
103
- """Find entry with '(번역중) 영어제목' and update it"""
104
- target_title_pattern = f"(번역중) {english_title}"
105
-
106
- def process_item(item):
107
- if isinstance(item, dict):
108
- # Check if title matches the pattern
109
- if item.get('title') == target_title_pattern:
110
- # Update local path and title
111
- item['local'] = target_local
112
- item['title'] = korean_title
113
- return True
114
-
115
- # Process sections recursively
116
- if 'sections' in item:
117
- for section in item['sections']:
118
- if process_item(section):
119
- return True
120
- return False
121
-
122
- # Process the toctree data
123
- if isinstance(ko_toctree_data, list):
124
- for item in ko_toctree_data:
125
- if process_item(item):
126
- return True
127
- return False
128
-
129
- def create_updated_toctree_with_replacement(self, ko_toctree: list, target_local: str) -> list:
130
- """Update Korean toctree by finding and updating translation entry"""
131
- try:
132
- # Step 1: Get English toctree and find the English title for target_local
133
- en_toctree = self.get_en_toctree()
134
- english_title = self.find_title_for_local(en_toctree, target_local)
135
-
136
- if not english_title:
137
- print(f"⚠️ Toctree entry not found: '{target_local}' not in English toctree")
138
- print(f"🔍 Attempting to find appropriate section for new entry...")
139
- # Try to add new entry in appropriate location
140
- return self.add_new_toctree_entry(ko_toctree, target_local)
141
-
142
- print(f"Found English title: {english_title} for local: {target_local}")
143
-
144
- # Step 2: Translate the English title to Korean
145
- korean_title = self.translate_title(english_title)
146
- print(f"Translated Korean title: {korean_title}")
147
-
148
- # Step 3: Make a deep copy to avoid modifying original
149
- import copy
150
- updated_toctree = copy.deepcopy(ko_toctree)
151
-
152
- # Step 4: Find and update the "(번역중) 영어제목" entry
153
- updated = self.find_and_update_translation_entry(
154
- updated_toctree, target_local, english_title, korean_title
155
- )
156
-
157
- if updated:
158
- print(f"✅ Successfully updated translation entry: local={target_local}, title={korean_title}")
159
- return updated_toctree
160
- else:
161
- print(f"⚠️ Toctree update skipped: '(번역중) {english_title}' entry not found")
162
- print(f"📋 This may be a new file not yet added to Korean toctree")
163
- return ko_toctree
164
-
165
- except Exception as e:
166
- print(f"Error creating updated toctree: {e}")
167
- return ko_toctree
168
-
169
- def find_title_for_local(self, toctree_data, target_local: str):
170
- """Find title for given local path in toctree"""
171
- def search_item(item):
172
- if isinstance(item, dict):
173
- if item.get('local') == target_local:
174
- return item.get('title', '')
175
-
176
- if 'sections' in item:
177
- for section in item['sections']:
178
- result = search_item(section)
179
- if result:
180
- return result
181
- return None
182
-
183
- if isinstance(toctree_data, list):
184
- for item in toctree_data:
185
- result = search_item(item)
186
- if result:
187
- return result
188
- return None
189
-
190
- def process_pr_commit(self, filepath: str):
191
- """Process PR commit by updating Korean toctree with translated entry"""
192
- # Get filepath without prefix
193
- filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
194
-
195
- # Get Korean toctree
196
- ko_toctree = self.get_ko_toctree()
197
-
198
- # Use diff-merge algorithm to add new entry
199
- updated_ko_toctree = self.add_new_toctree_entry(ko_toctree, filepath_without_prefix)
200
-
201
- if not updated_ko_toctree:
202
- print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
203
- return
204
-
205
- print(f"Successfully updated Korean toctree")
206
-
207
- # Store the updated toctree for commit
208
- self.updated_ko_toctree = updated_ko_toctree
209
-
210
- def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
211
- """Commit and push toctree updates as a separate commit"""
212
- try:
213
- # Use the updated toctree created by LLM
214
- if not hasattr(self, 'updated_ko_toctree') or not self.updated_ko_toctree:
215
- print("No updated Korean toctree available")
216
- return {"status": "error", "message": "No updated toctree to commit"}
217
-
218
- ko_data = self.updated_ko_toctree
219
-
220
- # Convert to YAML string
221
- toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
222
-
223
- # Create toctree commit message
224
- commit_message = "docs: update Korean documentation table of contents"
225
-
226
- # Commit toctree file
227
- file_result = pr_agent.create_or_update_file(
228
- owner=owner,
229
- repo_name=repo_name,
230
- path="docs/source/ko/_toctree.yml",
231
- message=commit_message,
232
- content=toctree_content,
233
- branch_name=branch_name
234
- )
235
-
236
- if file_result.startswith("SUCCESS"):
237
- return {
238
- "status": "success",
239
- "message": f"Toctree committed successfully: {file_result}",
240
- "commit_message": commit_message
241
- }
242
- else:
243
- return {
244
- "status": "error",
245
- "message": f"Toctree commit failed: {file_result}"
246
- }
247
-
248
- except Exception as e:
249
- return {
250
- "status": "error",
251
- "message": f"Error committing toctree: {str(e)}"
252
- }
253
-
254
- def update_toctree_after_translation(
255
- self,
256
- translation_result: dict,
257
- filepath: str,
258
- pr_agent,
259
- github_config: dict,
260
- project: str = "transformers"
261
- ) -> dict:
262
- """Update toctree after successful translation PR.
263
-
264
- Args:
265
- translation_result: Result from translation PR workflow
266
- filepath: Original file path
267
- pr_agent: GitHub PR agent instance
268
- github_config: GitHub configuration dictionary
269
-
270
- Returns:
271
- Dictionary with toctree update result
272
- """
273
- if translation_result["status"] == "error":
274
- return None
275
-
276
- try:
277
- # Process toctree update with LLM
278
- self.process_pr_commit(filepath)
279
- # Commit toctree as separate commit
280
- if self.updated_ko_toctree:
281
- return self.commit_and_push_toctree(
282
- pr_agent=pr_agent,
283
- owner=github_config["owner"],
284
- repo_name=github_config["repo_name"],
285
- branch_name=translation_result["branch"]
286
- )
287
-
288
- except Exception as e:
289
- return {
290
- "status": "error",
291
- "message": f"Error updating toctree: {str(e)}"
292
- }
293
-
294
- def add_new_toctree_entry(self, ko_toctree: list, target_local: str) -> list:
295
- """Add new toctree entry using diff-merge algorithm"""
296
- try:
297
- import copy
298
- updated_toctree = copy.deepcopy(ko_toctree)
299
-
300
- # Generate new entry
301
- filename = target_local.split('/')[-1].replace('_', ' ').title()
302
- korean_title = self.translate_title(filename)
303
- new_entry = {
304
- 'local': target_local,
305
- 'title': korean_title
306
- }
307
-
308
- # Get English toctree for structure reference
309
- en_toctree = self.get_en_toctree()
310
-
311
- # Use diff-merge algorithm
312
- if self.merge_toctree_sections(en_toctree, updated_toctree, target_local, new_entry):
313
- return updated_toctree
314
- else:
315
- # Fallback: add to root level
316
- updated_toctree.append(new_entry)
317
- print(f"✅ Added new entry at root level: {target_local} -> {korean_title}")
318
- return updated_toctree
319
-
320
- except Exception as e:
321
- print(f"❌ Error adding new toctree entry: {e}")
322
- return ko_toctree
323
-
324
- def merge_toctree_sections(self, en_toctree: list, ko_toctree: list, target_local: str, new_entry: dict) -> bool:
325
- """Merge English toctree structure into Korean toctree for target_local"""
326
- for en_section in en_toctree:
327
- en_title = en_section.get('title')
328
-
329
- # Check if this English section contains our target
330
- if self.contains_target(en_section, target_local):
331
- # Find matching Korean section
332
- ko_section = self.find_matching_section(ko_toctree, en_title)
333
-
334
- if ko_section:
335
- # Section exists - merge subsections
336
- return self.merge_subsections(en_section, ko_section, target_local, new_entry)
337
- else:
338
- # Section doesn't exist - create new section
339
- new_ko_section = self.create_section_with_order(en_section, target_local, new_entry)
340
- ko_toctree.append(new_ko_section)
341
- print(f"✅ Created new section '{new_ko_section.get('title')}' with ordered structure")
342
- return True
343
- return False
344
-
345
- def contains_target(self, section: dict, target_local: str) -> bool:
346
- """Check if section contains target_local recursively"""
347
- if 'sections' in section:
348
- for subsection in section['sections']:
349
- if subsection.get('local') == target_local:
350
- return True
351
- if self.contains_target(subsection, target_local):
352
- return True
353
- return False
354
-
355
- def find_matching_section(self, ko_toctree: list, en_title: str) -> dict:
356
- """Find Korean section that matches English title"""
357
- # Try exact match first
358
- for item in ko_toctree:
359
- if item.get('title') == en_title:
360
- return item
361
-
362
- # Try translated title match
363
- try:
364
- translated_title = self.translate_title(en_title)
365
- for item in ko_toctree:
366
- if item.get('title') == translated_title:
367
- return item
368
- except:
369
- pass
370
-
371
- return None
372
-
373
- def merge_subsections(self, en_section: dict, ko_section: dict, target_local: str, new_entry: dict) -> bool:
374
- """Merge subsections while maintaining order"""
375
- if 'sections' not in en_section:
376
- return False
377
-
378
- # Find target index in English sections
379
- target_index = -1
380
- for i, en_subsection in enumerate(en_section['sections']):
381
- if en_subsection.get('local') == target_local:
382
- target_index = i
383
- break
384
-
385
- if target_index == -1:
386
- return False
387
-
388
- # Ensure Korean section has sections array
389
- if 'sections' not in ko_section:
390
- ko_section['sections'] = []
391
-
392
- # Insert at correct position
393
- self.insert_at_correct_position(ko_section, target_index, new_entry)
394
- return True
395
-
396
- def insert_at_correct_position(self, ko_section: dict, target_index: int, new_entry: dict):
397
- """Insert entry at correct position, expanding array if needed"""
398
- sections = ko_section['sections']
399
-
400
- # Expand sections array if needed
401
- while len(sections) <= target_index:
402
- sections.append(None) # Placeholder
403
-
404
- # Insert new entry
405
- sections[target_index] = new_entry
406
-
407
- # Clean up None placeholders at the end
408
- while sections and sections[-1] is None:
409
- sections.pop()
410
-
411
- def create_section_with_order(self, en_section: dict, target_local: str, new_entry: dict) -> dict:
412
- """Create new Korean section with only the translated entry"""
413
- new_ko_section = {
414
- 'title': self.translate_title(en_section.get('title')),
415
- 'isExpanded': en_section.get('isExpanded', False),
416
- 'sections': [new_entry] # Only add the translated entry
417
- }
418
-
419
- return new_ko_section
 
1
+ import yaml
2
+ import requests
3
+ from typing import Dict, List, Any
4
+ import os
5
+
6
+ class TocTreeHandler:
7
+ def __init__(self, project: str = "transformers"):
8
+ from translator.project_config import get_project_config
9
+ self.project = project
10
+ self.project_config = get_project_config(project)
11
+
12
+ # Extract repository path from config
13
+ repo_path = self.project_config.repo_url.replace("https://github.com/", "")
14
+
15
+ # Build project-specific URLs
16
+ self.en_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/en/_toctree.yml"
17
+ self.ko_toctree_url = f"https://raw.githubusercontent.com/{repo_path}/main/docs/source/ko/_toctree.yml"
18
+ self.local_docs_path = "docs/source/ko"
19
+
20
+ def fetch_toctree(self, url: str) -> Dict[str, Any]:
21
+ """Fetch and parse YAML from URL"""
22
+ response = requests.get(url, timeout=30)
23
+ response.raise_for_status()
24
+ return yaml.safe_load(response.text)
25
+
26
+ def get_en_toctree(self) -> Dict[str, Any]:
27
+ """Get English toctree structure"""
28
+ return self.fetch_toctree(self.en_toctree_url)
29
+
30
+ def get_ko_toctree(self) -> Dict[str, Any]:
31
+ """Get Korean toctree structure"""
32
+ return self.fetch_toctree(self.ko_toctree_url)
33
+
34
+ def extract_title_mappings(self, en_data: List[Dict], ko_data: List[Dict]) -> Dict[str, str]:
35
+ """Extract title mappings between English and Korean"""
36
+ mappings = {}
37
+
38
+ def process_section(en_section: Dict, ko_section: Dict):
39
+ if 'local' in en_section and 'local' in ko_section:
40
+ if en_section['local'] == ko_section['local']:
41
+ en_title = en_section.get('title', '')
42
+ ko_title = ko_section.get('title', '')
43
+ if en_title and ko_title:
44
+ mappings[en_title] = ko_title
45
+
46
+ if 'sections' in en_section and 'sections' in ko_section:
47
+ en_sections = en_section['sections']
48
+ ko_sections = ko_section['sections']
49
+
50
+ for i, en_sub in enumerate(en_sections):
51
+ if i < len(ko_sections):
52
+ process_section(en_sub, ko_sections[i])
53
+
54
+ for i, en_item in enumerate(en_data):
55
+ if i < len(ko_data):
56
+ process_section(en_item, ko_data[i])
57
+
58
+ return mappings
59
+
60
+ def translate_title(self, en_title: str) -> str:
61
+ """Translate English title to Korean using LLM"""
62
+ try:
63
+ from translator.content import llm_translate
64
+
65
+ prompt = f"""Translate the following English documentation title to Korean. Return only the translated title, nothing else.
66
+
67
+ English title: {en_title}
68
+
69
+ Korean title:"""
70
+
71
+ callback_result, translated_title = llm_translate(prompt)
72
+ return translated_title.strip()
73
+ except Exception as e:
74
+ print(f"Error translating title '{en_title}': {e}")
75
+ return en_title
76
+
77
+ def create_local_toctree(self, en_title: str, local_file_path: str) -> Dict[str, str]:
78
+ """Create local toctree entry with Korean title and local path"""
79
+ try:
80
+ # First try to get Korean title from existing mappings
81
+ en_data = self.get_en_toctree()
82
+ ko_data = self.get_ko_toctree()
83
+
84
+ title_mappings = self.extract_title_mappings(en_data, ko_data)
85
+ ko_title = title_mappings.get(en_title)
86
+
87
+ # If no existing mapping, translate the title
88
+ if not ko_title:
89
+ ko_title = self.translate_title(en_title)
90
+
91
+ return {
92
+ 'local': local_file_path,
93
+ 'title': ko_title
94
+ }
95
+ except Exception as e:
96
+ print(f"Error creating local toctree: {e}")
97
+ return {
98
+ 'local': local_file_path,
99
+ 'title': en_title
100
+ }
101
+
102
+ def find_and_update_translation_entry(self, ko_toctree_data, target_local: str, english_title: str, korean_title: str):
103
+ """Find entry with '(번역중) 영어제목' and update it"""
104
+ target_title_pattern = f"(번역중) {english_title}"
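+ # "(번역중)" is the "translation in progress" placeholder prefix used in the Korean toctree.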
105
+
106
+ def process_item(item):
107
+ if isinstance(item, dict):
108
+ # Check if title matches the pattern
109
+ if item.get('title') == target_title_pattern:
110
+ # Update local path and title
111
+ item['local'] = target_local
112
+ item['title'] = korean_title
113
+ return True
114
+
115
+ # Process sections recursively
116
+ if 'sections' in item:
117
+ for section in item['sections']:
118
+ if process_item(section):
119
+ return True
120
+ return False
121
+
122
+ # Process the toctree data
123
+ if isinstance(ko_toctree_data, list):
124
+ for item in ko_toctree_data:
125
+ if process_item(item):
126
+ return True
127
+ return False
128
+
129
+ def create_updated_toctree_with_replacement(self, ko_toctree: list, target_local: str) -> list:
130
+ """Update Korean toctree by finding and updating translation entry"""
131
+ try:
132
+ # Step 1: Get English toctree and find the English title for target_local
133
+ en_toctree = self.get_en_toctree()
134
+ english_title = self.find_title_for_local(en_toctree, target_local)
135
+
136
+ if not english_title:
137
+ print(f"⚠️ Toctree entry not found: '{target_local}' not in English toctree")
138
+ print(f"🔍 Attempting to find appropriate section for new entry...")
139
+ # Try to add new entry in appropriate location
140
+ return self.add_new_toctree_entry(ko_toctree, target_local)
141
+
142
+ print(f"Found English title: {english_title} for local: {target_local}")
143
+
144
+ # Step 2: Translate the English title to Korean
145
+ korean_title = self.translate_title(english_title)
146
+ print(f"Translated Korean title: {korean_title}")
147
+
148
+ # Step 3: Make a deep copy to avoid modifying original
149
+ import copy
150
+ updated_toctree = copy.deepcopy(ko_toctree)
151
+
152
+ # Step 4: Find and update the "(번역중) <English title>" entry
153
+ updated = self.find_and_update_translation_entry(
154
+ updated_toctree, target_local, english_title, korean_title
155
+ )
156
+
157
+ if updated:
158
+ print(f"✅ Successfully updated translation entry: local={target_local}, title={korean_title}")
159
+ return updated_toctree
160
+ else:
161
+ print(f"⚠️ Toctree update skipped: '(번역중) {english_title}' entry not found")
162
+ print(f"📋 This may be a new file not yet added to Korean toctree")
163
+ return ko_toctree
164
+
165
+ except Exception as e:
166
+ print(f"Error creating updated toctree: {e}")
167
+ return ko_toctree
168
+
169
+ def find_title_for_local(self, toctree_data, target_local: str):
170
+ """Find title for given local path in toctree"""
171
+ def search_item(item):
172
+ if isinstance(item, dict):
173
+ if item.get('local') == target_local:
174
+ return item.get('title', '')
175
+
176
+ if 'sections' in item:
177
+ for section in item['sections']:
178
+ result = search_item(section)
179
+ if result:
180
+ return result
181
+ return None
182
+
183
+ if isinstance(toctree_data, list):
184
+ for item in toctree_data:
185
+ result = search_item(item)
186
+ if result:
187
+ return result
188
+ return None
189
+
190
+ def process_pr_commit(self, filepath: str):
191
+ """Process PR commit by updating Korean toctree with translated entry"""
192
+ # Get filepath without prefix
193
+ filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
194
+
195
+ # Get Korean toctree
196
+ ko_toctree = self.get_ko_toctree()
197
+
198
+ # Use diff-merge algorithm to add new entry
199
+ updated_ko_toctree = self.add_new_toctree_entry(ko_toctree, filepath_without_prefix)
200
+
201
+ if not updated_ko_toctree:
202
+ print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
203
+ return
204
+
205
+ print(f"Successfully updated Korean toctree")
206
+
207
+ # Store the updated toctree for commit
208
+ self.updated_ko_toctree = updated_ko_toctree
209
+
210
+ def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
211
+ """Commit and push toctree updates as a separate commit"""
212
+ try:
213
+ # Use the updated toctree created by LLM
214
+ if not hasattr(self, 'updated_ko_toctree') or not self.updated_ko_toctree:
215
+ print("No updated Korean toctree available")
216
+ return {"status": "error", "message": "No updated toctree to commit"}
217
+
218
+ ko_data = self.updated_ko_toctree
219
+
220
+ # Convert to YAML string
221
+ toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
222
+
223
+ # Create toctree commit message
224
+ commit_message = "docs: update Korean documentation table of contents"
225
+
226
+ # Commit toctree file
227
+ file_result = pr_agent.create_or_update_file(
228
+ owner=owner,
229
+ repo_name=repo_name,
230
+ path="docs/source/ko/_toctree.yml",
231
+ message=commit_message,
232
+ content=toctree_content,
233
+ branch_name=branch_name
234
+ )
235
+
236
+ if file_result.startswith("SUCCESS"):
237
+ return {
238
+ "status": "success",
239
+ "message": f"Toctree committed successfully: {file_result}",
240
+ "commit_message": commit_message
241
+ }
242
+ else:
243
+ return {
244
+ "status": "error",
245
+ "message": f"Toctree commit failed: {file_result}"
246
+ }
247
+
248
+ except Exception as e:
249
+ return {
250
+ "status": "error",
251
+ "message": f"Error committing toctree: {str(e)}"
252
+ }
253
+
254
+ def update_toctree_after_translation(
255
+ self,
256
+ translation_result: dict,
257
+ filepath: str,
258
+ pr_agent,
259
+ github_config: dict,
260
+ project: str = "transformers"
261
+ ) -> dict:
262
+ """Update toctree after successful translation PR.
263
+
264
+ Args:
265
+ translation_result: Result from translation PR workflow
266
+ filepath: Original file path
267
+ pr_agent: GitHub PR agent instance
268
+ github_config: GitHub configuration dictionary
269
+
270
+ Returns:
271
+ Dictionary with toctree update result
272
+ """
273
+ if translation_result["status"] == "error":
274
+ return None
275
+
276
+ try:
277
+ # Process toctree update with LLM
278
+ self.process_pr_commit(filepath)
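+ # process_pr_commit stores its result on self.updated_ko_toctree as a side effect; it returns nothing.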
279
+ # Commit toctree as separate commit
280
+ if self.updated_ko_toctree:
281
+ return self.commit_and_push_toctree(
282
+ pr_agent=pr_agent,
283
+ owner=github_config["owner"],
284
+ repo_name=github_config["repo_name"],
285
+ branch_name=translation_result["branch"]
286
+ )
287
+
288
+ except Exception as e:
289
+ return {
290
+ "status": "error",
291
+ "message": f"Error updating toctree: {str(e)}"
292
+ }
293
+
294
+ def add_new_toctree_entry(self, ko_toctree: list, target_local: str) -> list:
295
+ """Add new toctree entry using diff-merge algorithm"""
296
+ try:
297
+ import copy
298
+ updated_toctree = copy.deepcopy(ko_toctree)
299
+
300
+ # Generate new entry
301
+ filename = target_local.split('/')[-1].replace('_', ' ').title()
302
+ korean_title = self.translate_title(filename)
303
+ new_entry = {
304
+ 'local': target_local,
305
+ 'title': korean_title
306
+ }
307
+
308
+ # Get English toctree for structure reference
309
+ en_toctree = self.get_en_toctree()
310
+
311
+ # Use diff-merge algorithm
312
+ if self.merge_toctree_sections(en_toctree, updated_toctree, target_local, new_entry):
313
+ return updated_toctree
314
+ else:
315
+ # Fallback: add to root level
316
+ updated_toctree.append(new_entry)
317
+ print(f"✅ Added new entry at root level: {target_local} -> {korean_title}")
318
+ return updated_toctree
319
+
320
+ except Exception as e:
321
+ print(f"❌ Error adding new toctree entry: {e}")
322
+ return ko_toctree
323
+
324
+ def merge_toctree_sections(self, en_toctree: list, ko_toctree: list, target_local: str, new_entry: dict) -> bool:
325
+ """Merge English toctree structure into Korean toctree for target_local"""
326
+ for en_section in en_toctree:
327
+ en_title = en_section.get('title')
328
+
329
+ # Check if this English section contains our target
330
+ if self.contains_target(en_section, target_local):
331
+ # Find matching Korean section
332
+ ko_section = self.find_matching_section(ko_toctree, en_title)
333
+
334
+ if ko_section:
335
+ # Section exists - merge subsections
336
+ return self.merge_subsections(en_section, ko_section, target_local, new_entry)
337
+ else:
338
+ # Section doesn't exist - create new section
339
+ new_ko_section = self.create_section_with_order(en_section, target_local, new_entry)
340
+ ko_toctree.append(new_ko_section)
341
+ print(f"✅ Created new section '{new_ko_section.get('title')}' with ordered structure")
342
+ return True
343
+ return False
344
+
345
+ def contains_target(self, section: dict, target_local: str) -> bool:
346
+ """Check if section contains target_local recursively"""
347
+ if 'sections' in section:
348
+ for subsection in section['sections']:
349
+ if subsection.get('local') == target_local:
350
+ return True
351
+ if self.contains_target(subsection, target_local):
352
+ return True
353
+ return False
354
+
355
+ def find_matching_section(self, ko_toctree: list, en_title: str) -> dict:
356
+ """Find Korean section that matches English title"""
357
+ # Try exact match first
358
+ for item in ko_toctree:
359
+ if item.get('title') == en_title:
360
+ return item
361
+
362
+ # Try translated title match
363
+ try:
364
+ translated_title = self.translate_title(en_title)
365
+ for item in ko_toctree:
366
+ if item.get('title') == translated_title:
367
+ return item
368
+ except Exception:
369
+ pass
370
+
371
+ return None
372
+
373
+ def merge_subsections(self, en_section: dict, ko_section: dict, target_local: str, new_entry: dict) -> bool:
374
+ """Merge subsections while maintaining order"""
375
+ if 'sections' not in en_section:
376
+ return False
377
+
378
+ # Find target index in English sections
379
+ target_index = -1
380
+ for i, en_subsection in enumerate(en_section['sections']):
381
+ if en_subsection.get('local') == target_local:
382
+ target_index = i
383
+ break
384
+
385
+ if target_index == -1:
386
+ return False
387
+
388
+ # Ensure Korean section has sections array
389
+ if 'sections' not in ko_section:
390
+ ko_section['sections'] = []
391
+
392
+ # Insert at correct position
393
+ self.insert_at_correct_position(ko_section, target_index, new_entry)
394
+ return True
395
+
396
+ def insert_at_correct_position(self, ko_section: dict, target_index: int, new_entry: dict):
397
+ """Insert entry at correct position, expanding array if needed"""
398
+ sections = ko_section['sections']
399
+
400
+ # Expand sections array if needed
401
+ while len(sections) <= target_index:
402
+ sections.append(None) # Placeholder
403
+
404
+ # Insert new entry
405
+ sections[target_index] = new_entry
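+ # NOTE: index assignment replaces whatever sits at target_index (placeholder or existing entry); it does not shift entries.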
406
+
407
+ # Clean up None placeholders at the end
408
+ while sections and sections[-1] is None:
409
+ sections.pop()
410
+
411
+ def create_section_with_order(self, en_section: dict, target_local: str, new_entry: dict) -> dict:
412
+ """Create new Korean section with only the translated entry"""
413
+ new_ko_section = {
414
+ 'title': self.translate_title(en_section.get('title')),
415
+ 'isExpanded': en_section.get('isExpanded', False),
416
+ 'sections': [new_entry] # Only add the translated entry
417
+ }
418
+
419
+ return new_ko_section
agent/workflow.py CHANGED
@@ -1,338 +1,338 @@
1
- """Module for gradio interfaces."""
2
-
3
- import os
4
- from pathlib import Path
5
- import gradio as gr
6
-
7
- from translator.content import (
8
- fill_scaffold,
9
- get_content,
10
- get_full_prompt,
11
- llm_translate,
12
- preprocess_content,
13
- )
14
- from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
15
- # GitHub PR Agent import
16
- try:
17
- from pr_generator.agent import GitHubPRAgent
18
-
19
- GITHUB_PR_AVAILABLE = True
20
- except ImportError as e:
21
- print(f"⚠️ GitHub PR Agent is not available: {e}")
22
- GITHUB_PR_AVAILABLE = False
23
-
24
- import json
25
- from logger.github_logger import GitHubLogger
26
-
27
-
28
- def report_translation_target_files(
29
- project: str, translate_lang: str, top_k: int = 1
30
- ) -> tuple[str, list[list[str]]]:
31
- """Return the top-k files that need translation, excluding files already in progress.
32
-
33
- Args:
34
- project: Project to translate (e.g., "transformers", "smolagents")
35
- translate_lang: Target language to translate
36
- top_k: Number of top-first files to return for translation. (Default 1)
37
- """
38
- # Get repo files once to avoid duplicate API calls
39
- all_repo_files = get_github_repo_files(project)
40
-
41
- # Get all available files for translation using the file list
42
- all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2, all_repo_files) # Get more to account for filtering
43
-
44
- # Get files in progress using the same file list
45
- docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang, all_repo_files)
46
-
47
- # Filter out files that are already in progress
48
- available_files = [f for f in all_filepath_list if f not in docs_in_progress]
49
-
50
- # Take only the requested number
51
- filepath_list = available_files[:top_k]
52
-
53
- # Build combined status report
54
- status_report = all_status_report
55
-
56
- if docs_in_progress:
57
- status_report += f"\n\n🤖 Found {len(docs_in_progress)} files in progress for translation:"
58
- for i, file in enumerate(docs_in_progress):
59
- status_report += f"\n{i+1}. [`{file}`]({pr_info_list[i]})"
60
- status_report += f"\n\n📋 Showing {len(filepath_list)} available files (excluding in-progress):"
61
-
62
- return status_report, [[file] for file in filepath_list]
63
-
64
-
65
- def translate_docs(lang: str, file_path: str, additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False) -> tuple[str, str]:
66
- """Translate documentation."""
67
- # Check if translation already exists (unless force retranslate is enabled)
68
- translation_file_path = (
69
- Path(__file__).resolve().parent.parent
70
- / f"translation_result/{file_path}"
71
- )
72
-
73
- if not force_retranslate and translation_file_path.exists():
74
- print(f"📄 Found existing translation: {translation_file_path}")
75
- with open(translation_file_path, "r", encoding="utf-8") as f:
76
- existing_content = f.read()
77
- if existing_content.strip():
78
- existing_msg = f"♻️ **Existing translation loaded** (no tokens used)\n📁 **File:** `{file_path}`\n📅 **Loaded from:** `{translation_file_path}`\n💡 **To retranslate:** Check 'Force Retranslate' option."
79
- return existing_msg, existing_content
80
-
81
- # step 1. Get content from file path
82
- content = get_content(file_path, project)
83
- to_translate = preprocess_content(content)
84
-
85
- # step 2. Prepare prompt with docs content
86
- if lang == "ko":
87
- translation_lang = "Korean"
88
- to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
89
-
90
- print("to_translate_with_prompt:\n", to_translate_with_prompt)
91
-
92
- # step 3. Translate with LLM
93
- # TODO: part to hand off to the MCP client
94
- callback_result, translated_content = llm_translate(to_translate_with_prompt)
95
- print("translated_content:\n")
96
- print(translated_content)
97
- if translated_content.startswith("```md\n") and translated_content.endswith("```"):
98
- print("Detected fenced ```md block; stripping the fences")
99
- translated_content = translated_content[5:-3].strip()
100
- # step 4. Add scaffold to translation result
101
- translated_doc = fill_scaffold(content, to_translate, translated_content)
102
- print("translated_doc:\n")
103
- print(translated_doc)
104
- return callback_result, translated_doc
105
-
106
-
107
- def translate_docs_interactive(
108
- translate_lang: str, selected_files: list[list[str]], additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False
109
- ) -> tuple[str, str]:
110
- """Interactive translation function that processes files one by one.
111
-
112
- Args:
113
- translate_lang: Target language to translate
114
- selected_files: List of file paths to translate
115
- """
116
- # Extract file paths from the dataframe format
117
- file_paths = [row[0] for row in selected_files if row and len(row) > 0]
118
-
119
- # Start with the first file
120
- current_file = file_paths[0]
121
-
122
- callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction, project, force_retranslate)
123
-
124
- # Check if existing translation was loaded
125
- if isinstance(callback_result, str) and "Existing translation loaded" in callback_result:
126
- status = callback_result # Use the existing translation message
127
- else:
128
- if force_retranslate:
129
- status = f"🔄 **Force Retranslation completed**: `{current_file}` → `{translate_lang}`\n\n"
130
- else:
131
- status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
132
- status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
133
-
134
- print(callback_result)
135
- print(status)
136
-
137
- return status, translated_content
138
-
139
-
140
- def generate_github_pr(
141
- target_language: str,
142
- filepath: str,
143
- translated_content: str = None,
144
- github_config: dict = None,
145
- en_title: str = None,
146
- project: str = "transformers",
147
- ) -> str:
148
- """Generate a GitHub PR for translated documentation.
149
-
150
- Args:
151
- target_language: Target language for translation (e.g., "ko")
152
- filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
153
- translated_content: Translated content (if None, read from file)
154
- github_config: GitHub configuration dictionary
155
- en_title: English title for toctree mapping
156
-
157
- Returns:
158
- PR creation result message
159
- """
160
- if not GITHUB_PR_AVAILABLE:
161
- return "❌ GitHub PR Agent is not available. Please install required libraries."
162
-
163
- if not github_config:
164
- return "❌ GitHub configuration not provided. Please set up GitHub token, owner, and repository in Configuration panel."
165
-
166
- # Validate required configuration
167
- required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
168
- missing_fields = [
169
- field for field in required_fields if not github_config.get(field)
170
- ]
171
-
172
- if missing_fields:
173
- return f"❌ Missing required GitHub configuration: {', '.join(missing_fields)}\n\n💡 Go to Configuration panel and set:\n" + "\n".join([f" • {field}" for field in missing_fields])
174
-
175
- # Set token in environment for the agent.
176
- os.environ["GITHUB_TOKEN"] = github_config["token"]
177
-
178
- try:
179
- # Read translated content from file if not provided
180
- if translated_content is None:
181
- translation_file_path = (
182
- Path(__file__).resolve().parent.parent
183
- / f"translation_result/{filepath}"
184
- )
185
- if not translation_file_path.exists():
186
- return f"❌ Translation file not found: {translation_file_path}\n\n💡 Please complete translation first in Tab 2 for file: {filepath}"
187
-
188
- with open(translation_file_path, "r", encoding="utf-8") as f:
189
- translated_content = f.read()
190
-
191
- if not translated_content or not translated_content.strip():
192
- return f"❌ Translated content is empty for file: {filepath}\n\n💡 Please complete translation first in Tab 2."
193
-
194
- # Execute GitHub PR Agent
195
- # Get base repository from project config
196
- from translator.project_config import get_project_config
197
- project_config = get_project_config(project)
198
- base_repo_path = project_config.repo_url.replace("https://github.com/", "")
199
- base_owner, base_repo = base_repo_path.split("/")
200
-
201
- print(f"🚀 Starting GitHub PR creation...")
202
- print(f" 📁 File: {filepath}")
203
- print(f" 🌍 Language: {target_language}")
204
- print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
205
- print(f" 🏠 User Fork: {github_config['owner']}/{github_config['repo_name']}")
206
- print(f" 🎯 Base Repository: {base_owner}/{base_repo}")
207
-
208
- agent = GitHubPRAgent(
209
- user_owner=github_config["owner"],
210
- user_repo=github_config["repo_name"],
211
- base_owner=base_owner,
212
- base_repo=base_repo,
213
- )
214
- result = agent.run_translation_pr_workflow(
215
- reference_pr_url=github_config["reference_pr_url"],
216
- target_language=target_language,
217
- filepath=filepath,
218
- translated_doc=translated_content,
219
- base_branch=github_config.get("base_branch", "main"),
220
- )
221
- # TEST CODE
222
- # result = {
223
- # 'status': 'partial_success',
224
- # 'branch': 'ko-attention_interface',
225
- # 'file_path': 'docs/source/ko/attention_interface.md',
226
- # 'message': 'File was saved and commit was successful.\nPR creation failed: ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1', 'error_details': 'ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1'
227
- # }
228
- # Process toctree update after successful translation PR
229
- toctree_result = None
230
- if en_title:
231
- from agent.toctree_handler import TocTreeHandler
232
- toctree_handler = TocTreeHandler(project)
233
- toctree_result = toctree_handler.update_toctree_after_translation(
234
- result, filepath, agent, github_config, project
235
- )
236
-
237
- # Process result
238
- # Generate toctree status message (shared for both success and partial_success)
239
- toctree_status = ""
240
- if toctree_result:
241
- if toctree_result["status"] == "success":
242
- toctree_status = f"\n📋 **Toctree Updated:** ✅ {toctree_result['message']}"
243
- else:
244
- toctree_status = f"\n📋 **Toctree Update Failed:** ❌ {toctree_result['message']}"
245
-
246
- # Append full result JSON to dedicated GitHub logging repository (always)
247
- try:
248
- log_data = result.copy()
249
- if toctree_result:
250
- log_data["toctree_result"] = toctree_result
251
- log_entry = json.dumps(log_data, ensure_ascii=False) + "\n"
252
- log_res = GitHubLogger().append_jsonl(log_entry)
253
- print(f"📝 Log append result: {log_res}")
254
- except Exception as e:
255
- print(f"❌ Failed to append PR log via GitHub API: {e}")
256
-
257
- if result["status"] == "success":
258
- return f"""✅ **GitHub PR Creation Successful!**
259
-
260
- 🔗 **PR URL:** {result.get('pr_url', 'NO_PR_URL')}
261
- 🌿 **Branch:** {result["branch"]}
262
- 📁 **File:** {result["file_path"]}{toctree_status}
263
-
264
- {result["message"]}"""
265
-
266
- elif result["status"] == "partial_success":
267
- error_details = result.get("error_details", "Unknown error")
268
-
269
- # Check if it's "existing PR" case (not really an error)
270
- if "Existing PR found" in error_details:
271
- existing_pr_url = error_details.split(": ")[-1] if ": " in error_details else "Unknown"
272
- return f"""🔄 **Translation Updated Successfully**
273
-
274
- 🎯 **Selected Project:** {project}
275
- 🌿 **Branch:** {result["branch"]}
276
- 📁 **File:** {result["file_path"]}{toctree_status}
277
-
278
- 🔗 **Existing PR Updated:** {existing_pr_url}
279
-
280
- ✅ Your translation has been added to the existing PR. The file and toctree have been successfully updated!"""
281
- else:
282
- # Actual error case
283
- return f"""⚠️ **Partial Success**
284
-
285
- 🎯 **Selected Project:** {project}
286
- 🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
287
- 🎯 **Target Base:** {base_owner}/{base_repo}
288
- 🌿 **Branch:** {result["branch"]}
289
- 📁 **File:** {result["file_path"]}{toctree_status}
290
-
291
- {result["message"]}
292
-
293
- **Error Details:**
294
- {error_details}
295
-
296
- 💡 **Project-Repository Mismatch Check:**
297
- - Selected project '{project}' should match repository '{github_config.get('repo_name', 'REPO')}'
298
- - For smolagents: use Jwaminju/smolagents fork
299
- - For transformers: use Jwaminju/transformers fork"""
300
-
301
- else:
302
- error_details = result.get("error_details", "No additional details")
303
- return f"""❌ **GitHub PR Creation Failed**
304
-
305
- 🎯 **Selected Project:** {project}
306
- 🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
307
- 🎯 **Target Base:** {base_owner}/{base_repo}
308
-
309
- **Error Message:**
310
- {result["message"]}
311
-
312
- **Error Details:**
313
- {error_details}
314
-
315
- 💡 **Project-Repository Mismatch:**
316
- Selected project '{project}' but configured repository '{github_config.get('repo_name', 'REPO')}'
317
- • For smolagents project: use 'smolagents' repository
318
- • For transformers project: use 'transformers' repository"""
319
-
320
- except Exception as e:
321
- error_msg = f"""❌ **Unexpected Error During PR Creation**
322
-
323
- **Error:** {str(e)}
324
-
325
- **Configuration:**
326
- • Project: {project}
327
- • File: {filepath}
328
- • Target: {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')} → {base_owner if 'base_owner' in locals() else 'BASE'}/{base_repo if 'base_repo' in locals() else 'REPO'}"""
329
- print(error_msg)
330
- return error_msg
331
-
332
-
333
- # Backward compatibility function (replaces old mock function)
334
- def mock_generate_PR():
335
- """Backward compatibility function - returns warning message only"""
336
- return (
337
- "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
338
- )
 
1
+ """Module for gradio interfaces."""
2
+
3
+ import os
4
+ from pathlib import Path
5
+ import gradio as gr
6
+
7
+ from translator.content import (
8
+ fill_scaffold,
9
+ get_content,
10
+ get_full_prompt,
11
+ llm_translate,
12
+ preprocess_content,
13
+ )
14
+ from translator.retriever import report, get_github_issue_open_pr, get_github_repo_files
15
+ # GitHub PR Agent import
16
+ try:
17
+ from pr_generator.agent import GitHubPRAgent
18
+
19
+ GITHUB_PR_AVAILABLE = True
20
+ except ImportError as e:
21
+ print(f"⚠️ GitHub PR Agent is not available: {e}")
22
+ GITHUB_PR_AVAILABLE = False
23
+
24
+ import json
25
+ from logger.github_logger import GitHubLogger
26
+
27
+
28
+ def report_translation_target_files(
29
+ project: str, translate_lang: str, top_k: int = 1
30
+ ) -> tuple[str, list[list[str]]]:
31
+ """Return the top-k files that need translation, excluding files already in progress.
32
+
33
+ Args:
34
+ project: Project to translate (e.g., "transformers", "smolagents")
35
+ translate_lang: Target language to translate
36
+ top_k: Number of top-priority files to return for translation (default: 1)
37
+ """
38
+ # Get repo files once to avoid duplicate API calls
39
+ all_repo_files = get_github_repo_files(project)
40
+
41
+ # Get all available files for translation using the file list
42
+ all_status_report, all_filepath_list = report(project, translate_lang, top_k * 2, all_repo_files) # Get more to account for filtering
43
+
44
+ # Get files in progress using the same file list
45
+ docs_in_progress, pr_info_list = get_github_issue_open_pr(project, translate_lang, all_repo_files)
46
+
47
+ # Filter out files that are already in progress
48
+ available_files = [f for f in all_filepath_list if f not in docs_in_progress]
49
+
50
+ # Take only the requested number
51
+ filepath_list = available_files[:top_k]
52
+
53
+ # Build combined status report
54
+ status_report = all_status_report
55
+
56
+ if docs_in_progress:
57
+ status_report += f"\n\n🤖 Found {len(docs_in_progress)} files in progress for translation:"
58
+ for i, file in enumerate(docs_in_progress):
59
+ status_report += f"\n{i+1}. [`{file}`]({pr_info_list[i]})"
60
+ status_report += f"\n\n📋 Showing {len(filepath_list)} available files (excluding in-progress):"
61
+
62
+ return status_report, [[file] for file in filepath_list]
63
+
64
+
65
+ def translate_docs(lang: str, file_path: str, additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False) -> tuple[str, str]:
66
+ """Translate documentation."""
67
+ # Check if translation already exists (unless force retranslate is enabled)
68
+ translation_file_path = (
69
+ Path(__file__).resolve().parent.parent
70
+ / f"translation_result/{file_path}"
71
+ )
72
+
73
+ if not force_retranslate and translation_file_path.exists():
74
+ print(f"📄 Found existing translation: {translation_file_path}")
75
+ with open(translation_file_path, "r", encoding="utf-8") as f:
76
+ existing_content = f.read()
77
+ if existing_content.strip():
78
+ existing_msg = f"♻️ **Existing translation loaded** (no tokens used)\n📁 **File:** `{file_path}`\n📅 **Loaded from:** `{translation_file_path}`\n💡 **To retranslate:** Check 'Force Retranslate' option."
79
+ return existing_msg, existing_content
80
+
81
+ # step 1. Get content from file path
82
+ content = get_content(file_path, project)
83
+ to_translate = preprocess_content(content)
84
+
85
+ # step 2. Prepare prompt with docs content
86
+ if lang == "ko":
87
+ translation_lang = "Korean"
88
+ to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
89
+
90
+ print("to_translate_with_prompt:\n", to_translate_with_prompt)
91
+
92
+ # step 3. Translate with LLM
93
+ # TODO: part to hand off to the MCP client
94
+ callback_result, translated_content = llm_translate(to_translate_with_prompt)
95
+ print("translated_content:\n")
96
+ print(translated_content)
97
+ if translated_content.startswith("```md\n") and translated_content.endswith("```"):
98
+ print("Detected fenced ```md block; stripping the fences")
99
+ translated_content = translated_content[5:-3].strip()
100
+ # step 4. Add scaffold to translation result
101
+ translated_doc = fill_scaffold(content, to_translate, translated_content)
102
+ print("translated_doc:\n")
103
+ print(translated_doc)
104
+ return callback_result, translated_doc
105
+
106
+
107
+ def translate_docs_interactive(
108
+ translate_lang: str, selected_files: list[list[str]], additional_instruction: str = "", project: str = "transformers", force_retranslate: bool = False
109
+ ) -> tuple[str, str]:
110
+ """Interactive translation function that processes files one by one.
111
+
112
+ Args:
113
+ translate_lang: Target language to translate
114
+ selected_files: List of file paths to translate
115
+ """
116
+ # Extract file paths from the dataframe format
117
+ file_paths = [row[0] for row in selected_files if row and len(row) > 0]
118
+
119
+ # Start with the first file
120
+ current_file = file_paths[0]
121
+
122
+ callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction, project, force_retranslate)
123
+
124
+ # Check if existing translation was loaded
125
+ if isinstance(callback_result, str) and "Existing translation loaded" in callback_result:
126
+ status = callback_result # Use the existing translation message
127
+ else:
128
+ if force_retranslate:
129
+ status = f"🔄 **Force Retranslation completed**: `{current_file}` → `{translate_lang}`\n\n"
130
+ else:
131
+ status = f"✅ Translation completed: `{current_file}` → `{translate_lang}`\n\n"
132
+ status += f"💰 Used token and cost: \n```\n{callback_result}\n```"
133
+
134
+ print(callback_result)
135
+ print(status)
136
+
137
+ return status, translated_content
138
+
139
+
140
+ def generate_github_pr(
141
+ target_language: str,
142
+ filepath: str,
143
+ translated_content: str = None,
144
+ github_config: dict = None,
145
+ en_title: str = None,
146
+ project: str = "transformers",
147
+ ) -> str:
148
+ """Generate a GitHub PR for translated documentation.
149
+
150
+ Args:
151
+ target_language: Target language for translation (e.g., "ko")
152
+ filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
153
+ translated_content: Translated content (if None, read from file)
154
+ github_config: GitHub configuration dictionary
155
+ en_title: English title for toctree mapping
156
+
157
+ Returns:
158
+ PR creation result message
159
+ """
160
+ if not GITHUB_PR_AVAILABLE:
161
+ return "❌ GitHub PR Agent is not available. Please install required libraries."
162
+
163
+ if not github_config:
164
+ return "❌ GitHub configuration not provided. Please set up GitHub token, owner, and repository in Configuration panel."
165
+
166
+ # Validate required configuration
167
+ required_fields = ["token", "owner", "repo_name", "reference_pr_url"]
168
+ missing_fields = [
169
+ field for field in required_fields if not github_config.get(field)
170
+ ]
171
+
172
+ if missing_fields:
173
+ return f"❌ Missing required GitHub configuration: {', '.join(missing_fields)}\n\n💡 Go to Configuration panel and set:\n" + "\n".join([f" • {field}" for field in missing_fields])
174
+
175
+ # Set token in environment for the agent.
176
+ os.environ["GITHUB_TOKEN"] = github_config["token"]
177
+
178
+ try:
179
+ # Read translated content from file if not provided
180
+ if translated_content is None:
181
+ translation_file_path = (
182
+ Path(__file__).resolve().parent.parent
183
+ / f"translation_result/{filepath}"
184
+ )
185
+ if not translation_file_path.exists():
186
+ return f"❌ Translation file not found: {translation_file_path}\n\n💡 Please complete translation first in Tab 2 for file: {filepath}"
187
+
188
+ with open(translation_file_path, "r", encoding="utf-8") as f:
189
+ translated_content = f.read()
190
+
191
+ if not translated_content or not translated_content.strip():
192
+ return f"❌ Translated content is empty for file: {filepath}\n\n💡 Please complete translation first in Tab 2."
193
+
194
+ # Execute GitHub PR Agent
195
+ # Get base repository from project config
196
+ from translator.project_config import get_project_config
197
+ project_config = get_project_config(project)
198
+ base_repo_path = project_config.repo_url.replace("https://github.com/", "")
199
+ base_owner, base_repo = base_repo_path.split("/")
200
+
201
+ print(f"🚀 Starting GitHub PR creation...")
202
+ print(f" 📁 File: {filepath}")
203
+ print(f" 🌍 Language: {target_language}")
204
+ print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
205
+ print(f" 🏠 User Fork: {github_config['owner']}/{github_config['repo_name']}")
206
+ print(f" 🎯 Base Repository: {base_owner}/{base_repo}")
207
+
208
+ agent = GitHubPRAgent(
209
+ user_owner=github_config["owner"],
210
+ user_repo=github_config["repo_name"],
211
+ base_owner=base_owner,
212
+ base_repo=base_repo,
213
+ )
214
+ result = agent.run_translation_pr_workflow(
215
+ reference_pr_url=github_config["reference_pr_url"],
216
+ target_language=target_language,
217
+ filepath=filepath,
218
+ translated_doc=translated_content,
219
+ base_branch=github_config.get("base_branch", "main"),
220
+ )
221
+ # TEST CODE
222
+ # result = {
223
+ # 'status': 'partial_success',
224
+ # 'branch': 'ko-attention_interface',
225
+ # 'file_path': 'docs/source/ko/attention_interface.md',
226
+ # 'message': 'File was saved and commit was successful.\nPR creation failed: ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1', 'error_details': 'ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1'
227
+ # }
228
+ # Process toctree update after successful translation PR
229
+ toctree_result = None
230
+ if en_title:
231
+ from agent.toctree_handler import TocTreeHandler
232
+ toctree_handler = TocTreeHandler(project)
233
+ toctree_result = toctree_handler.update_toctree_after_translation(
234
+ result, filepath, agent, github_config, project
235
+ )
236
+
237
+ # Process result
238
+ # Generate toctree status message (shared for both success and partial_success)
239
+ toctree_status = ""
240
+ if toctree_result:
241
+ if toctree_result["status"] == "success":
242
+ toctree_status = f"\n📋 **Toctree Updated:** ✅ {toctree_result['message']}"
243
+ else:
244
+ toctree_status = f"\n📋 **Toctree Update Failed:** ❌ {toctree_result['message']}"
245
+
246
+ # Append full result JSON to dedicated GitHub logging repository (always)
247
+ try:
248
+ log_data = result.copy()
249
+ if toctree_result:
250
+ log_data["toctree_result"] = toctree_result
251
+ log_entry = json.dumps(log_data, ensure_ascii=False) + "\n"
252
+ log_res = GitHubLogger().append_jsonl(log_entry)
253
+ print(f"📝 Log append result: {log_res}")
254
+ except Exception as e:
255
+ print(f"❌ Failed to append PR log via GitHub API: {e}")
256
+
257
+ if result["status"] == "success":
258
+ return f"""✅ **GitHub PR Creation Successful!**
259
+
260
+ 🔗 **PR URL:** {result.get('pr_url', 'NO_PR_URL')}
261
+ 🌿 **Branch:** {result["branch"]}
262
+ 📁 **File:** {result["file_path"]}{toctree_status}
263
+
264
+ {result["message"]}"""
265
+
266
+ elif result["status"] == "partial_success":
267
+ error_details = result.get("error_details", "Unknown error")
268
+
269
+ # Check if it's "existing PR" case (not really an error)
270
+ if "Existing PR found" in error_details:
271
+ existing_pr_url = error_details.split(": ")[-1] if ": " in error_details else "Unknown"
272
+ return f"""🔄 **Translation Updated Successfully**
273
+
274
+ 🎯 **Selected Project:** {project}
275
+ 🌿 **Branch:** {result["branch"]}
276
+ 📁 **File:** {result["file_path"]}{toctree_status}
277
+
278
+ 🔗 **Existing PR Updated:** {existing_pr_url}
279
+
280
+ ✅ Your translation has been added to the existing PR. The file and toctree have been successfully updated!"""
281
+ else:
282
+ # Actual error case
283
+ return f"""⚠️ **Partial Success**
284
+
285
+ 🎯 **Selected Project:** {project}
286
+ 🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
287
+ 🎯 **Target Base:** {base_owner}/{base_repo}
288
+ 🌿 **Branch:** {result["branch"]}
289
+ 📁 **File:** {result["file_path"]}{toctree_status}
290
+
291
+ {result["message"]}
292
+
293
+ **Error Details:**
294
+ {error_details}
295
+
296
+ 💡 **Project-Repository Mismatch Check:**
297
+ - Selected project '{project}' should match repository '{github_config.get('repo_name', 'REPO')}'
298
+ - For smolagents: use Jwaminju/smolagents fork
299
+ - For transformers: use Jwaminju/transformers fork"""
300
+
301
+ else:
302
+ error_details = result.get("error_details", "No additional details")
303
+ return f"""❌ **GitHub PR Creation Failed**
304
+
305
+ 🎯 **Selected Project:** {project}
306
+ 🏠 **User Fork:** {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')}
307
+ 🎯 **Target Base:** {base_owner}/{base_repo}
308
+
309
+ **Error Message:**
310
+ {result["message"]}
311
+
312
+ **Error Details:**
313
+ {error_details}
314
+
315
+ 💡 **Project-Repository Mismatch:**
316
+ Selected project '{project}' but configured repository '{github_config.get('repo_name', 'REPO')}'
317
+ • For smolagents project: use 'smolagents' repository
318
+ • For transformers project: use 'transformers' repository"""
319
+
320
+ except Exception as e:
321
+ error_msg = f"""❌ **Unexpected Error During PR Creation**
322
+
323
+ **Error:** {str(e)}
324
+
325
+ **Configuration:**
326
+ • Project: {project}
327
+ • File: {filepath}
328
+ • Target: {github_config.get('owner', 'USER')}/{github_config.get('repo_name', 'REPO')} → {base_owner if 'base_owner' in locals() else 'BASE'}/{base_repo if 'base_repo' in locals() else 'REPO'}"""
329
+ print(error_msg)
330
+ return error_msg
331
+
332
+
333
+ # Backward compatibility function (replaces old mock function)
334
+ def mock_generate_PR():
335
+ """Backward compatibility function - returns warning message only"""
336
+ return (
337
+ "⚠️ mock_generate_PR() is deprecated. Please use generate_github_pr() instead."
338
+ )
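
The functions in `agent/workflow.py` are wired to Gradio handlers, but they can also be chained directly: `report_translation_target_files` yields candidate paths in the `[[path], ...]` shape, `translate_docs_interactive` translates the first one, and `generate_github_pr` opens the PR. A minimal sketch, assuming the Anthropic/Bedrock credentials and `GITHUB_TOKEN` are already set and using placeholder values for the GitHub config:

    # Sketch: driving the workflow module outside the Gradio UI (placeholder values).
    from agent.workflow import (
        report_translation_target_files,
        translate_docs_interactive,
        generate_github_pr,
    )

    status, files = report_translation_target_files("transformers", "ko", top_k=1)
    print(status)   # Markdown status report
    print(files)    # e.g. [["docs/source/en/some_doc.md"]]

    # selected_files uses the same [[path], ...] shape the UI dataframe provides
    result_msg, translated_doc = translate_docs_interactive("ko", files)

    github_config = {
        "token": "<github token>",
        "owner": "<your-username>",
        "repo_name": "transformers",
        "reference_pr_url": "<reference pr url>",
    }
    print(generate_github_pr("ko", files[0][0], translated_doc, github_config))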
app.py CHANGED
@@ -1,379 +1,379 @@
1
- """Module for gradio chat-based translation agent interface."""
2
-
3
- import base64
4
- import os
5
-
6
- import gradio as gr
7
- from dotenv import load_dotenv
8
-
9
- from agent.handler import (
10
- approve_handler,
11
- confirm_and_go_translate_handler,
12
- confirm_translation_and_go_upload_handler,
13
- get_welcome_message,
14
- process_file_search_handler,
15
- restart_handler,
16
- send_message,
17
- start_translate_handler,
18
- sync_language_displays,
19
- update_language_selection,
20
- update_project_selection,
21
- update_prompt_preview,
22
- update_status,
23
- update_github_config,
24
- update_persistent_config,
25
- )
26
- from translator.model import Languages
27
- from translator.project_config import get_available_projects
28
-
29
- load_dotenv()
30
-
31
-
32
- css = """
33
- .gradio-container {
34
- background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
35
- }
36
- .chat-container {
37
- background: rgba(255, 255, 180, 0.25);
38
- border-radius: 18px;
39
- box-shadow: 0 4px 24px rgba(0,0,0,0.08);
40
- padding: 1.0em;
41
- backdrop-filter: blur(8px);
42
- border: 1px solid rgba(255,255,180,0.25);
43
- width: 100%;
44
- height: 100%;
45
- }
46
- .control-panel {
47
- background: rgba(255, 255, 180, 0.25);
48
- border-radius: 18px;
49
- box-shadow: 0 4px 24px rgba(0,0,0,0.08);
50
- padding: 1.0em;
51
- backdrop-filter: blur(8px);
52
- border: 1px solid rgba(255,255,180,0.25);
53
- width: 100%;
54
- overflow: visible !important;
55
-
56
- }
57
- .status-card {
58
- width: 100%
59
- }
60
- .action-button {
61
- background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
62
- color: white !important;
63
- border: none !important;
64
- font-weight: 600 !important;
65
- box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
66
- transition: all 0.3s ease-in-out !important;
67
- }
68
- .action-button:hover {
69
- background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
70
- box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
71
- transform: translateY(-2px) !important;
72
- }
73
-
74
- .simple-tabs .tab-nav button {
75
- background: transparent !important;
76
- color: #4A5568 !important;
77
- box-shadow: none !important;
78
- transform: none !important;
79
- border: none !important;
80
- border-bottom: 2px solid #E2E8F0 !important;
81
- border-radius: 0 !important;
82
- font-weight: 600 !important;
83
- }
84
-
85
- .simple-tabs .tab-nav button.selected {
86
- color: #f97316 !important;
87
- border-bottom: 2px solid #f97316 !important;
88
- }
89
-
90
- .simple-tabs .tab-nav button:hover {
91
- background: #f3f4f6 !important;
92
- color: #f97316 !important;
93
- box-shadow: none !important;
94
- transform: none !important;
95
- }
96
- """
97
-
98
-
99
- # Create the main interface
100
- with gr.Blocks(
101
- css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
102
- ) as demo:
103
- # Title
104
- with open("images/hfkr_logo.png", "rb") as img_file:
105
- base64_img = base64.b64encode(img_file.read()).decode()
106
- gr.Markdown(
107
- f'<img src="data:image/png;base64,{base64_img}" style="display: block; margin-left: auto; margin-right: auto; height: 15em;"/>'
108
- )
109
- gr.Markdown(
110
- '<h1 style="text-align: center;"> 🌐 Hugging Face Transformers Docs i18n made easy</h1>'
111
- )
112
-
113
- # Content
114
- with gr.Row():
115
- # Chat interface
116
- with gr.Column(scale=3, elem_classes=["chat-container"]):
117
- gr.Markdown("### 🌐 Hugging Face i18n Agent")
118
-
119
- chatbot = gr.Chatbot(
120
- value=[[None, get_welcome_message()]], scale=1, height=525,
121
- show_copy_button=True
122
- )
123
-
124
- # Chat input directly under main chat
125
- gr.Markdown("### 💬 Chat with agent")
126
- with gr.Row():
127
- msg_input = gr.Textbox(
128
- placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
129
- container=False,
130
- scale=4,
131
- )
132
- send_btn = gr.Button("Send", scale=1, elem_classes="action-button")
133
-
134
- # Controller interface
135
- with gr.Column(scale=2):
136
- # Configuration Panel
137
- with gr.Column(elem_classes=["control-panel"]):
138
- gr.Markdown("### ⚙️ Configuration")
139
-
140
- with gr.Accordion("🔧 API & GitHub Settings", open=True):
141
- api_provider_radio = gr.Radio(
142
- ["Anthropic", "AWS Bedrock"],
143
- label="Select API Provider",
144
- value="Anthropic", # Default selection
145
- interactive=True,
146
- )
147
- config_anthropic_key = gr.Textbox(
148
- label="🔑 Anthropic API Key",
149
- type="password",
150
- placeholder="sk-ant-...",
151
- visible=True, # Initially visible as Anthropic is default
152
- )
153
- config_aws_bearer_token_bedrock = gr.Textbox(
154
- label="🔑 AWS Bearer Token for Bedrock",
155
- type="password",
156
- placeholder="AWS_BEARER_TOKEN_BEDROCK",
157
- visible=False, # Initially hidden
158
- )
159
- config_github_token = gr.Textbox(
160
- label="🔑 GitHub Token (Required for PR, Optional for file search)",
161
- type="password",
162
- placeholder="ghp_...",
163
- )
164
-
165
- with gr.Row():
166
- config_github_owner = gr.Textbox(
167
- label="👤 GitHub Owner",
168
- placeholder="your-username",
169
- scale=1,
170
- )
171
- config_github_repo = gr.Textbox(
172
- label="📁 Repository Name",
173
- placeholder="your-repository",
174
- scale=1,
175
- )
176
-
177
- save_config_btn = gr.Button(
178
- "💾 Save Configuration", elem_classes="action-button"
179
- )
180
-
181
- # Quick Controller
182
- with gr.Column(elem_classes=["control-panel"]):
183
- gr.Markdown("### 🛠️ Quick Controls")
184
- status_display = gr.HTML(update_status())
185
-
186
- with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
187
- with gr.TabItem("1. Find Files", id=0):
188
- with gr.Group():
189
- project_dropdown = gr.Radio(
190
- choices=get_available_projects(),
191
- label="🎯 Select Project",
192
- value="transformers",
193
- )
194
- lang_dropdown = gr.Radio(
195
- choices=[language.value for language in Languages],
196
- label="🌍 Translate To",
197
- value="ko",
198
- )
199
- k_input = gr.Number(
200
- label="📊 First k missing translated docs",
201
- value=10,
202
- minimum=1,
203
- )
204
- find_btn = gr.Button(
205
- "🔍 Find Files to Translate",
206
- elem_classes="action-button",
207
- )
208
-
209
- confirm_go_btn = gr.Button(
210
- "✅ Confirm Selection & Go to Translate",
211
- elem_classes="action-button",
212
- )
213
-
214
- with gr.TabItem("2. Translate", id=1):
215
- with gr.Group():
216
- files_to_translate = gr.Radio(
217
- choices=[],
218
- label="📄 Select a file to translate",
219
- interactive=True,
220
- value=None,
221
- )
222
- file_to_translate_input = gr.Textbox(
223
- label="🌍 Select in the dropdown or write the file path to translate",
224
- value="",
225
- )
226
-
227
- translate_lang_display = gr.Dropdown(
228
- choices=[language.value for language in Languages],
229
- label="🌍 Translation Language",
230
- value="ko",
231
- interactive=False,
232
- )
233
- additional_instruction = gr.Textbox(
234
- label="📝 Additional instructions (Optional - e.g., custom glossary)",
235
- placeholder="Example: Translate 'model' as '모델' consistently",
236
- lines=2,
237
- )
238
-
239
- force_retranslate = gr.Checkbox(
240
- label="🔄 Force Retranslate (ignore existing translations)",
241
- value=False,
242
- )
243
-
244
- with gr.Accordion("🔍 Preview Translation Prompt", open=False):
245
- prompt_preview = gr.Textbox(
246
- lines=8,
247
- interactive=False,
248
- placeholder="Select a file and language to see the prompt preview...",
249
- show_copy_button=True,
250
- )
251
-
252
- start_translate_btn = gr.Button(
253
- "🚀 Start Translation", elem_classes="action-button"
254
- )
255
-
256
- confirm_upload_btn = gr.Button(
257
- "✅ Confirm Translation & Upload PR",
258
- elem_classes="action-button",
259
- visible=False,
260
- )
261
-
262
- with gr.TabItem("3. Upload PR", id=2):
263
- with gr.Group():
264
- reference_pr_url = gr.Textbox(
265
- label="🔗 Reference PR URL (Optional)",
266
- placeholder="Auto-filled based on project selection",
267
- )
268
- approve_btn = gr.Button(
269
- "✅ Generate GitHub PR", elem_classes="action-button"
270
- )
271
- restart_btn = gr.Button(
272
- "🔄 Restart Translation", elem_classes="action-button"
273
- )
274
-
275
- # Event Handlers
276
-
277
- find_btn.click(
278
- fn=process_file_search_handler,
279
- inputs=[project_dropdown, lang_dropdown, k_input, chatbot],
280
- outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
281
- )
282
-
283
- confirm_go_btn.click(
284
- fn=confirm_and_go_translate_handler,
285
- inputs=[chatbot],
286
- outputs=[chatbot, msg_input, status_display, control_tabs],
287
- )
288
-
289
- # Auto-save selections to state and update prompt preview
290
- project_dropdown.change(
291
- fn=update_project_selection,
292
- inputs=[project_dropdown, chatbot],
293
- outputs=[chatbot, msg_input, status_display],
294
- )
295
-
296
- # Update prompt preview when project changes
297
- project_dropdown.change(
298
- fn=update_prompt_preview,
299
- inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
300
- outputs=[prompt_preview],
301
- )
302
-
303
- lang_dropdown.change(
304
- fn=update_language_selection,
305
- inputs=[lang_dropdown, chatbot],
306
- outputs=[chatbot, msg_input, status_display, translate_lang_display],
307
- )
308
-
309
- # Sync the selected radio choice into the file path textbox
310
- files_to_translate.change(
311
- fn=lambda x: x,
312
- inputs=[files_to_translate],
313
- outputs=[file_to_translate_input],
314
- )
315
-
316
- # Button event handlers
317
- start_translate_btn.click(
318
- fn=start_translate_handler,
319
- inputs=[chatbot, file_to_translate_input, additional_instruction, force_retranslate],
320
- outputs=[chatbot, msg_input, status_display, control_tabs, start_translate_btn, confirm_upload_btn],
321
- )
322
-
323
- confirm_upload_btn.click(
324
- fn=confirm_translation_and_go_upload_handler,
325
- inputs=[chatbot],
326
- outputs=[chatbot, msg_input, status_display, control_tabs],
327
- )
328
-
329
- # Configuration Save
330
- save_config_btn.click(
331
- fn=update_persistent_config,
332
- inputs=[api_provider_radio, config_anthropic_key, config_aws_bearer_token_bedrock, config_github_token, config_github_owner, config_github_repo, reference_pr_url, chatbot],
333
- outputs=[chatbot, msg_input, status_display],
334
- )
335
-
336
- # API Provider selection handler
337
- api_provider_radio.change(
338
- fn=lambda provider: (
339
- gr.update(visible=True) if provider == "Anthropic" else gr.update(visible=False),
340
- gr.update(visible=True) if provider == "AWS Bedrock" else gr.update(visible=False),
341
- ),
342
- inputs=[api_provider_radio],
343
- outputs=[config_anthropic_key, config_aws_bearer_token_bedrock],
344
- )
345
-
346
- approve_btn.click(
347
- fn=approve_handler,
348
- inputs=[chatbot, config_github_owner, config_github_repo, reference_pr_url],
349
- outputs=[chatbot, msg_input, status_display],
350
- )
351
-
352
- restart_btn.click(
353
- fn=restart_handler,
354
- inputs=[chatbot],
355
- outputs=[chatbot, msg_input, status_display, control_tabs],
356
- )
357
-
358
- send_btn.click(
359
- fn=send_message,
360
- inputs=[msg_input, chatbot],
361
- outputs=[chatbot, msg_input, status_display],
362
- )
363
-
364
- msg_input.submit(
365
- fn=send_message,
366
- inputs=[msg_input, chatbot],
367
- outputs=[chatbot, msg_input, status_display],
368
- )
369
-
370
- # Update prompt preview when inputs change
371
- for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
372
- input_component.change(
373
- fn=update_prompt_preview,
374
- inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
375
- outputs=[prompt_preview],
376
- )
377
-
378
- root_path = os.environ.get("GRADIO_ROOT_PATH")
379
- demo.launch(root_path=root_path)
 
1
+ """Module for gradio chat-based translation agent interface."""
2
+
3
+ import base64
4
+ import os
5
+
6
+ import gradio as gr
7
+ from dotenv import load_dotenv
8
+
9
+ from agent.handler import (
10
+ approve_handler,
11
+ confirm_and_go_translate_handler,
12
+ confirm_translation_and_go_upload_handler,
13
+ get_welcome_message,
14
+ process_file_search_handler,
15
+ restart_handler,
16
+ send_message,
17
+ start_translate_handler,
18
+ sync_language_displays,
19
+ update_language_selection,
20
+ update_project_selection,
21
+ update_prompt_preview,
22
+ update_status,
23
+ update_github_config,
24
+ update_persistent_config,
25
+ )
26
+ from translator.model import Languages
27
+ from translator.project_config import get_available_projects
28
+
29
+ load_dotenv()
30
+
31
+
32
+ css = """
33
+ .gradio-container {
34
+ background: linear-gradient(135deg, #ffeda7 0%, #ffbebf 100%);
35
+ }
36
+ .chat-container {
37
+ background: rgba(255, 255, 180, 0.25);
38
+ border-radius: 18px;
39
+ box-shadow: 0 4px 24px rgba(0,0,0,0.08);
40
+ padding: 1.0em;
41
+ backdrop-filter: blur(8px);
42
+ border: 1px solid rgba(255,255,180,0.25);
43
+ width: 100%;
44
+ height: 100%;
45
+ }
46
+ .control-panel {
47
+ background: rgba(255, 255, 180, 0.25);
48
+ border-radius: 18px;
49
+ box-shadow: 0 4px 24px rgba(0,0,0,0.08);
50
+ padding: 1.0em;
51
+ backdrop-filter: blur(8px);
52
+ border: 1px solid rgba(255,255,180,0.25);
53
+ width: 100%;
54
+ overflow: visible !important;
55
+
56
+ }
57
+ .status-card {
58
+ width: 100%
59
+ }
60
+ .action-button {
61
+ background: linear-gradient(135deg, #ff8c8c 0%, #f9a889 100%) !important;
62
+ color: white !important;
63
+ border: none !important;
64
+ font-weight: 600 !important;
65
+ box-shadow: 0 4px 12px rgba(0,0,0,0.15) !important;
66
+ transition: all 0.3s ease-in-out !important;
67
+ }
68
+ .action-button:hover {
69
+ background: linear-gradient(135deg, #f9a889 0%, #ff8c8c 100%) !important;
70
+ box-shadow: 0 6px 16px rgba(0,0,0,0.2) !important;
71
+ transform: translateY(-2px) !important;
72
+ }
73
+
74
+ .simple-tabs .tab-nav button {
75
+ background: transparent !important;
76
+ color: #4A5568 !important;
77
+ box-shadow: none !important;
78
+ transform: none !important;
79
+ border: none !important;
80
+ border-bottom: 2px solid #E2E8F0 !important;
81
+ border-radius: 0 !important;
82
+ font-weight: 600 !important;
83
+ }
84
+
85
+ .simple-tabs .tab-nav button.selected {
86
+ color: #f97316 !important;
87
+ border-bottom: 2px solid #f97316 !important;
88
+ }
89
+
90
+ .simple-tabs .tab-nav button:hover {
91
+ background: #f3f4f6 !important;
92
+ color: #f97316 !important;
93
+ box-shadow: none !important;
94
+ transform: none !important;
95
+ }
96
+ """
97
+
98
+
99
+ # Create the main interface
100
+ with gr.Blocks(
101
+ css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
102
+ ) as demo:
103
+ # Title
104
+ with open("images/hfkr_logo.png", "rb") as img_file:
105
+ base64_img = base64.b64encode(img_file.read()).decode()
106
+ gr.Markdown(
107
+ f'<img src="data:image/png;base64,{base64_img}" style="display: block; margin-left: auto; margin-right: auto; height: 15em;"/>'
108
+ )
109
+ gr.Markdown(
110
+ '<h1 style="text-align: center;"> 🌐 Hugging Face Transformers Docs i18n made easy</h1>'
111
+ )
112
+
113
+ # Content
114
+ with gr.Row():
115
+ # Chat interface
116
+ with gr.Column(scale=3, elem_classes=["chat-container"]):
117
+ gr.Markdown("### 🌐 Hugging Face i18n Agent")
118
+
119
+ chatbot = gr.Chatbot(
120
+ value=[[None, get_welcome_message()]], scale=1, height=525,
121
+ show_copy_button=True
122
+ )
123
+
124
+ # Chat input directly under main chat
125
+ gr.Markdown("### 💬 Chat with agent")
126
+ with gr.Row():
127
+ msg_input = gr.Textbox(
128
+ placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
129
+ container=False,
130
+ scale=4,
131
+ )
132
+ send_btn = gr.Button("Send", scale=1, elem_classes="action-button")
133
+
134
+ # Controller interface
135
+ with gr.Column(scale=2):
136
+ # Configuration Panel
137
+ with gr.Column(elem_classes=["control-panel"]):
138
+ gr.Markdown("### ⚙️ Configuration")
139
+
140
+ with gr.Accordion("🔧 API & GitHub Settings", open=True):
141
+ api_provider_radio = gr.Radio(
142
+ ["Anthropic", "AWS Bedrock"],
143
+ label="Select API Provider",
144
+ value="Anthropic", # Default selection
145
+ interactive=True,
146
+ )
147
+ config_anthropic_key = gr.Textbox(
148
+ label="🔑 Anthropic API Key",
149
+ type="password",
150
+ placeholder="sk-ant-...",
151
+ visible=True, # Initially visible as Anthropic is default
152
+ )
153
+ config_aws_bearer_token_bedrock = gr.Textbox(
154
+ label="🔑 AWS Bearer Token for Bedrock",
155
+ type="password",
156
+ placeholder="AWS_BEARER_TOKEN_BEDROCK",
157
+ visible=False, # Initially hidden
158
+ )
159
+ config_github_token = gr.Textbox(
160
+ label="🔑 GitHub Token (Required for PR, Optional for file search)",
161
+ type="password",
162
+ placeholder="ghp_...",
163
+ )
164
+
165
+ with gr.Row():
166
+ config_github_owner = gr.Textbox(
167
+ label="👤 GitHub Owner",
168
+ placeholder="your-username",
169
+ scale=1,
170
+ )
171
+ config_github_repo = gr.Textbox(
172
+ label="📁 Repository Name",
173
+ placeholder="your-repository",
174
+ scale=1,
175
+ )
176
+
177
+ save_config_btn = gr.Button(
178
+ "💾 Save Configuration", elem_classes="action-button"
179
+ )
180
+
181
+ # Quick Controller
182
+ with gr.Column(elem_classes=["control-panel"]):
183
+ gr.Markdown("### 🛠️ Quick Controls")
184
+ status_display = gr.HTML(update_status())
185
+
186
+ with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
187
+ with gr.TabItem("1. Find Files", id=0):
188
+ with gr.Group():
189
+ project_dropdown = gr.Radio(
190
+ choices=get_available_projects(),
191
+ label="🎯 Select Project",
192
+ value="transformers",
193
+ )
194
+ lang_dropdown = gr.Radio(
195
+ choices=[language.value for language in Languages],
196
+ label="🌍 Translate To",
197
+ value="ko",
198
+ )
199
+ k_input = gr.Number(
200
+ label="📊 First k missing translated docs",
201
+ value=10,
202
+ minimum=1,
203
+ )
204
+ find_btn = gr.Button(
205
+ "🔍 Find Files to Translate",
206
+ elem_classes="action-button",
207
+ )
208
+
209
+ confirm_go_btn = gr.Button(
210
+ "✅ Confirm Selection & Go to Translate",
211
+ elem_classes="action-button",
212
+ )
213
+
214
+ with gr.TabItem("2. Translate", id=1):
215
+ with gr.Group():
216
+ files_to_translate = gr.Radio(
217
+ choices=[],
218
+ label="📄 Select a file to translate",
219
+ interactive=True,
220
+ value=None,
221
+ )
222
+ file_to_translate_input = gr.Textbox(
223
+ label="🌍 Select in the dropdown or write the file path to translate",
224
+ value="",
225
+ )
226
+
227
+ translate_lang_display = gr.Dropdown(
228
+ choices=[language.value for language in Languages],
229
+ label="🌍 Translation Language",
230
+ value="ko",
231
+ interactive=False,
232
+ )
233
+ additional_instruction = gr.Textbox(
234
+ label="📝 Additional instructions (Optional - e.g., custom glossary)",
235
+ placeholder="Example: Translate 'model' as '모델' consistently",
236
+ lines=2,
237
+ )
238
+
239
+ force_retranslate = gr.Checkbox(
240
+ label="🔄 Force Retranslate (ignore existing translations)",
241
+ value=False,
242
+ )
243
+
244
+ with gr.Accordion("🔍 Preview Translation Prompt", open=False):
245
+ prompt_preview = gr.Textbox(
246
+ lines=8,
247
+ interactive=False,
248
+ placeholder="Select a file and language to see the prompt preview...",
249
+ show_copy_button=True,
250
+ )
251
+
252
+ start_translate_btn = gr.Button(
253
+ "🚀 Start Translation", elem_classes="action-button"
254
+ )
255
+
256
+ confirm_upload_btn = gr.Button(
257
+ "✅ Confirm Translation & Upload PR",
258
+ elem_classes="action-button",
259
+ visible=False,
260
+ )
261
+
262
+ with gr.TabItem("3. Upload PR", id=2):
263
+ with gr.Group():
264
+ reference_pr_url = gr.Textbox(
265
+ label="🔗 Reference PR URL (Optional)",
266
+ placeholder="Auto-filled based on project selection",
267
+ )
268
+ approve_btn = gr.Button(
269
+ "✅ Generate GitHub PR", elem_classes="action-button"
270
+ )
271
+ restart_btn = gr.Button(
272
+ "🔄 Restart Translation", elem_classes="action-button"
273
+ )
274
+
275
+ # Event Handlers
276
+
277
+ find_btn.click(
278
+ fn=process_file_search_handler,
279
+ inputs=[project_dropdown, lang_dropdown, k_input, chatbot],
280
+ outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
281
+ )
282
+
283
+ confirm_go_btn.click(
284
+ fn=confirm_and_go_translate_handler,
285
+ inputs=[chatbot],
286
+ outputs=[chatbot, msg_input, status_display, control_tabs],
287
+ )
288
+
289
+ # Auto-save selections to state and update prompt preview
290
+ project_dropdown.change(
291
+ fn=update_project_selection,
292
+ inputs=[project_dropdown, chatbot],
293
+ outputs=[chatbot, msg_input, status_display],
294
+ )
295
+
296
+ # Update prompt preview when project changes
297
+ project_dropdown.change(
298
+ fn=update_prompt_preview,
299
+ inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
300
+ outputs=[prompt_preview],
301
+ )
302
+
303
+ lang_dropdown.change(
304
+ fn=update_language_selection,
305
+ inputs=[lang_dropdown, chatbot],
306
+ outputs=[chatbot, msg_input, status_display, translate_lang_display],
307
+ )
308
+
309
+ #
310
+ files_to_translate.change(
311
+ fn=lambda x: x,
312
+ inputs=[files_to_translate],
313
+ outputs=[file_to_translate_input],
314
+ )
315
+
316
+ # Button event handlers
317
+ start_translate_btn.click(
318
+ fn=start_translate_handler,
319
+ inputs=[chatbot, file_to_translate_input, additional_instruction, force_retranslate],
320
+ outputs=[chatbot, msg_input, status_display, control_tabs, start_translate_btn, confirm_upload_btn],
321
+ )
322
+
323
+ confirm_upload_btn.click(
324
+ fn=confirm_translation_and_go_upload_handler,
325
+ inputs=[chatbot],
326
+ outputs=[chatbot, msg_input, status_display, control_tabs],
327
+ )
328
+
329
+ # Configuration Save
330
+ save_config_btn.click(
331
+ fn=update_persistent_config,
332
+ inputs=[api_provider_radio, config_anthropic_key, config_aws_bearer_token_bedrock, config_github_token, config_github_owner, config_github_repo, reference_pr_url, chatbot],
333
+ outputs=[chatbot, msg_input, status_display],
334
+ )
335
+
336
+ # API Provider selection handler
337
+ api_provider_radio.change(
338
+ fn=lambda provider: (
339
+ gr.update(visible=True) if provider == "Anthropic" else gr.update(visible=False),
340
+ gr.update(visible=True) if provider == "AWS Bedrock" else gr.update(visible=False),
341
+ ),
342
+ inputs=[api_provider_radio],
343
+ outputs=[config_anthropic_key, config_aws_bearer_token_bedrock],
344
+ )
345
+
346
+ approve_btn.click(
347
+ fn=approve_handler,
348
+ inputs=[chatbot, config_github_owner, config_github_repo, reference_pr_url],
349
+ outputs=[chatbot, msg_input, status_display],
350
+ )
351
+
352
+ restart_btn.click(
353
+ fn=restart_handler,
354
+ inputs=[chatbot],
355
+ outputs=[chatbot, msg_input, status_display, control_tabs],
356
+ )
357
+
358
+ send_btn.click(
359
+ fn=send_message,
360
+ inputs=[msg_input, chatbot],
361
+ outputs=[chatbot, msg_input, status_display],
362
+ )
363
+
364
+ msg_input.submit(
365
+ fn=send_message,
366
+ inputs=[msg_input, chatbot],
367
+ outputs=[chatbot, msg_input, status_display],
368
+ )
369
+
370
+ # Update prompt preview when inputs change
371
+ for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
372
+ input_component.change(
373
+ fn=update_prompt_preview,
374
+ inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
375
+ outputs=[prompt_preview],
376
+ )
377
+
378
+ root_path = os.environ.get("GRADIO_ROOT_PATH")
379
+ demo.launch(root_path=root_path)
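
In `app.py`, the API-provider toggle is an inline lambda that flips the visibility of the two credential fields. A named equivalent is shown here purely as a readability sketch; its behavior matches the lambda above:

    import gradio as gr

    def toggle_provider_fields(provider: str):
        """Show the Anthropic key field for 'Anthropic', the Bedrock token field otherwise."""
        return (
            gr.update(visible=(provider == "Anthropic")),
            gr.update(visible=(provider == "AWS Bedrock")),
        )

    # api_provider_radio.change(
    #     fn=toggle_provider_fields,
    #     inputs=[api_provider_radio],
    #     outputs=[config_anthropic_key, config_aws_bearer_token_bedrock],
    # )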
config.py CHANGED
@@ -1,10 +1,10 @@
1
- # config.py
2
-
3
- # Default model list
4
- default_models = [
5
- "Helsinki-NLP/opus-mt-ko-en",
6
- "Helsinki-NLP/opus-mt-tc-big-en-ko",
7
- "davidkim205/iris-7b",
8
- "maywell/Synatra-7B-v0.3-Translation",
9
- "CUSTOM_MODEL_INPUT" # Placeholder for custom model input
10
  ]
 
1
+ # config.py
2
+
3
+ # Default model list
4
+ default_models = [
5
+ "Helsinki-NLP/opus-mt-ko-en",
6
+ "Helsinki-NLP/opus-mt-tc-big-en-ko",
7
+ "davidkim205/iris-7b",
8
+ "maywell/Synatra-7B-v0.3-Translation",
9
+ "CUSTOM_MODEL_INPUT" # Placeholder for custom model input
10
  ]
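
`config.py` keeps `"CUSTOM_MODEL_INPUT"` in the list as a placeholder rather than a real model id. A minimal sketch of one way such a placeholder could be resolved against user input; the helper name and behavior are assumptions for illustration, not part of the repository:

    from config import default_models

    def resolve_model_choice(choice: str, custom_model: str = "") -> str:
        # Hypothetical helper: swap the placeholder for a user-supplied model id.
        if choice == "CUSTOM_MODEL_INPUT":
            if not custom_model:
                raise ValueError("Provide a custom model id when CUSTOM_MODEL_INPUT is selected.")
            return custom_model
        return choice

    print(resolve_model_choice(default_models[0]))                        # unchanged
    print(resolve_model_choice("CUSTOM_MODEL_INPUT", "my-org/my-model"))  # custom id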
example.env CHANGED
@@ -1,18 +1,18 @@
1
- ANTHROPIC_API_KEY=<your api key>
2
-
3
- # GitHub PR Agent Configuration
4
- GITHUB_TOKEN=<your github token>
5
- GITHUB_OWNER=<your github username>
6
- GITHUB_REPO=<your repository name>
7
- REFERENCE_PR_URL=<reference pr url for style analysis>
8
-
9
- # Secrets for deployment to HF space
10
- HF_TOKEN=
11
- HF_USERNAME=
12
- HF_SPACE_NAME=
13
-
14
- # Secrets for logging to GitHub
15
- LOG_REPO=
16
- LOG_GITHUB_TOKEN=
17
- LOG_BRANCH=
18
- LOG_FILE_PATH=
 
1
+ ANTHROPIC_API_KEY=<your api key>
2
+
3
+ # GitHub PR Agent Configuration
4
+ GITHUB_TOKEN=<your github token>
5
+ GITHUB_OWNER=<your github username>
6
+ GITHUB_REPO=<your repository name>
7
+ REFERENCE_PR_URL=<reference pr url for style analysis>
8
+
9
+ # Secrets for deployment to HF space
10
+ HF_TOKEN=
11
+ HF_USERNAME=
12
+ HF_SPACE_NAME=
13
+
14
+ # Secrets for logging to GitHub
15
+ LOG_REPO=
16
+ LOG_GITHUB_TOKEN=
17
+ LOG_BRANCH=
18
+ LOG_FILE_PATH=
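
The variables in `example.env` fall into three groups: the Anthropic/GitHub settings read by the translation and PR code, the HF Space secrets used by the deploy workflow, and the `LOG_*` values consumed by `logger/github_logger.py`. A minimal sketch of how the logging group is read, mirroring the defaults in `GitHubLogger`:

    import os
    from dotenv import load_dotenv

    load_dotenv()  # copy example.env to .env and fill in the values first

    github_token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
    log_repo = os.environ.get("LOG_REPO")                          # "owner/repo"
    log_branch = os.environ.get("LOG_BRANCH", "log_event")         # default branch name
    log_path = os.environ.get("LOG_FILE_PATH", "pr_success.log")   # default log file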
logger/github_logger.py CHANGED
@@ -1,71 +1,71 @@
1
- import os
2
- import base64
3
- from typing import Optional
4
-
5
- try:
6
- from github import Github, GithubException
7
- LIBS_OK = True
8
- except ImportError:
9
- LIBS_OK = False
10
-
11
- class GitHubLogger:
12
- """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.
13
-
14
- Env vars:
15
- - LOG_GITHUB_TOKEN (fallback: GITHUB_TOKEN)
16
- - LOG_REPO (format: owner/repo)
17
- - LOG_BRANCH (default: 'log_event')
18
- - LOG_FILE_PATH (default: 'pr_success.log')
19
- """
20
-
21
- def __init__(self):
22
- if not LIBS_OK:
23
- raise ImportError("PyGithub not installed. Please install PyGithub.")
24
- token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
25
- if not token:
26
- raise ValueError("Missing LOG_GITHUB_TOKEN or GITHUB_TOKEN for logging")
27
- self._client = Github(token)
28
-
29
- repo_spec = os.environ.get("LOG_REPO")
30
- if not repo_spec or "/" not in repo_spec:
31
- raise ValueError("Missing or invalid LOG_REPO. Expected 'owner/repo'.")
32
- self.owner, self.repo_name = repo_spec.split("/", 1)
33
-
34
- self.branch = os.environ.get("LOG_BRANCH", "log_event")
35
- self.path = os.environ.get("LOG_FILE_PATH", "pr_success.log")
36
-
37
- def _ensure_branch(self, repo):
38
- try:
39
- repo.get_branch(self.branch)
40
- except GithubException as e:
41
- if e.status == 404:
42
- base = repo.get_branch(repo.default_branch)
43
- repo.create_git_ref(ref=f"refs/heads/{self.branch}", sha=base.commit.sha)
44
- else:
45
- raise
46
-
47
- def append_jsonl(self, jsonl_line: str, commit_message: str = "chore(log): append entry") -> str:
48
- repo = self._client.get_repo(f"{self.owner}/{self.repo_name}")
49
- self._ensure_branch(repo)
50
- try:
51
- existing = repo.get_contents(self.path, ref=self.branch)
52
- existing_content = base64.b64decode(existing.content).decode("utf-8")
53
- new_content = existing_content + jsonl_line
54
- repo.update_file(
55
- path=self.path,
56
- message=commit_message,
57
- content=new_content,
58
- sha=existing.sha,
59
- branch=self.branch,
60
- )
61
- return "SUCCESS: Log appended"
62
- except GithubException as e:
63
- if e.status == 404:
64
- repo.create_file(
65
- path=self.path,
66
- message=commit_message,
67
- content=jsonl_line,
68
- branch=self.branch,
69
- )
70
- return "SUCCESS: Log file created and first entry appended"
71
- raise
 
1
+ import os
2
+ import base64
3
+ from typing import Optional
4
+
5
+ try:
6
+ from github import Github, GithubException
7
+ LIBS_OK = True
8
+ except ImportError:
9
+ LIBS_OK = False
10
+
11
+ class GitHubLogger:
12
+ """Dedicated logger that appends JSONL entries to a GitHub repo/branch/file.
13
+
14
+ Env vars:
15
+ - LOG_GITHUB_TOKEN (fallback: GITHUB_TOKEN)
16
+ - LOG_REPO (format: owner/repo)
17
+ - LOG_BRANCH (default: 'log_event')
18
+ - LOG_FILE_PATH (default: 'pr_success.log')
19
+ """
20
+
21
+ def __init__(self):
22
+ if not LIBS_OK:
23
+ raise ImportError("PyGithub not installed. Please install PyGithub.")
24
+ token = os.environ.get("LOG_GITHUB_TOKEN") or os.environ.get("GITHUB_TOKEN")
25
+ if not token:
26
+ raise ValueError("Missing LOG_GITHUB_TOKEN or GITHUB_TOKEN for logging")
27
+ self._client = Github(token)
28
+
29
+ repo_spec = os.environ.get("LOG_REPO")
30
+ if not repo_spec or "/" not in repo_spec:
31
+ raise ValueError("Missing or invalid LOG_REPO. Expected 'owner/repo'.")
32
+ self.owner, self.repo_name = repo_spec.split("/", 1)
33
+
34
+ self.branch = os.environ.get("LOG_BRANCH", "log_event")
35
+ self.path = os.environ.get("LOG_FILE_PATH", "pr_success.log")
36
+
37
+ def _ensure_branch(self, repo):
38
+ try:
39
+ repo.get_branch(self.branch)
40
+ except GithubException as e:
41
+ if e.status == 404:
42
+ base = repo.get_branch(repo.default_branch)
43
+ repo.create_git_ref(ref=f"refs/heads/{self.branch}", sha=base.commit.sha)
44
+ else:
45
+ raise
46
+
47
+ def append_jsonl(self, jsonl_line: str, commit_message: str = "chore(log): append entry") -> str:
48
+ repo = self._client.get_repo(f"{self.owner}/{self.repo_name}")
49
+ self._ensure_branch(repo)
50
+ try:
51
+ existing = repo.get_contents(self.path, ref=self.branch)
52
+ existing_content = base64.b64decode(existing.content).decode("utf-8")
53
+ new_content = existing_content + jsonl_line
54
+ repo.update_file(
55
+ path=self.path,
56
+ message=commit_message,
57
+ content=new_content,
58
+ sha=existing.sha,
59
+ branch=self.branch,
60
+ )
61
+ return "SUCCESS: Log appended"
62
+ except GithubException as e:
63
+ if e.status == 404:
64
+ repo.create_file(
65
+ path=self.path,
66
+ message=commit_message,
67
+ content=jsonl_line,
68
+ branch=self.branch,
69
+ )
70
+ return "SUCCESS: Log file created and first entry appended"
71
+ raise
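
`GitHubLogger.append_jsonl` expects one already-serialized JSON line (including the trailing newline) and either appends it to the log file or creates the file when it gets a 404. A minimal usage sketch matching the call pattern in `agent/workflow.py`; the entry contents are illustrative:

    import json
    from logger.github_logger import GitHubLogger

    entry = {"status": "success", "file_path": "docs/source/ko/some_doc.md"}  # illustrative
    line = json.dumps(entry, ensure_ascii=False) + "\n"   # one JSON object per line (JSONL)

    logger = GitHubLogger()           # raises if LOG_REPO or the token env vars are missing
    print(logger.append_jsonl(line))  # "SUCCESS: Log appended" (or file created on first run)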
pr_generator/agent.py CHANGED
@@ -1,596 +1,596 @@
1
- """
2
- GitHub PR creation agent using Langchain.
3
- This code integrates with the actual GitHub API using the PyGithub library.
4
- Please set the GITHUB_TOKEN environment variable and install required libraries before running.
5
- """
6
-
7
- import os
8
- import re
9
- import json
10
- from typing import Optional, Dict, List, Tuple, Any
11
-
12
- # Load environment variables from .env file
13
- from dotenv import load_dotenv
14
- from translator.content import llm_translate
15
-
16
- load_dotenv()
17
-
18
- # Constants definition
19
- ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
20
- DEFAULT_TEMPERATURE = 0.0
21
-
22
- # Library imports and error handling
23
- try:
24
- from github import Github, GithubException
25
- from github.GitRef import GitRef
26
- from langchain_anthropic import ChatAnthropic
27
-
28
- REQUIRED_LIBS_AVAILABLE = True
29
- except ImportError as e:
30
- print(f"Required libraries are not installed: {e}")
31
- print("Please run: pip install PyGithub boto3 langchain-anthropic")
32
- REQUIRED_LIBS_AVAILABLE = False
33
-
34
-
35
- class GitHubPRAgent:
36
- """Agent class for GitHub PR creation"""
37
-
38
- def __init__(self, user_owner: str = None, user_repo: str = None, base_owner: str = None, base_repo: str = None):
39
- self._github_client = None
40
- self._llm = None
41
- self.user_owner = user_owner
42
- self.user_repo = user_repo
43
- self.base_owner = base_owner
44
- self.base_repo = base_repo
45
-
46
- @property
47
- def github_client(self) -> Optional[Github]:
48
- """Return GitHub API client with lazy initialization."""
49
- if not REQUIRED_LIBS_AVAILABLE:
50
- raise ImportError("Required libraries not found.")
51
-
52
- if self._github_client is None:
53
- token = os.environ.get("GITHUB_TOKEN")
54
- if not token:
55
- print("Warning: GITHUB_TOKEN environment variable not set.")
56
- return Github() # Limited access
57
- self._github_client = Github(token)
58
-
59
- return self._github_client
60
-
61
- @property
62
- def llm(self):
63
- """Return LLM client with lazy initialization."""
64
- if not REQUIRED_LIBS_AVAILABLE:
65
- raise ImportError("Required libraries not found.")
66
-
67
- if self._llm is None:
68
- self._llm = ChatAnthropic(
69
- model=ANTHROPIC_MODEL_ID,
70
- temperature=DEFAULT_TEMPERATURE,
71
- )
72
- return self._llm
73
-
74
- def _handle_github_error(self, e: Exception, operation: str) -> str:
75
- """Handle GitHub API errors consistently."""
76
- if isinstance(e, GithubException):
77
- return f"{operation} failed: {e.status} {e.data.get('message', e.data)}"
78
- return f"Unexpected error during {operation}: {str(e)}"
79
-
80
- def create_pull_request(
81
- self,
82
- owner: str,
83
- repo_name: str,
84
- title: str,
85
- head: str,
86
- base: str,
87
- body: str = "",
88
- draft: bool = False,
89
- maintainer_can_modify: bool = True,
90
- ) -> str:
91
- """Create a new Pull Request."""
92
- try:
93
- # 1. Check if head and base are the same
94
- if head == base:
95
- return f"ERROR: head branch ({head}) and base branch ({base}) are identical."
96
-
97
- # 2. Check for existing PR
98
- existing_pr = self.check_existing_pr(owner, repo_name, head, base)
99
- if existing_pr:
100
- return f"ERROR: {existing_pr}"
101
-
102
- # 3. Verify head and base branches exist
103
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
104
- try:
105
- # For fork-to-upstream PR, head format is "fork_owner:branch_name"
106
- if ":" in head:
107
- fork_owner, branch_name = head.split(":", 1)
108
- fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
109
- head_branch = fork_repo.get_branch(branch_name)
110
- else:
111
- head_branch = repo.get_branch(head)
112
-
113
- base_branch = repo.get_branch(base)
114
-
115
- # 4. Check if head and base branches point to the same commit
116
- if head_branch.commit.sha == base_branch.commit.sha:
117
- return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."
118
-
119
- except GithubException as e:
120
- if e.status == 404:
121
- return f"ERROR: Branch not found. head: {head}, base: {base}"
122
-
123
- # 5. Create PR
124
- pr = repo.create_pull(
125
- title=title,
126
- body=body,
127
- head=head,
128
- base=base,
129
- draft=draft,
130
- maintainer_can_modify=maintainer_can_modify,
131
- )
132
- return f"PR creation successful: {pr.html_url}"
133
- except GithubException as e:
134
- if e.status == 422:
135
- error_msg = e.data.get("message", "Unknown error")
136
- errors = e.data.get("errors", [])
137
-
138
- error_details = []
139
- for error in errors:
140
- if "message" in error:
141
- error_details.append(error["message"])
142
-
143
- detail_msg = " | ".join(error_details) if error_details else ""
144
- return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
145
- return self._handle_github_error(e, "PR creation")
146
- except Exception as e:
147
- return self._handle_github_error(e, "PR creation")
148
-
149
- def create_branch(
150
- self, owner: str, repo_name: str, branch_name: str, source_sha: str
151
- ) -> str:
152
- """Create a new branch."""
153
- try:
154
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
155
- ref_name = f"refs/heads/{branch_name}"
156
- new_ref = repo.create_git_ref(ref=ref_name, sha=source_sha)
157
-
158
- if isinstance(new_ref, GitRef):
159
- return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {new_ref.ref})"
160
- return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
161
- except GithubException as e:
162
- if e.status == 422 and "Reference already exists" in str(e.data):
163
- return f"WARNING: Branch '{branch_name}' already exists."
164
- return self._handle_github_error(e, "branch creation")
165
- except Exception as e:
166
- return self._handle_github_error(e, "branch creation")
167
-
168
- def check_existing_pr(
169
- self, owner: str, repo_name: str, head: str, base: str
170
- ) -> Optional[str]:
171
- """Check if there's an existing PR with the same head and base."""
172
- try:
173
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
174
- # For head parameter, use exactly what was passed (could be "fork_owner:branch" or just "branch")
175
- search_head = head if ":" in head else f"{owner}:{head}"
176
- pulls = repo.get_pulls(state="open", head=search_head, base=base)
177
- for pr in pulls:
178
- return f"Existing PR found: {pr.html_url}"
179
- return None
180
- except Exception as e:
181
- print(f"⚠️ Error checking existing PR: {str(e)}")
182
- return None
183
-
184
- def create_or_update_file(
185
- self,
186
- owner: str,
187
- repo_name: str,
188
- path: str,
189
- message: str,
190
- content: str,
191
- branch_name: Optional[str] = None,
192
- sha_blob: Optional[str] = None,
193
- ) -> str:
194
- """Create or update a single file."""
195
- try:
196
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
197
-
198
- args = {
199
- "path": path,
200
- "message": message,
201
- "content": content,
202
- }
203
- if branch_name:
204
- args["branch"] = branch_name
205
-
206
- # Try to update file
207
- if sha_blob:
208
- args["sha"] = sha_blob
209
- repo.update_file(**args)
210
- return f"SUCCESS: File updated - {path}"
211
-
212
- # Try to create file
213
- repo.create_file(**args)
214
- return f"SUCCESS: File created - {path}"
215
-
216
- except GithubException as e:
217
- # Try to update if file already exists
218
- if e.status == 422:
219
- try:
220
- existing_file = repo.get_contents(
221
- path, ref=branch_name or repo.default_branch
222
- )
223
- args["sha"] = existing_file.sha
224
- repo.update_file(**args)
225
- return f"SUCCESS: File updated - {path}"
226
- except:
227
- pass
228
- return f"ERROR: File processing failed - {path}"
229
- except Exception:
230
- return f"ERROR: File processing failed - {path}"
231
-
232
- def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
233
- """Analyze reference PR to extract style information."""
234
- try:
235
- # Parse PR URL
236
- match = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
237
- if not match:
238
- return {"error": f"Invalid PR URL format: {pr_url}"}
239
-
240
- owner, repo_name, pr_number = match.groups()
241
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
242
- pr = repo.get_pull(int(pr_number))
243
-
244
- return {
245
- "title": pr.title,
246
- "body": pr.body,
247
- "head_branch": pr.head.ref,
248
- "base_branch": pr.base.ref,
249
- "files_changed": [f.filename for f in pr.get_files()],
250
- "commits": [
251
- {"message": c.commit.message, "sha": c.sha}
252
- for c in pr.get_commits()
253
- ],
254
- }
255
- except Exception as e:
256
- return {"error": f"Error occurred during PR analysis: {str(e)}"}
257
-
258
- def _generate_with_llm(
259
- self, prompt: str, fallback_value: str, operation: str
260
- ) -> str:
261
- """Generate text using LLM."""
262
- try:
263
- _usage_info, generated = llm_translate(prompt)
264
- generated = generated.strip()
265
- print(f"LLM generated {operation}: {generated}")
266
- return generated
267
- except Exception as e:
268
- print(f"❌ Error generating {operation} with LLM: {e}")
269
- print(f"Using fallback value: {fallback_value}")
270
- return fallback_value
271
-
272
- def generate_branch_name_from_reference(
273
- self, reference_branch_name: str, target_language: str, file_name: str
274
- ) -> str:
275
- """Generate branch name using simple template."""
276
- # Keep .md extension and make branch-safe
277
- branch_safe_name = file_name.replace('_', '-')
278
- return f"{target_language}-{branch_safe_name}"
279
-
280
- def generate_pr_content_from_reference(
281
- self,
282
- reference_title: str,
283
- reference_body: str,
284
- target_language: str,
285
- filepath: str,
286
- target_filepath: str,
287
- file_name: str,
288
- ) -> Tuple[str, str]:
289
- """Use LLM to analyze reference PR title and body and generate appropriate PR content."""
290
- prompt = f"""Here is the reference PR information:
291
-
292
- Reference PR title: {reference_title}
293
-
294
- Reference PR body:
295
- {reference_body}
296
-
297
- Now I need to generate PR title and body for a new translation task:
298
- - Target language: {target_language}
299
- - Original file: {filepath}
300
- - Translation file: {target_filepath}
301
- - File name: {file_name}
302
-
303
- Please analyze the style and format of the reference PR to generate consistent new PR title and body.
304
-
305
- Requirements:
306
- 1. Follow the title format and pattern of the reference PR
307
- 2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
308
- 3. Appropriately reflect the target language ({target_language}) and file paths
309
- 4. If there are user mentions (@username), change them to general text instead of actual mentions
310
- 5. Adjust the content to fit the translation task
311
-
312
- Response format:
313
- Title: [PR title here]
314
- Body: [PR body here, maintaining the exact markdown format and structure of the original]"""
315
-
316
- try:
317
- _usage_info, generated_content = llm_translate(prompt)
318
- generated_content = generated_content.strip()
319
-
320
- # Separate title and body from response
321
- lines = generated_content.split("\n")
322
- title_line = ""
323
- body_lines = []
324
- parsing_body = False
325
-
326
- for line in lines:
327
- if line.startswith("Title:"):
328
- title_line = line.replace("Title:", "").strip()
329
- elif line.startswith("Body:"):
330
- parsing_body = True
331
- body_content = line.replace("Body:", "").strip()
332
- if body_content:
333
- body_lines.append(body_content)
334
- elif parsing_body:
335
- body_lines.append(line)
336
-
337
- generated_title = title_line if title_line else reference_title
338
- generated_body = (
339
- "\n".join(body_lines)
340
- if body_lines
341
- else f"Add {target_language} translation for `{filepath}`."
342
- )
343
-
344
- print(f"LLM generated PR title: {generated_title}")
345
- print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")
346
-
347
- return generated_title, generated_body
348
-
349
- except Exception as e:
350
- print(f"❌ Error generating PR content with LLM: {e}")
351
- return self._generate_default_pr_content(
352
- target_language, filepath, target_filepath, file_name
353
- )
354
-
355
- def _generate_default_pr_content(
356
- self, target_language: str, filepath: str, target_filepath: str, file_name: str
357
- ) -> Tuple[str, str]:
358
- """Generate default PR content."""
359
- title = f"🌐 [i18n-{target_language}] Translated `{file_name}` to {target_language}"
360
- body = f"""# What does this PR do?
361
-
362
- Translated the `{filepath}` file of the documentation to {target_language} 😄
363
- Thank you in advance for your review!
364
-
365
- Part of https://github.com/huggingface/transformers/issues/20179
366
-
367
- ## Before reviewing
368
- - [x] Check for missing / redundant translations (번역 누락/중복 검사)
369
- - [x] Grammar Check (맞춤법 검사)
370
- - [x] Review or Add new terms to glossary (용어 확인 및 추가)
371
- - [x] Check Inline TOC (e.g. `[[lowercased-header]]`)
372
- - [x] Check live-preview for gotchas (live-preview로 정상작동 확인)
373
-
374
- ## Who can review? (Initial)
375
- {target_language} translation reviewers
376
-
377
- ## Before submitting
378
- - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
379
- - [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
380
- Pull Request section?
381
- - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
382
- to it if that's the case.
383
- - [x] Did you make sure to update the documentation with your changes? Here are the
384
- [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
385
- [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
386
- - [ ] Did you write any new necessary tests?
387
-
388
- ## Who can review? (Final)
389
- May you please review this PR?
390
- Documentation maintainers
391
- """
392
- return title, body
393
-
394
- def generate_commit_message_from_reference(
395
- self, commit_messages: List[str], target_language: str, file_name: str
396
- ) -> str:
397
- """Generate simple commit message using template."""
398
- return f"docs: {target_language}: {file_name}"
399
-
400
- def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
401
- """Get information about an existing branch."""
402
- try:
403
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
404
- branch = repo.get_branch(branch_name)
405
- commit = branch.commit
406
- commit_info = commit.commit
407
-
408
- return f"""
409
- 📋 Existing branch information:
410
- - Branch name: {branch_name}
411
- - Latest commit: {commit.sha[:8]}
412
- - Commit message: {commit_info.message.split(chr(10))[0][:80]}...
413
- - Author: {commit_info.author.name}
414
- - Date: {commit_info.author.date.strftime('%Y-%m-%d %H:%M:%S')}
415
- """
416
- except Exception as e:
417
- return f"Failed to retrieve branch information: {str(e)}"
418
-
419
- def run_translation_pr_workflow(
420
- self,
421
- reference_pr_url: str,
422
- target_language: str,
423
- filepath: str,
424
- translated_doc: str,
425
- base_branch: str = "main",
426
- ) -> Dict[str, Any]:
427
- """Execute translation document PR creation workflow."""
428
- try:
429
- # 1. Analyze reference PR
430
- print(f"🔍 Analyzing reference PR: {reference_pr_url}")
431
- pr_analysis = self.analyze_reference_pr(reference_pr_url)
432
-
433
- if "error" in pr_analysis:
434
- return {"status": "error", "message": pr_analysis["error"]}
435
-
436
- print("Reference PR analysis completed")
437
-
438
- # 2. Generate translation file path and branch name
439
- target_filepath = filepath.replace("/en/", f"/{target_language}/")
440
- file_name = filepath.split("/")[-1] # Keep .md extension
441
-
442
- print(f"🌿 Generating branch name...")
443
- branch_name = self.generate_branch_name_from_reference(
444
- pr_analysis["head_branch"], target_language, file_name
445
- )
446
-
447
- # 3. Get main branch SHA from upstream and create branch in fork
448
- upstream_repo = self.github_client.get_repo(f"{self.base_owner}/{self.base_repo}")
449
- main_branch = upstream_repo.get_branch(base_branch)
450
- main_sha = main_branch.commit.sha
451
-
452
- print(f"🌿 Creating branch: {branch_name} in fork repository")
453
- branch_result = self.create_branch(self.user_owner, self.user_repo, branch_name, main_sha)
454
-
455
- # Check branch creation result
456
- if branch_result.startswith("ERROR"):
457
- return {
458
- "status": "error",
459
- "message": f"Branch creation failed: {branch_result}\n\nTarget: {self.user_owner}/{self.user_repo}\nBranch: {branch_name}\nBase SHA: {main_sha[:8]}",
460
- "branch": branch_name,
461
- "error_details": branch_result,
462
- }
463
- elif branch_result.startswith("WARNING"):
464
- print(f"⚠️ {branch_result}")
465
- # Continue if branch already exists
466
- elif branch_result.startswith("SUCCESS"):
467
- print(f"✅ {branch_result}")
468
- else:
469
- print(f"⚠️ Unexpected branch creation result: {branch_result}")
470
- # Continue anyway, might still work
471
-
472
- # 4. Generate commit message and save file
473
- commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
474
- commit_message = self.generate_commit_message_from_reference(
475
- commit_messages, target_language, file_name
476
- )
477
-
478
- print(f"📄 Saving file: {target_filepath}")
479
- file_result = self.create_or_update_file(
480
- self.user_owner,
481
- self.user_repo,
482
- target_filepath,
483
- commit_message,
484
- translated_doc,
485
- branch_name,
486
- )
487
-
488
- if not file_result.startswith("SUCCESS"):
489
- return {
490
- "status": "error",
491
- "message": f"File save failed: {file_result}\n\n🎯 Target: {self.user_owner}/{self.user_repo} (expected: {target_language} fork of {self.base_owner}/{self.base_repo})\n🌿 Branch: {branch_name}\n📁 File: {target_filepath}",
492
- "branch": branch_name,
493
- "file_path": target_filepath,
494
- "error_details": file_result,
495
- }
496
-
497
- print(f"{file_result}")
498
-
499
- # 5. Create PR
500
- pr_title, pr_body = self.generate_pr_content_from_reference(
501
- pr_analysis["title"],
502
- pr_analysis["body"],
503
- target_language,
504
- filepath,
505
- target_filepath,
506
- file_name,
507
- )
508
-
509
- print(f"🔄 Creating PR: {pr_title}")
510
- print(f" Head: {self.user_owner}:{branch_name} → Base: {self.base_owner}:{base_branch}")
511
-
512
- # Create PR from fork to upstream repository
513
- pr_result = self.create_pull_request(
514
- self.base_owner, self.base_repo, pr_title, f"{self.user_owner}:{branch_name}", base_branch, pr_body, draft=True
515
- )
516
-
517
- if pr_result.startswith("ERROR"):
518
- print(f"❌ {pr_result}")
519
- return {
520
- "status": "partial_success",
521
- "branch": branch_name,
522
- "file_path": target_filepath,
523
- "message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
524
- "error_details": pr_result,
525
- }
526
- elif "successful" in pr_result and "http" in pr_result:
527
- print(f"{pr_result}")
528
- return {
529
- "status": "success",
530
- "branch": branch_name,
531
- "file_path": target_filepath,
532
- "pr_url": pr_result.split(": ")[-1],
533
- "message": "Translation document PR created successfully!",
534
- }
535
- else:
536
- return {
537
- "status": "partial_success",
538
- "branch": branch_name,
539
- "file_path": target_filepath,
540
- "message": "File was saved but PR creation failed.",
541
- }
542
-
543
- except Exception as e:
544
- return {
545
- "status": "error",
546
- "message": f"Workflow execution failed: {str(e)}\n\nConfig: {self.user_owner}/{self.user_repo} → {self.base_owner}/{self.base_repo}\nFile: {filepath if 'filepath' in locals() else 'Unknown'}",
547
- "error_details": str(e),
548
- }
549
-
550
-
551
- # Backward compatibility functions (maintain compatibility with existing code)
552
- _agent = GitHubPRAgent()
553
-
554
-
555
- def get_github_client():
556
- return _agent.github_client
557
-
558
-
559
- def create_pull_request_func(*args, **kwargs):
560
- return _agent.create_pull_request(*args, **kwargs)
561
-
562
-
563
- def create_branch_func(*args, **kwargs):
564
- return _agent.create_branch(*args, **kwargs)
565
-
566
-
567
- def create_or_update_file_func(*args, **kwargs):
568
- return _agent.create_or_update_file(*args, **kwargs)
569
-
570
-
571
- def analyze_reference_pr_func(*args, **kwargs):
572
- return _agent.analyze_reference_pr(*args, **kwargs)
573
-
574
-
575
- def generate_branch_name_from_reference(*args, **kwargs):
576
- return _agent.generate_branch_name_from_reference(*args, **kwargs)
577
-
578
-
579
- def generate_pr_content_from_reference(*args, **kwargs):
580
- return _agent.generate_pr_content_from_reference(*args, **kwargs)
581
-
582
-
583
- def generate_default_pr_content(*args, **kwargs):
584
- return _agent._generate_default_pr_content(*args, **kwargs)
585
-
586
-
587
- def generate_commit_message_from_reference(*args, **kwargs):
588
- return _agent.generate_commit_message_from_reference(*args, **kwargs)
589
-
590
-
591
- def get_branch_info(*args, **kwargs):
592
- return _agent.get_branch_info(*args, **kwargs)
593
-
594
-
595
- def run_translation_pr_agent_simple(*args, **kwargs):
596
- return _agent.run_translation_pr_workflow(*args, **kwargs)
 
1
+ """
2
+ GitHub PR creation agent using Langchain.
3
+ This code integrates with the actual GitHub API using the PyGithub library.
4
+ Please set the GITHUB_TOKEN environment variable and install required libraries before running.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import json
10
+ from typing import Optional, Dict, List, Tuple, Any
11
+
12
+ # Load environment variables from .env file
13
+ from dotenv import load_dotenv
14
+ from translator.content import llm_translate
15
+
16
+ load_dotenv()
17
+
18
+ # Constants definition
19
+ ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
20
+ DEFAULT_TEMPERATURE = 0.0
21
+
22
+ # Library imports and error handling
23
+ try:
24
+ from github import Github, GithubException
25
+ from github.GitRef import GitRef
26
+ from langchain_anthropic import ChatAnthropic
27
+
28
+ REQUIRED_LIBS_AVAILABLE = True
29
+ except ImportError as e:
30
+ print(f"Required libraries are not installed: {e}")
31
+ print("Please run: pip install PyGithub boto3 langchain-anthropic")
32
+ REQUIRED_LIBS_AVAILABLE = False
33
+
34
+
35
+ class GitHubPRAgent:
36
+ """Agent class for GitHub PR creation"""
37
+
38
+ def __init__(self, user_owner: str = None, user_repo: str = None, base_owner: str = None, base_repo: str = None):
39
+ self._github_client = None
40
+ self._llm = None
41
+ self.user_owner = user_owner
42
+ self.user_repo = user_repo
43
+ self.base_owner = base_owner
44
+ self.base_repo = base_repo
45
+
46
+ @property
47
+ def github_client(self) -> Optional[Github]:
48
+ """Return GitHub API client with lazy initialization."""
49
+ if not REQUIRED_LIBS_AVAILABLE:
50
+ raise ImportError("Required libraries not found.")
51
+
52
+ if self._github_client is None:
53
+ token = os.environ.get("GITHUB_TOKEN")
54
+ if not token:
55
+ print("Warning: GITHUB_TOKEN environment variable not set.")
56
+ return Github() # Limited access
57
+ self._github_client = Github(token)
58
+
59
+ return self._github_client
60
+
61
+ @property
62
+ def llm(self):
63
+ """Return LLM client with lazy initialization."""
64
+ if not REQUIRED_LIBS_AVAILABLE:
65
+ raise ImportError("Required libraries not found.")
66
+
67
+ if self._llm is None:
68
+ self._llm = ChatAnthropic(
69
+ model=ANTHROPIC_MODEL_ID,
70
+ temperature=DEFAULT_TEMPERATURE,
71
+ )
72
+ return self._llm
73
+
74
+ def _handle_github_error(self, e: Exception, operation: str) -> str:
75
+ """Handle GitHub API errors consistently."""
76
+ if isinstance(e, GithubException):
77
+ return f"{operation} failed: {e.status} {e.data.get('message', e.data)}"
78
+ return f"Unexpected error during {operation}: {str(e)}"
79
+
80
+ def create_pull_request(
81
+ self,
82
+ owner: str,
83
+ repo_name: str,
84
+ title: str,
85
+ head: str,
86
+ base: str,
87
+ body: str = "",
88
+ draft: bool = False,
89
+ maintainer_can_modify: bool = True,
90
+ ) -> str:
91
+ """Create a new Pull Request."""
92
+ try:
93
+ # 1. Check if head and base are the same
94
+ if head == base:
95
+ return f"ERROR: head branch ({head}) and base branch ({base}) are identical."
96
+
97
+ # 2. Check for existing PR
98
+ existing_pr = self.check_existing_pr(owner, repo_name, head, base)
99
+ if existing_pr:
100
+ return f"ERROR: {existing_pr}"
101
+
102
+ # 3. Verify head and base branches exist
103
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
104
+ try:
105
+ # For fork-to-upstream PR, head format is "fork_owner:branch_name"
106
+ if ":" in head:
107
+ fork_owner, branch_name = head.split(":", 1)
108
+ fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
109
+ head_branch = fork_repo.get_branch(branch_name)
110
+ else:
111
+ head_branch = repo.get_branch(head)
112
+
113
+ base_branch = repo.get_branch(base)
114
+
115
+ # 4. Check if head and base branches point to the same commit
116
+ if head_branch.commit.sha == base_branch.commit.sha:
117
+ return f"ERROR: head branch ({head}) and base branch ({base}) point to the same commit. No changes to merge."
118
+
119
+ except GithubException as e:
120
+ if e.status == 404:
121
+ return f"ERROR: Branch not found. head: {head}, base: {base}"
122
+
123
+ # 5. Create PR
124
+ pr = repo.create_pull(
125
+ title=title,
126
+ body=body,
127
+ head=head,
128
+ base=base,
129
+ draft=draft,
130
+ maintainer_can_modify=maintainer_can_modify,
131
+ )
132
+ return f"PR creation successful: {pr.html_url}"
133
+ except GithubException as e:
134
+ if e.status == 422:
135
+ error_msg = e.data.get("message", "Unknown error")
136
+ errors = e.data.get("errors", [])
137
+
138
+ error_details = []
139
+ for error in errors:
140
+ if "message" in error:
141
+ error_details.append(error["message"])
142
+
143
+ detail_msg = " | ".join(error_details) if error_details else ""
144
+ return f"ERROR: PR creation failed (422): {error_msg}. {detail_msg}"
145
+ return self._handle_github_error(e, "PR creation")
146
+ except Exception as e:
147
+ return self._handle_github_error(e, "PR creation")
148
+
149
+ def create_branch(
150
+ self, owner: str, repo_name: str, branch_name: str, source_sha: str
151
+ ) -> str:
152
+ """Create a new branch."""
153
+ try:
154
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
155
+ ref_name = f"refs/heads/{branch_name}"
156
+ new_ref = repo.create_git_ref(ref=ref_name, sha=source_sha)
157
+
158
+ if isinstance(new_ref, GitRef):
159
+ return f"SUCCESS: Branch '{branch_name}' created successfully (ref: {new_ref.ref})"
160
+ return f"ERROR: Branch '{branch_name}' creation failed. Please check API response."
161
+ except GithubException as e:
162
+ if e.status == 422 and "Reference already exists" in str(e.data):
163
+ return f"WARNING: Branch '{branch_name}' already exists."
164
+ return self._handle_github_error(e, "branch creation")
165
+ except Exception as e:
166
+ return self._handle_github_error(e, "branch creation")
167
+
168
+ def check_existing_pr(
169
+ self, owner: str, repo_name: str, head: str, base: str
170
+ ) -> Optional[str]:
171
+ """Check if there's an existing PR with the same head and base."""
172
+ try:
173
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
174
+ # For head parameter, use exactly what was passed (could be "fork_owner:branch" or just "branch")
175
+ search_head = head if ":" in head else f"{owner}:{head}"
176
+ pulls = repo.get_pulls(state="open", head=search_head, base=base)
177
+ for pr in pulls:
178
+ return f"Existing PR found: {pr.html_url}"
179
+ return None
180
+ except Exception as e:
181
+ print(f"⚠️ Error checking existing PR: {str(e)}")
182
+ return None
183
+
184
+ def create_or_update_file(
185
+ self,
186
+ owner: str,
187
+ repo_name: str,
188
+ path: str,
189
+ message: str,
190
+ content: str,
191
+ branch_name: Optional[str] = None,
192
+ sha_blob: Optional[str] = None,
193
+ ) -> str:
194
+ """Create or update a single file."""
195
+ try:
196
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
197
+
198
+ args = {
199
+ "path": path,
200
+ "message": message,
201
+ "content": content,
202
+ }
203
+ if branch_name:
204
+ args["branch"] = branch_name
205
+
206
+ # Try to update file
207
+ if sha_blob:
208
+ args["sha"] = sha_blob
209
+ repo.update_file(**args)
210
+ return f"SUCCESS: File updated - {path}"
211
+
212
+ # Try to create file
213
+ repo.create_file(**args)
214
+ return f"SUCCESS: File created - {path}"
215
+
216
+ except GithubException as e:
217
+ # Try to update if file already exists
218
+ if e.status == 422:
219
+ try:
220
+ existing_file = repo.get_contents(
221
+ path, ref=branch_name or repo.default_branch
222
+ )
223
+ args["sha"] = existing_file.sha
224
+ repo.update_file(**args)
225
+ return f"SUCCESS: File updated - {path}"
226
+ except:
227
+ pass
228
+ return f"ERROR: File processing failed - {path}"
229
+ except Exception:
230
+ return f"ERROR: File processing failed - {path}"
231
+
232
+ def analyze_reference_pr(self, pr_url: str) -> Dict[str, Any]:
233
+ """Analyze reference PR to extract style information."""
234
+ try:
235
+ # Parse PR URL
236
+ match = re.match(r"https://github\.com/([^/]+)/([^/]+)/pull/(\d+)", pr_url)
237
+ if not match:
238
+ return {"error": f"Invalid PR URL format: {pr_url}"}
239
+
240
+ owner, repo_name, pr_number = match.groups()
241
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
242
+ pr = repo.get_pull(int(pr_number))
243
+
244
+ return {
245
+ "title": pr.title,
246
+ "body": pr.body,
247
+ "head_branch": pr.head.ref,
248
+ "base_branch": pr.base.ref,
249
+ "files_changed": [f.filename for f in pr.get_files()],
250
+ "commits": [
251
+ {"message": c.commit.message, "sha": c.sha}
252
+ for c in pr.get_commits()
253
+ ],
254
+ }
255
+ except Exception as e:
256
+ return {"error": f"Error occurred during PR analysis: {str(e)}"}
257
+
258
+ def _generate_with_llm(
259
+ self, prompt: str, fallback_value: str, operation: str
260
+ ) -> str:
261
+ """Generate text using LLM."""
262
+ try:
263
+ _usage_info, generated = llm_translate(prompt)
264
+ generated = generated.strip()
265
+ print(f"LLM generated {operation}: {generated}")
266
+ return generated
267
+ except Exception as e:
268
+ print(f"❌ Error generating {operation} with LLM: {e}")
269
+ print(f"Using fallback value: {fallback_value}")
270
+ return fallback_value
271
+
272
+ def generate_branch_name_from_reference(
273
+ self, reference_branch_name: str, target_language: str, file_name: str
274
+ ) -> str:
275
+ """Generate branch name using simple template."""
276
+ # Keep .md extension and make branch-safe
277
+ branch_safe_name = file_name.replace('_', '-')
278
+ return f"{target_language}-{branch_safe_name}"
279
+
280
+ def generate_pr_content_from_reference(
281
+ self,
282
+ reference_title: str,
283
+ reference_body: str,
284
+ target_language: str,
285
+ filepath: str,
286
+ target_filepath: str,
287
+ file_name: str,
288
+ ) -> Tuple[str, str]:
289
+ """Use LLM to analyze reference PR title and body and generate appropriate PR content."""
290
+ prompt = f"""Here is the reference PR information:
291
+
292
+ Reference PR title: {reference_title}
293
+
294
+ Reference PR body:
295
+ {reference_body}
296
+
297
+ Now I need to generate PR title and body for a new translation task:
298
+ - Target language: {target_language}
299
+ - Original file: {filepath}
300
+ - Translation file: {target_filepath}
301
+ - File name: {file_name}
302
+
303
+ Please analyze the style and format of the reference PR to generate consistent new PR title and body.
304
+
305
+ Requirements:
306
+ 1. Follow the title format and pattern of the reference PR
307
+ 2. Maintain the body style, markdown format, indentation, and line breaks of the reference PR
308
+ 3. Appropriately reflect the target language ({target_language}) and file paths
309
+ 4. If there are user mentions (@username), change them to general text instead of actual mentions
310
+ 5. Adjust the content to fit the translation task
311
+
312
+ Response format:
313
+ Title: [PR title here]
314
+ Body: [PR body here, maintaining the exact markdown format and structure of the original]"""
315
+
316
+ try:
317
+ _usage_info, generated_content = llm_translate(prompt)
318
+ generated_content = generated_content.strip()
319
+
320
+ # Separate title and body from response
321
+ lines = generated_content.split("\n")
322
+ title_line = ""
323
+ body_lines = []
324
+ parsing_body = False
325
+
326
+ for line in lines:
327
+ if line.startswith("Title:"):
328
+ title_line = line.replace("Title:", "").strip()
329
+ elif line.startswith("Body:"):
330
+ parsing_body = True
331
+ body_content = line.replace("Body:", "").strip()
332
+ if body_content:
333
+ body_lines.append(body_content)
334
+ elif parsing_body:
335
+ body_lines.append(line)
336
+
337
+ generated_title = title_line if title_line else reference_title
338
+ generated_body = (
339
+ "\n".join(body_lines)
340
+ if body_lines
341
+ else f"Add {target_language} translation for `{filepath}`."
342
+ )
343
+
344
+ print(f"LLM generated PR title: {generated_title}")
345
+ print(f"LLM generated PR body (first 100 chars): {generated_body[:100]}...")
346
+
347
+ return generated_title, generated_body
348
+
349
+ except Exception as e:
350
+ print(f"❌ Error generating PR content with LLM: {e}")
351
+ return self._generate_default_pr_content(
352
+ target_language, filepath, target_filepath, file_name
353
+ )
354
+
355
+ def _generate_default_pr_content(
356
+ self, target_language: str, filepath: str, target_filepath: str, file_name: str
357
+ ) -> Tuple[str, str]:
358
+ """Generate default PR content."""
359
+ title = f"🌐 [i18n-{target_language}] Translated `{file_name}` to {target_language}"
360
+ body = f"""# What does this PR do?
361
+
362
+ Translated the `{filepath}` file of the documentation to {target_language} 😄
363
+ Thank you in advance for your review!
364
+
365
+ Part of https://github.com/huggingface/transformers/issues/20179
366
+
367
+ ## Before reviewing
368
+ - [x] Check for missing / redundant translations (번역 누락/중복 검사)
369
+ - [x] Grammar Check (맞춤법 검사)
370
+ - [x] Review or Add new terms to glossary (용어 확인 및 추가)
371
+ - [x] Check Inline TOC (e.g. `[[lowercased-header]]`)
372
+ - [x] Check live-preview for gotchas (live-preview로 정상작동 확인)
373
+
374
+ ## Who can review? (Initial)
375
+ {target_language} translation reviewers
376
+
377
+ ## Before submitting
378
+ - [x] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
379
+ - [x] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#start-contributing-pull-requests),
380
+ Pull Request section?
381
+ - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
382
+ to it if that's the case.
383
+ - [x] Did you make sure to update the documentation with your changes? Here are the
384
+ [documentation guidelines](https://github.com/huggingface/transformers/tree/main/docs), and
385
+ [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation).
386
+ - [ ] Did you write any new necessary tests?
387
+
388
+ ## Who can review? (Final)
389
+ May you please review this PR?
390
+ Documentation maintainers
391
+ """
392
+ return title, body
393
+
394
+ def generate_commit_message_from_reference(
395
+ self, commit_messages: List[str], target_language: str, file_name: str
396
+ ) -> str:
397
+ """Generate simple commit message using template."""
398
+ return f"docs: {target_language}: {file_name}"
399
+
400
+ def get_branch_info(self, owner: str, repo_name: str, branch_name: str) -> str:
401
+ """Get information about an existing branch."""
402
+ try:
403
+ repo = self.github_client.get_repo(f"{owner}/{repo_name}")
404
+ branch = repo.get_branch(branch_name)
405
+ commit = branch.commit
406
+ commit_info = commit.commit
407
+
408
+ return f"""
409
+ 📋 Existing branch information:
410
+ - Branch name: {branch_name}
411
+ - Latest commit: {commit.sha[:8]}
412
+ - Commit message: {commit_info.message.split(chr(10))[0][:80]}...
413
+ - Author: {commit_info.author.name}
414
+ - Date: {commit_info.author.date.strftime('%Y-%m-%d %H:%M:%S')}
415
+ """
416
+ except Exception as e:
417
+ return f"Failed to retrieve branch information: {str(e)}"
418
+
419
+ def run_translation_pr_workflow(
420
+ self,
421
+ reference_pr_url: str,
422
+ target_language: str,
423
+ filepath: str,
424
+ translated_doc: str,
425
+ base_branch: str = "main",
426
+ ) -> Dict[str, Any]:
427
+ """Execute translation document PR creation workflow."""
428
+ try:
429
+ # 1. Analyze reference PR
430
+ print(f"🔍 Analyzing reference PR: {reference_pr_url}")
431
+ pr_analysis = self.analyze_reference_pr(reference_pr_url)
432
+
433
+ if "error" in pr_analysis:
434
+ return {"status": "error", "message": pr_analysis["error"]}
435
+
436
+ print("Reference PR analysis completed")
437
+
438
+ # 2. Generate translation file path and branch name
439
+ target_filepath = filepath.replace("/en/", f"/{target_language}/")
440
+ file_name = filepath.split("/")[-1] # Keep .md extension
441
+
442
+ print(f"🌿 Generating branch name...")
443
+ branch_name = self.generate_branch_name_from_reference(
444
+ pr_analysis["head_branch"], target_language, file_name
445
+ )
446
+
447
+ # 3. Get main branch SHA from upstream and create branch in fork
448
+ upstream_repo = self.github_client.get_repo(f"{self.base_owner}/{self.base_repo}")
449
+ main_branch = upstream_repo.get_branch(base_branch)
450
+ main_sha = main_branch.commit.sha
451
+
452
+ print(f"🌿 Creating branch: {branch_name} in fork repository")
453
+ branch_result = self.create_branch(self.user_owner, self.user_repo, branch_name, main_sha)
454
+
455
+ # Check branch creation result
456
+ if branch_result.startswith("ERROR"):
457
+ return {
458
+ "status": "error",
459
+ "message": f"Branch creation failed: {branch_result}\n\nTarget: {self.user_owner}/{self.user_repo}\nBranch: {branch_name}\nBase SHA: {main_sha[:8]}",
460
+ "branch": branch_name,
461
+ "error_details": branch_result,
462
+ }
463
+ elif branch_result.startswith("WARNING"):
464
+ print(f"⚠️ {branch_result}")
465
+ # Continue if branch already exists
466
+ elif branch_result.startswith("SUCCESS"):
467
+ print(f"✅ {branch_result}")
468
+ else:
469
+ print(f"⚠️ Unexpected branch creation result: {branch_result}")
470
+ # Continue anyway, might still work
471
+
472
+ # 4. Generate commit message and save file
473
+ commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
474
+ commit_message = self.generate_commit_message_from_reference(
475
+ commit_messages, target_language, file_name
476
+ )
477
+
478
+ print(f"📄 Saving file: {target_filepath}")
479
+ file_result = self.create_or_update_file(
480
+ self.user_owner,
481
+ self.user_repo,
482
+ target_filepath,
483
+ commit_message,
484
+ translated_doc,
485
+ branch_name,
486
+ )
487
+
488
+ if not file_result.startswith("SUCCESS"):
489
+ return {
490
+ "status": "error",
491
+ "message": f"File save failed: {file_result}\n\n🎯 Target: {self.user_owner}/{self.user_repo} (expected: {target_language} fork of {self.base_owner}/{self.base_repo})\n🌿 Branch: {branch_name}\n📁 File: {target_filepath}",
492
+ "branch": branch_name,
493
+ "file_path": target_filepath,
494
+ "error_details": file_result,
495
+ }
496
+
497
+ print(f"{file_result}")
498
+
499
+ # 5. Create PR
500
+ pr_title, pr_body = self.generate_pr_content_from_reference(
501
+ pr_analysis["title"],
502
+ pr_analysis["body"],
503
+ target_language,
504
+ filepath,
505
+ target_filepath,
506
+ file_name,
507
+ )
508
+
509
+ print(f"🔄 Creating PR: {pr_title}")
510
+ print(f" Head: {self.user_owner}:{branch_name} → Base: {self.base_owner}:{base_branch}")
511
+
512
+ # Create PR from fork to upstream repository
513
+ pr_result = self.create_pull_request(
514
+ self.base_owner, self.base_repo, pr_title, f"{self.user_owner}:{branch_name}", base_branch, pr_body, draft=True
515
+ )
516
+
517
+ if pr_result.startswith("ERROR"):
518
+ print(f"❌ {pr_result}")
519
+ return {
520
+ "status": "partial_success",
521
+ "branch": branch_name,
522
+ "file_path": target_filepath,
523
+ "message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
524
+ "error_details": pr_result,
525
+ }
526
+ elif "successful" in pr_result and "http" in pr_result:
527
+ print(f"{pr_result}")
528
+ return {
529
+ "status": "success",
530
+ "branch": branch_name,
531
+ "file_path": target_filepath,
532
+ "pr_url": pr_result.split(": ")[-1],
533
+ "message": "Translation document PR created successfully!",
534
+ }
535
+ else:
536
+ return {
537
+ "status": "partial_success",
538
+ "branch": branch_name,
539
+ "file_path": target_filepath,
540
+ "message": "File was saved but PR creation failed.",
541
+ }
542
+
543
+ except Exception as e:
544
+ return {
545
+ "status": "error",
546
+ "message": f"Workflow execution failed: {str(e)}\n\nConfig: {self.user_owner}/{self.user_repo} → {self.base_owner}/{self.base_repo}\nFile: {filepath if 'filepath' in locals() else 'Unknown'}",
547
+ "error_details": str(e),
548
+ }
549
+
550
+
551
+ # Backward compatibility functions (maintain compatibility with existing code)
552
+ _agent = GitHubPRAgent()
553
+
554
+
555
+ def get_github_client():
556
+ return _agent.github_client
557
+
558
+
559
+ def create_pull_request_func(*args, **kwargs):
560
+ return _agent.create_pull_request(*args, **kwargs)
561
+
562
+
563
+ def create_branch_func(*args, **kwargs):
564
+ return _agent.create_branch(*args, **kwargs)
565
+
566
+
567
+ def create_or_update_file_func(*args, **kwargs):
568
+ return _agent.create_or_update_file(*args, **kwargs)
569
+
570
+
571
+ def analyze_reference_pr_func(*args, **kwargs):
572
+ return _agent.analyze_reference_pr(*args, **kwargs)
573
+
574
+
575
+ def generate_branch_name_from_reference(*args, **kwargs):
576
+ return _agent.generate_branch_name_from_reference(*args, **kwargs)
577
+
578
+
579
+ def generate_pr_content_from_reference(*args, **kwargs):
580
+ return _agent.generate_pr_content_from_reference(*args, **kwargs)
581
+
582
+
583
+ def generate_default_pr_content(*args, **kwargs):
584
+ return _agent._generate_default_pr_content(*args, **kwargs)
585
+
586
+
587
+ def generate_commit_message_from_reference(*args, **kwargs):
588
+ return _agent.generate_commit_message_from_reference(*args, **kwargs)
589
+
590
+
591
+ def get_branch_info(*args, **kwargs):
592
+ return _agent.get_branch_info(*args, **kwargs)
593
+
594
+
595
+ def run_translation_pr_agent_simple(*args, **kwargs):
596
+ return _agent.run_translation_pr_workflow(*args, **kwargs)
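For orientation, a minimal sketch of driving the agent above end to end. The fork/upstream names, file path, and language code are placeholders; GITHUB_TOKEN must be set (see the module docstring), and the LLM-backed helpers (llm_translate / ChatAnthropic) expect their own Anthropic credentials.

from pr_generator.agent import GitHubPRAgent

agent = GitHubPRAgent(
    user_owner="my-user", user_repo="transformers",       # fork that receives the branch and commit
    base_owner="huggingface", base_repo="transformers",   # upstream that receives the draft PR
)

result = agent.run_translation_pr_workflow(
    reference_pr_url="https://github.com/huggingface/transformers/pull/24968",  # style reference
    target_language="ko",                                  # spliced into the docs path: /en/ -> /ko/
    filepath="docs/source/en/accelerator_selection.md",    # illustrative source path
    translated_doc="...translated markdown...",
    base_branch="main",
)

# status is "success", "partial_success", or "error"; "message" is always present.
print(result["status"], result.get("pr_url", result["message"]))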
pr_generator/searcher.py CHANGED
@@ -1,238 +1,238 @@
1
- """
2
- GitHub PR Search Agent
3
- An agent that finds a suitable reference PR when a reference PR URL is not provided.
4
- """
5
-
6
- import os
7
- import re
8
- import logging
9
- from typing import List, Dict, Any, Optional
10
-
11
- # Load environment variables
12
- from dotenv import load_dotenv
13
-
14
- load_dotenv()
15
-
16
- # Setup logging
17
- logging.basicConfig(
18
- level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
19
- )
20
- logger = logging.getLogger(__name__)
21
-
22
- # Langchain imports
23
- try:
24
- from langchain_anthropic import ChatAnthropic
25
- from langchain.tools import StructuredTool
26
- from langchain.agents import AgentExecutor, create_tool_calling_agent
27
- from langchain_core.prompts import ChatPromptTemplate
28
- from github import Github
29
-
30
- REQUIRED_LIBS_AVAILABLE = True
31
- except ImportError as e:
32
- print(f"Required libraries are not installed: {e}")
33
- REQUIRED_LIBS_AVAILABLE = False
34
-
35
- # Constants
36
- ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
37
- DEFAULT_TEMPERATURE = 0.0
38
- # Fallback PR URL to ensure a PR is always returned
39
- DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"
40
-
41
-
42
- class GitHubPRSearcher:
43
- """GitHub PR Searcher - now using a LangChain agent."""
44
-
45
- def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
46
- """
47
- Searches GitHub for pull requests matching the query and returns the top 5 results.
48
- The query should be a valid GitHub search query.
49
- """
50
- logger.info(f"Executing GitHub search with query: {query}")
51
- try:
52
- issues = self.github_client.search_issues(query=query)
53
- # Take top 5 to keep context small for the agent
54
- top_issues = issues.get_page(0)[:5]
55
-
56
- if not top_issues:
57
- return []
58
-
59
- return [
60
- {"title": issue.title, "url": issue.html_url, "number": issue.number}
61
- for issue in top_issues
62
- ]
63
- except Exception as e:
64
- logger.error(f"Error during GitHub search: {e}", exc_info=True)
65
- # Return an error message that the agent can understand
66
- return [{"error": f"An error occurred during search: {e}"}]
67
-
68
- def __init__(self):
69
- if not REQUIRED_LIBS_AVAILABLE:
70
- raise ImportError("Required libraries for agent could not be found.")
71
-
72
- self._github_client = None
73
- self.llm = ChatAnthropic(
74
- model=ANTHROPIC_MODEL_ID,
75
- temperature=DEFAULT_TEMPERATURE,
76
- )
77
-
78
- search_tool = StructuredTool.from_function(
79
- func=self._search_github_prs,
80
- name="search_github_prs",
81
- description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
82
- )
83
- tools = [search_tool]
84
-
85
- prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.
86
-
87
- You need to find a merged PR in the repository: {owner}/{repo_name}.
88
- The PR should be for a documentation translation into **{target_language}**.
89
- The context for the translation is: **{context}**.
90
-
91
- Use the tools at your disposal to search for relevant PRs.
92
- Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.
93
-
94
- Here is an example of a good search query you could use:
95
- `repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`
96
-
97
- After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""
98
-
99
- prompt = ChatPromptTemplate.from_messages(
100
- [
101
- ("system", prompt_string),
102
- (
103
- "human",
104
- "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
105
- ),
106
- ("placeholder", "{agent_scratchpad}"),
107
- ]
108
- )
109
-
110
- agent = create_tool_calling_agent(self.llm, tools, prompt)
111
- self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
112
-
113
- @property
114
- def github_client(self) -> Optional[Github]:
115
- """Lazy initialization of the GitHub API client."""
116
- if not REQUIRED_LIBS_AVAILABLE:
117
- raise ImportError("Required libraries could not be found.")
118
-
119
- if self._github_client is None:
120
- token = os.environ.get("GITHUB_TOKEN")
121
- if not token:
122
- print("Warning: GITHUB_TOKEN environment variable is not set.")
123
- self._github_client = Github() # Limited access
124
- else:
125
- self._github_client = Github(token)
126
- return self._github_client
127
-
128
- def find_best_reference_pr(
129
- self, owner: str, repo_name: str, target_language: str, context: str
130
- ):
131
- """
132
- Finds the best reference PR using a LangChain agent.
133
- Yields progress and returns the final PR URL.
134
- """
135
- message = "🤖 Agent is searching for the best reference PR..."
136
- logger.info(message)
137
- yield message
138
-
139
- try:
140
- agent_input = {
141
- "owner": owner,
142
- "repo_name": repo_name,
143
- "target_language": target_language,
144
- "context": context,
145
- }
146
-
147
- agent_output = None
148
- for event in self.agent_executor.stream(agent_input):
149
- if "actions" in event and event["actions"]:
150
- action = event["actions"][0]
151
- tool_query = action.tool_input.get("query", str(action.tool_input))
152
- message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
153
- logger.info(message)
154
- yield message
155
- elif "steps" in event and event["steps"]:
156
- message = "📊 Agent is analyzing the results from the tool..."
157
- logger.info(message)
158
- yield message
159
- elif "output" in event and event["output"]:
160
- agent_output = event["output"]
161
-
162
- if not agent_output:
163
- message = "⚠️ Agent failed to find a suitable PR. Using default PR."
164
- logger.warning(message)
165
- yield message
166
- return DEFAULT_FALLBACK_PR_URL
167
-
168
- # The agent's final output can be a string, a list of tool results,
169
- # or a list of content blocks from the LLM. We'll find the URL
170
- # by searching for it in the string representation of the output.
171
- output_text = str(agent_output)
172
- urls = re.findall(r"https?://github.com/[^/]+/[^/]+/pull/\d+", output_text)
173
-
174
- final_url = ""
175
- if urls:
176
- final_url = urls[-1] # Take the last URL found
177
-
178
- if not final_url:
179
- message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
180
- logger.warning(message)
181
- yield message
182
- return DEFAULT_FALLBACK_PR_URL
183
-
184
- message = f"✅ Selected the best PR:\n`{final_url}`"
185
- logger.info(f"Selected the best PR: {final_url}")
186
- yield message
187
- return final_url
188
-
189
- except Exception as e:
190
- message = f"❌ Error during agent execution: {e}\nUsing default PR."
191
- logger.error(message, exc_info=True)
192
- yield message
193
- return DEFAULT_FALLBACK_PR_URL
194
-
195
-
196
- def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
197
- """
198
- A simple function to find a reference PR, streaming progress.
199
- This function always searches in the 'huggingface/transformers' repository.
200
- """
201
- searcher = GitHubPRSearcher()
202
- stream_generator = searcher.find_best_reference_pr(
203
- "huggingface", "transformers", target_language, context
204
- )
205
- # The handler will receive the final URL from the generator's return statement
206
- final_url = yield from stream_generator
207
-
208
- # Format the final result as expected by the handler
209
- return {
210
- "status": "success",
211
- "result": f"Recommended PR URL: {final_url}",
212
- "repository": "huggingface/transformers",
213
- "target_language": target_language,
214
- }
215
-
216
-
217
- # Example usage
218
- if __name__ == "__main__":
219
- # Example execution for streaming
220
- # In a real application, a generator consumer (like the one in handler.py)
221
- # would process the yielded values. This script simulates that.
222
- print("--- Running Streaming Search Simulation ---")
223
-
224
- def run_simulation():
225
- """Simulates the consumption of the streaming generator."""
226
- test_gen = find_reference_pr_simple_stream(
227
- target_language="korean", context="docs"
228
- )
229
- try:
230
- while True:
231
- # This will print progress messages
232
- print(next(test_gen))
233
- except StopIteration as e:
234
- # When the generator is exhausted, the final result is in e.value
235
- print("\n--- FINAL RESULT ---")
236
- print(e.value)
237
-
238
- run_simulation()
 
1
+ """
2
+ GitHub PR Search Agent
3
+ An agent that finds a suitable reference PR when a reference PR URL is not provided.
4
+ """
5
+
6
+ import os
7
+ import re
8
+ import logging
9
+ from typing import List, Dict, Any, Optional
10
+
11
+ # Load environment variables
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
+
16
+ # Setup logging
17
+ logging.basicConfig(
18
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
19
+ )
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Langchain imports
23
+ try:
24
+ from langchain_anthropic import ChatAnthropic
25
+ from langchain.tools import StructuredTool
26
+ from langchain.agents import AgentExecutor, create_tool_calling_agent
27
+ from langchain_core.prompts import ChatPromptTemplate
28
+ from github import Github
29
+
30
+ REQUIRED_LIBS_AVAILABLE = True
31
+ except ImportError as e:
32
+ print(f"Required libraries are not installed: {e}")
33
+ REQUIRED_LIBS_AVAILABLE = False
34
+
35
+ # Constants
36
+ ANTHROPIC_MODEL_ID = "claude-sonnet-4-20250514"
37
+ DEFAULT_TEMPERATURE = 0.0
38
+ # Fallback PR URL to ensure a PR is always returned
39
+ DEFAULT_FALLBACK_PR_URL = "https://github.com/huggingface/transformers/pull/24968"
40
+
41
+
42
+ class GitHubPRSearcher:
43
+ """GitHub PR Searcher - now using a LangChain agent."""
44
+
45
+ def _search_github_prs(self, query: str) -> List[Dict[str, Any]]:
46
+ """
47
+ Searches GitHub for pull requests matching the query and returns the top 5 results.
48
+ The query should be a valid GitHub search query.
49
+ """
50
+ logger.info(f"Executing GitHub search with query: {query}")
51
+ try:
52
+ issues = self.github_client.search_issues(query=query)
53
+ # Take top 5 to keep context small for the agent
54
+ top_issues = issues.get_page(0)[:5]
55
+
56
+ if not top_issues:
57
+ return []
58
+
59
+ return [
60
+ {"title": issue.title, "url": issue.html_url, "number": issue.number}
61
+ for issue in top_issues
62
+ ]
63
+ except Exception as e:
64
+ logger.error(f"Error during GitHub search: {e}", exc_info=True)
65
+ # Return an error message that the agent can understand
66
+ return [{"error": f"An error occurred during search: {e}"}]
67
+
68
+ def __init__(self):
69
+ if not REQUIRED_LIBS_AVAILABLE:
70
+ raise ImportError("Required libraries for agent could not be found.")
71
+
72
+ self._github_client = None
73
+ self.llm = ChatAnthropic(
74
+ model=ANTHROPIC_MODEL_ID,
75
+ temperature=DEFAULT_TEMPERATURE,
76
+ )
77
+
78
+ search_tool = StructuredTool.from_function(
79
+ func=self._search_github_prs,
80
+ name="search_github_prs",
81
+ description="Searches GitHub for pull requests matching the query and returns the top 5 results. The query should be a valid GitHub search query.",
82
+ )
83
+ tools = [search_tool]
84
+
85
+ prompt_string = """You are a GitHub expert. Your mission is to find the best reference pull request (PR) for a given task.
86
+
87
+ You need to find a merged PR in the repository: {owner}/{repo_name}.
88
+ The PR should be for a documentation translation into **{target_language}**.
89
+ The context for the translation is: **{context}**.
90
+
91
+ Use the tools at your disposal to search for relevant PRs.
92
+ Analyze the search results and select the one that best matches the request. A good PR is usually one that has "translation", "docs", "i18n", and the target language in its title.
93
+
94
+ Here is an example of a good search query you could use:
95
+ `repo:{owner}/{repo_name} is:pr is:merged "{target_language}" "{context}" i18n translation docs`
96
+
97
+ After your analysis, you MUST output **only the final URL** of the best PR you have chosen. Do not include any other text in your final response."""
98
+
99
+ prompt = ChatPromptTemplate.from_messages(
100
+ [
101
+ ("system", prompt_string),
102
+ (
103
+ "human",
104
+ "Find the best reference PR for translating docs to {target_language} about {context} in the {owner}/{repo_name} repository.",
105
+ ),
106
+ ("placeholder", "{agent_scratchpad}"),
107
+ ]
108
+ )
109
+
110
+ agent = create_tool_calling_agent(self.llm, tools, prompt)
111
+ self.agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
112
+
113
+ @property
114
+ def github_client(self) -> Optional[Github]:
115
+ """Lazy initialization of the GitHub API client."""
116
+ if not REQUIRED_LIBS_AVAILABLE:
117
+ raise ImportError("Required libraries could not be found.")
118
+
119
+ if self._github_client is None:
120
+ token = os.environ.get("GITHUB_TOKEN")
121
+ if not token:
122
+ print("Warning: GITHUB_TOKEN environment variable is not set.")
123
+ self._github_client = Github() # Limited access
124
+ else:
125
+ self._github_client = Github(token)
126
+ return self._github_client
127
+
128
+ def find_best_reference_pr(
129
+ self, owner: str, repo_name: str, target_language: str, context: str
130
+ ):
131
+ """
132
+ Finds the best reference PR using a LangChain agent.
133
+ Yields progress and returns the final PR URL.
134
+ """
135
+ message = "🤖 Agent is searching for the best reference PR..."
136
+ logger.info(message)
137
+ yield message
138
+
139
+ try:
140
+ agent_input = {
141
+ "owner": owner,
142
+ "repo_name": repo_name,
143
+ "target_language": target_language,
144
+ "context": context,
145
+ }
146
+
147
+ agent_output = None
148
+ for event in self.agent_executor.stream(agent_input):
149
+ if "actions" in event and event["actions"]:
150
+ action = event["actions"][0]
151
+ tool_query = action.tool_input.get("query", str(action.tool_input))
152
+ message = f"🔍 Agent is using tool `{action.tool}` with query:\n`{tool_query}`"
153
+ logger.info(message)
154
+ yield message
155
+ elif "steps" in event and event["steps"]:
156
+ message = "📊 Agent is analyzing the results from the tool..."
157
+ logger.info(message)
158
+ yield message
159
+ elif "output" in event and event["output"]:
160
+ agent_output = event["output"]
161
+
162
+ if not agent_output:
163
+ message = "⚠️ Agent failed to find a suitable PR. Using default PR."
164
+ logger.warning(message)
165
+ yield message
166
+ return DEFAULT_FALLBACK_PR_URL
167
+
168
+ # The agent's final output can be a string, a list of tool results,
169
+ # or a list of content blocks from the LLM. We'll find the URL
170
+ # by searching for it in the string representation of the output.
171
+ output_text = str(agent_output)
172
+ urls = re.findall(r"https?://github.com/[^/]+/[^/]+/pull/\d+", output_text)
173
+
174
+ final_url = ""
175
+ if urls:
176
+ final_url = urls[-1] # Take the last URL found
177
+
178
+ if not final_url:
179
+ message = f"⚠️ Agent returned unparsable output: {agent_output}. Using default PR."
180
+ logger.warning(message)
181
+ yield message
182
+ return DEFAULT_FALLBACK_PR_URL
183
+
184
+ message = f"✅ Selected the best PR:\n`{final_url}`"
185
+ logger.info(f"Selected the best PR: {final_url}")
186
+ yield message
187
+ return final_url
188
+
189
+ except Exception as e:
190
+ message = f"❌ Error during agent execution: {e}\nUsing default PR."
191
+ logger.error(message, exc_info=True)
192
+ yield message
193
+ return DEFAULT_FALLBACK_PR_URL
194
+
195
+
196
+ def find_reference_pr_simple_stream(target_language: str = "", context: str = ""):
197
+ """
198
+ A simple function to find a reference PR, streaming progress.
199
+ This function always searches in the 'huggingface/transformers' repository.
200
+ """
201
+ searcher = GitHubPRSearcher()
202
+ stream_generator = searcher.find_best_reference_pr(
203
+ "huggingface", "transformers", target_language, context
204
+ )
205
+ # The handler will receive the final URL from the generator's return statement
206
+ final_url = yield from stream_generator
207
+
208
+ # Format the final result as expected by the handler
209
+ return {
210
+ "status": "success",
211
+ "result": f"Recommended PR URL: {final_url}",
212
+ "repository": "huggingface/transformers",
213
+ "target_language": target_language,
214
+ }
215
+
216
+
217
+ # Example usage
218
+ if __name__ == "__main__":
219
+ # Example execution for streaming
220
+ # In a real application, a generator consumer (like the one in handler.py)
221
+ # would process the yielded values. This script simulates that.
222
+ print("--- Running Streaming Search Simulation ---")
223
+
224
+ def run_simulation():
225
+ """Simulates the consumption of the streaming generator."""
226
+ test_gen = find_reference_pr_simple_stream(
227
+ target_language="korean", context="docs"
228
+ )
229
+ try:
230
+ while True:
231
+ # This will print progress messages
232
+ print(next(test_gen))
233
+ except StopIteration as e:
234
+ # When the generator is exhausted, the final result is in e.value
235
+ print("\n--- FINAL RESULT ---")
236
+ print(e.value)
237
+
238
+ run_simulation()
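One detail of the streaming design above that is easy to miss: `find_best_reference_pr` yields progress strings and delivers its final value through `return`, which a `yield from` caller receives as the delegation result and a manual consumer reads from `StopIteration.value`. A minimal, self-contained sketch of that contract (the generator names and messages here are hypothetical, not part of the module):

```python
from typing import Generator


def search(query: str) -> Generator[str, None, str]:
    """Yield progress messages, then return a final result."""
    yield f"searching for {query!r}..."
    yield "ranking candidates..."
    return "https://github.com/example/repo/pull/1"  # placeholder URL


def wrapper(query: str) -> Generator[str, None, dict]:
    # `yield from` re-yields every progress message and captures the delegated
    # generator's return value, as find_reference_pr_simple_stream does above.
    final_url = yield from search(query)
    return {"status": "success", "result": final_url}


gen = wrapper("docs translation")
try:
    while True:
        print(next(gen))       # progress messages
except StopIteration as stop:
    print(stop.value)          # the wrapper's returned dict
```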
requirements.txt CHANGED
@@ -1,11 +1,11 @@
1
- gradio==5.33.0
2
- requests
3
- pydantic
4
- langchain-anthropic
5
- python-dotenv
6
- langchain
7
- PyGithub
8
- langchain-core
9
- langchain-community
10
- boto3
11
  PyYAML
 
1
+ gradio==5.33.0
2
+ requests
3
+ pydantic
4
+ langchain-anthropic
5
+ python-dotenv
6
+ langchain
7
+ PyGithub
8
+ langchain-core
9
+ langchain-community
10
+ boto3
11
  PyYAML
test/test_final_translate.md CHANGED
@@ -1,127 +1,127 @@
1
- <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
- the License. You may obtain a copy of the License at
5
-
6
- http://www.apache.org/licenses/LICENSE-2.0
7
-
8
- Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
- an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
- specific language governing permissions and limitations under the License.
11
-
12
- ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
- rendered properly in your Markdown viewer.
14
-
15
- -->
16
-
17
- # 가속기 선택 [[accelerator-selection]]
18
-
19
- 분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
20
-
21
- 이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
22
-
23
- ## 가속기 개수 [[number-of-accelerators]]
24
-
25
- 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
26
-
27
- <hfoptions id="select-accelerator">
28
- <hfoption id="torchrun">
29
-
30
- 사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
31
-
32
- ```bash
33
- torchrun --nproc_per_node=2 trainer-program.py ...
34
- ```
35
-
36
- </hfoption>
37
- <hfoption id="Accelerate">
38
-
39
- 사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
40
-
41
- ```bash
42
- accelerate launch --num_processes 2 trainer-program.py ...
43
- ```
44
-
45
- </hfoption>
46
- <hfoption id="🤗 DeepSpeed">
47
-
48
- 사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
49
-
50
- ```bash
51
- deepspeed --num_gpus 2 trainer-program.py ...
52
- ```
53
-
54
- </hfoption>
55
- </hfoptions>
56
-
57
- ## 가속기 순서 [[order-of-accelerators]]
58
- 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
59
-
60
- 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
61
-
62
- <hfoptions id="accelerator-type">
63
- <hfoption id="CUDA">
64
-
65
- ```bash
66
- CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
67
- ```
68
-
69
- GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
70
- 순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
71
-
72
-
73
- ```bash
74
- CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
75
- ```
76
-
77
- GPU 없이 실행하려면:
78
-
79
- ```bash
80
- CUDA_VISIBLE_DEVICES= python trainer-program.py ...
81
- ```
82
-
83
- `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
84
-
85
- - PCIe 버스 ID 순서(`nvidia-smi`와 일치):
86
-
87
- ```bash
88
- export CUDA_DEVICE_ORDER=PCI_BUS_ID
89
- ```
90
-
91
- - 컴퓨팅 성능 순서(가장 빠른 것부터):
92
-
93
- ```bash
94
- export CUDA_DEVICE_ORDER=FASTEST_FIRST
95
- ```
96
-
97
- </hfoption>
98
- <hfoption id="Intel XPU">
99
-
100
- ```bash
101
- ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
102
- ```
103
-
104
- XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
105
- 순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
106
-
107
- ```bash
108
- ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
109
- ```
110
-
111
-
112
- 다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
113
-
114
- ```bash
115
- export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
116
- ```
117
-
118
- Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
119
-
120
- </hfoption>
121
- </hfoptions>
122
-
123
-
124
-
125
- > [!WARNING]
126
- > 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
127
-
 
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # 가속기 선택 [[accelerator-selection]]
18
+
19
+ 분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
20
+
21
+ 이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
22
+
23
+ ## 가속기 개수 [[number-of-accelerators]]
24
+
25
+ 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
26
+
27
+ <hfoptions id="select-accelerator">
28
+ <hfoption id="torchrun">
29
+
30
+ 사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
31
+
32
+ ```bash
33
+ torchrun --nproc_per_node=2 trainer-program.py ...
34
+ ```
35
+
36
+ </hfoption>
37
+ <hfoption id="Accelerate">
38
+
39
+ 사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
40
+
41
+ ```bash
42
+ accelerate launch --num_processes 2 trainer-program.py ...
43
+ ```
44
+
45
+ </hfoption>
46
+ <hfoption id="🤗 DeepSpeed">
47
+
48
+ 사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
49
+
50
+ ```bash
51
+ deepspeed --num_gpus 2 trainer-program.py ...
52
+ ```
53
+
54
+ </hfoption>
55
+ </hfoptions>
56
+
57
+ ## 가속기 순서 [[order-of-accelerators]]
58
+ 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
59
+
60
+ 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
61
+
62
+ <hfoptions id="accelerator-type">
63
+ <hfoption id="CUDA">
64
+
65
+ ```bash
66
+ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
67
+ ```
68
+
69
+ GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
70
+ 순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
71
+
72
+
73
+ ```bash
74
+ CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
75
+ ```
76
+
77
+ GPU 없이 실행하려면:
78
+
79
+ ```bash
80
+ CUDA_VISIBLE_DEVICES= python trainer-program.py ...
81
+ ```
82
+
83
+ `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
84
+
85
+ - PCIe 버스 ID 순서(`nvidia-smi`와 일치):
86
+
87
+ ```bash
88
+ export CUDA_DEVICE_ORDER=PCI_BUS_ID
89
+ ```
90
+
91
+ - 컴퓨팅 성능 순서(가장 빠른 것부터):
92
+
93
+ ```bash
94
+ export CUDA_DEVICE_ORDER=FASTEST_FIRST
95
+ ```
96
+
97
+ </hfoption>
98
+ <hfoption id="Intel XPU">
99
+
100
+ ```bash
101
+ ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
102
+ ```
103
+
104
+ XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
105
+ 순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
106
+
107
+ ```bash
108
+ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
109
+ ```
110
+
111
+
112
+ 다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
113
+
114
+ ```bash
115
+ export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
116
+ ```
117
+
118
+ Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
119
+
120
+ </hfoption>
121
+ </hfoptions>
122
+
123
+
124
+
125
+ > [!WARNING]
126
+ > 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
127
+
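The device-selection behaviour documented above can also be checked from Python. A small sketch, assuming PyTorch with CUDA is installed (not part of this repository):

```python
import os

# Must be set before CUDA is first initialised; with "2,0", physical GPU 2
# becomes cuda:0 and physical GPU 0 becomes cuda:1.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,0"

import torch  # noqa: E402  (imported after setting the environment variable)

if torch.cuda.is_available():
    for idx in range(torch.cuda.device_count()):
        print(f"cuda:{idx} -> {torch.cuda.get_device_name(idx)}")
else:
    print("No CUDA devices are visible.")
```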
test/test_prompt.py CHANGED
@@ -1,71 +1,71 @@
1
- output = """
2
- What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
3
- ```md
4
- # Accelerator selection
5
-
6
- During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
7
-
8
- This guide will show you how to select the number of accelerators to use and the order to use them in.
9
-
10
- ## Number of accelerators
11
-
12
- For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
13
-
14
- <hfoptions id="select-accelerator">
15
- <hfoption id="torchrun">
16
-
17
- Use the `--nproc_per_node` to select how many accelerators to use.
18
-
19
- </hfoption>
20
- <hfoption id="Accelerate">
21
-
22
- Use `--num_processes` to select how many accelerators to use.
23
-
24
- </hfoption>
25
- <hfoption id="DeepSpeed">
26
-
27
- Use `--num_gpus` to select how many GPUs to use.
28
-
29
- </hfoption>
30
- </hfoptions>
31
-
32
- ## Order of accelerators
33
- To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
34
-
35
- For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
36
-
37
- <hfoptions id="accelerator-type">
38
- <hfoption id="CUDA">
39
-
40
- Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
41
- To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
42
-
43
- To run without any GPUs:
44
-
45
- You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
46
-
47
- - Order by PCIe bus ID (matches `nvidia-smi`):
48
-
49
-
50
-
51
- - Order by compute capability (fastest first):
52
-
53
-
54
-
55
- </hfoption>
56
- <hfoption id="Intel XPU">
57
-
58
- Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
59
- To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
60
-
61
- You can also control the order of Intel XPUs with:
62
-
63
- For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
64
-
65
- </hfoption>
66
- </hfoptions>
67
-
68
- > [!WARNING]
69
- > Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
70
- ```
71
- """
 
1
+ output = """
2
+ What do these sentences about Hugging Face Transformers (a machine learning library) mean in Korean? Please do not translate the word after a 🤗 emoji as it is a product name. Output only the translated result without any explanations or introductions.
3
+ ```md
4
+ # Accelerator selection
5
+
6
+ During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
7
+
8
+ This guide will show you how to select the number of accelerators to use and the order to use them in.
9
+
10
+ ## Number of accelerators
11
+
12
+ For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
13
+
14
+ <hfoptions id="select-accelerator">
15
+ <hfoption id="torchrun">
16
+
17
+ Use the `--nproc_per_node` to select how many accelerators to use.
18
+
19
+ </hfoption>
20
+ <hfoption id="Accelerate">
21
+
22
+ Use `--num_processes` to select how many accelerators to use.
23
+
24
+ </hfoption>
25
+ <hfoption id="DeepSpeed">
26
+
27
+ Use `--num_gpus` to select how many GPUs to use.
28
+
29
+ </hfoption>
30
+ </hfoptions>
31
+
32
+ ## Order of accelerators
33
+ To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
34
+
35
+ For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
36
+
37
+ <hfoptions id="accelerator-type">
38
+ <hfoption id="CUDA">
39
+
40
+ Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
41
+ To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
42
+
43
+ To run without any GPUs:
44
+
45
+ You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
46
+
47
+ - Order by PCIe bus ID (matches `nvidia-smi`):
48
+
49
+
50
+
51
+ - Order by compute capability (fastest first):
52
+
53
+
54
+
55
+ </hfoption>
56
+ <hfoption id="Intel XPU">
57
+
58
+ Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
59
+ To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
60
+
61
+ You can also control the order of Intel XPUs with:
62
+
63
+ For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
64
+
65
+ </hfoption>
66
+ </hfoptions>
67
+
68
+ > [!WARNING]
69
+ > Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
70
+ ```
71
+ """
test/test_translate.py CHANGED
@@ -1,68 +1,68 @@
1
- translated_content = """
2
- # 가속기 선택
3
-
4
- 분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
5
-
6
- 이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
7
-
8
- ## 가속기 개수
9
-
10
- 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
11
-
12
- <hfoptions id="select-accelerator">
13
- <hfoption id="torchrun">
14
-
15
- 사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
16
-
17
- </hfoption>
18
- <hfoption id="Accelerate">
19
-
20
- 사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
21
-
22
- </hfoption>
23
- <hfoption id="🤗 DeepSpeed">
24
-
25
- 사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
26
-
27
- </hfoption>
28
- </hfoptions>
29
-
30
- ## 가속기 순서
31
- 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
32
-
33
- 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
34
-
35
- <hfoptions id="accelerator-type">
36
- <hfoption id="CUDA">
37
-
38
- GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
39
- 순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
40
-
41
- GPU 없이 실행하려면:
42
-
43
- `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
44
-
45
- - PCIe 버스 ID 순서(`nvidia-smi`와 일치):
46
-
47
-
48
-
49
- - 컴퓨팅 성능 순서(가장 빠른 것부터):
50
-
51
-
52
-
53
- </hfoption>
54
- <hfoption id="Intel XPU">
55
-
56
- XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
57
- 순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
58
-
59
- 다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
60
-
61
- Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
62
-
63
- </hfoption>
64
- </hfoptions>
65
-
66
- > [!WARNING]
67
- > 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
68
- """
 
1
+ translated_content = """
2
+ # 가속기 선택
3
+
4
+ 분산 훈련 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 개수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기들이 있을 때 더 빠른 가속기를 먼저 사용하고 싶거나, 사용 가능한 가속기 중 일부만 사용하고 싶을 때 유용합니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)이 필요하지 않습니다.
5
+
6
+ 이 가이드에서는 사용할 가속기의 개수와 사용 순서를 선택하는 방법을 보여드립니다.
7
+
8
+ ## 가속기 개수
9
+
10
+ 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령어를 실행하세요.
11
+
12
+ <hfoptions id="select-accelerator">
13
+ <hfoption id="torchrun">
14
+
15
+ 사용할 가속기 개수를 선택하려면 `--nproc_per_node`를 사용하세요.
16
+
17
+ </hfoption>
18
+ <hfoption id="Accelerate">
19
+
20
+ 사용할 가속기 개수를 선택하려면 `--num_processes`를 사용하세요.
21
+
22
+ </hfoption>
23
+ <hfoption id="🤗 DeepSpeed">
24
+
25
+ 사용할 GPU 개수를 선택하려면 `--num_gpus`를 사용하세요.
26
+
27
+ </hfoption>
28
+ </hfoptions>
29
+
30
+ ## 가속기 순서
31
+ 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 보통 각 실행마다 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 설정 파일에 추가할 수도 있습니다.
32
+
33
+ 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
34
+
35
+ <hfoptions id="accelerator-type">
36
+ <hfoption id="CUDA">
37
+
38
+ GPU 0과 2만 PyTorch에 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
39
+ 순서를 바꾸려면(GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
40
+
41
+ GPU 없이 실행하려면:
42
+
43
+ `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
44
+
45
+ - PCIe 버스 ID 순서(`nvidia-smi`와 일치):
46
+
47
+
48
+
49
+ - 컴퓨팅 성능 순서(가장 빠른 것부터):
50
+
51
+
52
+
53
+ </hfoption>
54
+ <hfoption id="Intel XPU">
55
+
56
+ XPU 0과 2만 PyTorch에 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
57
+ 순서를 바꾸려면(XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
58
+
59
+ 다음으로 Intel XPU의 순서를 제어할 수도 있습니다:
60
+
61
+ Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
62
+
63
+ </hfoption>
64
+ </hfoptions>
65
+
66
+ > [!WARNING]
67
+ > 환경 변수는 명령줄에 추가하는 대신 export할 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란스러울 수 있으므로 권장하지 않습니다. 대신 특정 훈련 실행을 위한 환경 변수를 같은 명령줄에서 설정하는 것이 일반적인 관행입니다.
68
+ """
translation_result/docs/source/en/accelerator_selection.md CHANGED
@@ -1,127 +1,127 @@
1
- <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
-
3
- Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
- the License. You may obtain a copy of the License at
5
-
6
- http://www.apache.org/licenses/LICENSE-2.0
7
-
8
- Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
- an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
- specific language governing permissions and limitations under the License.
11
-
12
- ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
- rendered properly in your Markdown viewer.
14
-
15
- -->
16
-
17
- # 가속기 선택 [[accelerator-selection]]
18
-
19
- 분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)는 필요하지 않습니다.
20
-
21
- 이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
22
-
23
- ## 가속기 수 [[number-of-accelerators]]
24
-
25
- 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.
26
-
27
- <hfoptions id="select-accelerator">
28
- <hfoption id="torchrun">
29
-
30
- `--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
31
-
32
- ```bash
33
- torchrun --nproc_per_node=2 trainer-program.py ...
34
- ```
35
-
36
- </hfoption>
37
- <hfoption id="Accelerate">
38
-
39
- `--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
40
-
41
- ```bash
42
- accelerate launch --num_processes 2 trainer-program.py ...
43
- ```
44
-
45
- </hfoption>
46
- <hfoption id="DeepSpeed">
47
-
48
- `--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
49
-
50
- ```bash
51
- deepspeed --num_gpus 2 trainer-program.py ...
52
- ```
53
-
54
- </hfoption>
55
- </hfoptions>
56
-
57
- ## 가속기 순서 [[order-of-accelerators]]
58
- 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
59
-
60
- 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
61
-
62
- <hfoptions id="accelerator-type">
63
- <hfoption id="CUDA">
64
-
65
- ```bash
66
- CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
67
- ```
68
-
69
- GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
70
- 순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
71
-
72
-
73
- ```bash
74
- CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
75
- ```
76
-
77
- GPU 없이 실행하려면:
78
-
79
- ```bash
80
- CUDA_VISIBLE_DEVICES= python trainer-program.py ...
81
- ```
82
-
83
- `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
84
-
85
- - PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
86
-
87
- ```bash
88
- export CUDA_DEVICE_ORDER=PCI_BUS_ID
89
- ```
90
-
91
- - 컴퓨팅 성능 순서 (가장 빠른 것부터):
92
-
93
- ```bash
94
- export CUDA_DEVICE_ORDER=FASTEST_FIRST
95
- ```
96
-
97
- </hfoption>
98
- <hfoption id="Intel XPU">
99
-
100
- ```bash
101
- ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
102
- ```
103
-
104
- XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
105
- 순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
106
-
107
- ```bash
108
- ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
109
- ```
110
-
111
-
112
- 다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
113
-
114
- ```bash
115
- export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
116
- ```
117
-
118
- Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
119
-
120
- </hfoption>
121
- </hfoptions>
122
-
123
-
124
-
125
- > [!WARNING]
126
- > 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
127
  ```
 
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # 가속기 선택 [[accelerator-selection]]
18
+
19
+ 분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)는 필요하지 않습니다.
20
+
21
+ 이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
22
+
23
+ ## 가속기 수 [[number-of-accelerators]]
24
+
25
+ 예를 들어, 4개의 가속기가 있고 처음 2개만 사용하고 싶다면 아래 명령을 실행하세요.
26
+
27
+ <hfoptions id="select-accelerator">
28
+ <hfoption id="torchrun">
29
+
30
+ `--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
31
+
32
+ ```bash
33
+ torchrun --nproc_per_node=2 trainer-program.py ...
34
+ ```
35
+
36
+ </hfoption>
37
+ <hfoption id="Accelerate">
38
+
39
+ `--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
40
+
41
+ ```bash
42
+ accelerate launch --num_processes 2 trainer-program.py ...
43
+ ```
44
+
45
+ </hfoption>
46
+ <hfoption id="DeepSpeed">
47
+
48
+ `--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
49
+
50
+ ```bash
51
+ deepspeed --num_gpus 2 trainer-program.py ...
52
+ ```
53
+
54
+ </hfoption>
55
+ </hfoptions>
56
+
57
+ ## 가속기 순서 [[order-of-accelerators]]
58
+ 사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
59
+
60
+ 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
61
+
62
+ <hfoptions id="accelerator-type">
63
+ <hfoption id="CUDA">
64
+
65
+ ```bash
66
+ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
67
+ ```
68
+
69
+ GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
70
+ 순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
71
+
72
+
73
+ ```bash
74
+ CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
75
+ ```
76
+
77
+ GPU 없이 실행하려면:
78
+
79
+ ```bash
80
+ CUDA_VISIBLE_DEVICES= python trainer-program.py ...
81
+ ```
82
+
83
+ `CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
84
+
85
+ - PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
86
+
87
+ ```bash
88
+ export CUDA_DEVICE_ORDER=PCI_BUS_ID
89
+ ```
90
+
91
+ - 컴퓨팅 성능 순서 (가장 빠른 것부터):
92
+
93
+ ```bash
94
+ export CUDA_DEVICE_ORDER=FASTEST_FIRST
95
+ ```
96
+
97
+ </hfoption>
98
+ <hfoption id="Intel XPU">
99
+
100
+ ```bash
101
+ ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
102
+ ```
103
+
104
+ XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
105
+ 순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
106
+
107
+ ```bash
108
+ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
109
+ ```
110
+
111
+
112
+ 다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
113
+
114
+ ```bash
115
+ export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
116
+ ```
117
+
118
+ Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
119
+
120
+ </hfoption>
121
+ </hfoptions>
122
+
123
+
124
+
125
+ > [!WARNING]
126
+ > 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
127
  ```
translator/content.py CHANGED
@@ -1,214 +1,214 @@
1
- import os
2
- import re
3
- import string
4
-
5
- import requests
6
- from langchain.callbacks import get_openai_callback
7
- from langchain_anthropic import ChatAnthropic
8
- import boto3
9
- import json
10
-
11
- from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
12
- from translator.project_config import get_project_config
13
-
14
-
15
- def get_content(filepath: str, project: str = "transformers") -> str:
16
- if filepath == "":
17
- raise ValueError("No files selected for translation.")
18
-
19
- config = get_project_config(project)
20
- # Extract repo path from repo_url (e.g., "huggingface/transformers")
21
- repo_path = config.repo_url.replace("https://github.com/", "")
22
-
23
- url = f"https://raw.githubusercontent.com/{repo_path}/main/{filepath}"
24
- response = requests.get(url)
25
- if response.status_code == 200:
26
- content = response.text
27
- return content
28
- else:
29
- raise ValueError("Failed to retrieve content from the URL.", url)
30
-
31
-
32
- def preprocess_content(content: str) -> str:
33
- # Extract text to translate from document
34
-
35
- ## ignore top license comment
36
- to_translate = content[content.find("#") :]
37
- ## remove code blocks from text
38
- # to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
39
- ## remove markdown tables from text
40
- # to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
41
- ## remove empty lines from text
42
- to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
43
- return to_translate
44
-
45
-
46
- def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
47
- base_prompt = string.Template(
48
- "What do these sentences about Hugging Face Transformers "
49
- "(a machine learning library) mean in $language? "
50
- "Please do not translate the word after a 🤗 emoji "
51
- "as it is a product name. Output the complete markdown file**, with prose translated and all other content intact"
52
- "No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
53
- ).safe_substitute(language=language)
54
-
55
- base_prompt += "\n\n```md"
56
-
57
- full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
58
-
59
- if additional_instruction.strip():
60
- full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"
61
-
62
- return full_prompt
63
-
64
-
65
- def split_markdown_sections(markdown: str) -> list:
66
- # Find all titles using regular expressions
67
- return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
68
- # format is like [level, title, content, level, title, content, ...]
69
-
70
-
71
- def get_anchors(divided: list) -> list:
72
- anchors = []
73
- # from https://github.com/huggingface/doc-builder/blob/01b262bae90d66e1150cdbf58c83c02733ed4366/src/doc_builder/build_doc.py#L300-L302
74
- for title in divided[1::3]:
75
- anchor = re.sub(r"[^a-z0-9\s]+", "", title.lower())
76
- anchor = re.sub(r"\s{2,}", " ", anchor.strip()).replace(" ", "-")
77
- anchors.append(f"[[{anchor}]]")
78
- return anchors
79
-
80
-
81
- def make_scaffold(content: str, to_translate: str) -> string.Template:
82
- scaffold = content
83
- for i, text in enumerate(to_translate.split("\n\n")):
84
- scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
85
- print("inner scaffold:")
86
- print(scaffold)
87
- return string.Template(scaffold)
88
-
89
-
90
- def is_in_code_block(text: str, position: int) -> bool:
91
- """Check if a position in text is inside a code block"""
92
- text_before = text[:position]
93
- code_block_starts = text_before.count("```")
94
- return code_block_starts % 2 == 1
95
-
96
-
97
- def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
98
- scaffold = make_scaffold(content, to_translate)
99
- print("scaffold:")
100
- print(scaffold.template)
101
-
102
- # Get original text sections to maintain structure
103
- original_sections = to_translate.split("\n\n")
104
-
105
- # Split markdown sections to get headers and anchors
106
- divided = split_markdown_sections(to_translate)
107
- print("divided:")
108
- print(divided)
109
- anchors = get_anchors(divided)
110
-
111
- # Split translated content by markdown sections
112
- translated_divided = split_markdown_sections(translated)
113
- print("translated divided:")
114
- print(translated_divided)
115
-
116
- # Ensure we have the same number of headers as the original
117
- if len(translated_divided[1::3]) != len(anchors):
118
- print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
119
- # Adjust anchors list to match translated headers
120
- if len(translated_divided[1::3]) < len(anchors):
121
- anchors = anchors[:len(translated_divided[1::3])]
122
- else:
123
- # Add empty anchors for extra headers
124
- anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))
125
-
126
- # Add anchors to translated headers only if they're not in code blocks
127
- for i, korean_title in enumerate(translated_divided[1::3]):
128
- if i < len(anchors):
129
- # Find the position of this header in the original translated text
130
- header_pos = translated.find(korean_title.strip())
131
- if header_pos != -1 and not is_in_code_block(translated, header_pos):
132
- translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
133
- else:
134
- translated_divided[1 + i * 3] = korean_title
135
-
136
- # Reconstruct translated content with proper structure
137
- reconstructed_translated = "".join([
138
- "".join(translated_divided[i * 3 : i * 3 + 3])
139
- for i in range(len(translated_divided) // 3)
140
- ])
141
-
142
- # Split by double newlines to match original structure
143
- translated_sections = reconstructed_translated.split("\n\n")
144
-
145
- print("scaffold template count:")
146
- print(scaffold.template.count("$hf_i18n_placeholder"))
147
- print("original sections length:")
148
- print(len(original_sections))
149
- print("translated sections length:")
150
- print(len(translated_sections))
151
-
152
- # Ensure section counts match
153
- placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
154
-
155
- if len(translated_sections) < placeholder_count:
156
- # Add empty sections if translated has fewer sections
157
- translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
158
- elif len(translated_sections) > placeholder_count:
159
- # Truncate if translated has more sections
160
- translated_sections = translated_sections[:placeholder_count]
161
-
162
- # Final check
163
- if len(translated_sections) != placeholder_count:
164
- return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"
165
-
166
- translated_doc = scaffold.safe_substitute(
167
- {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
168
- )
169
- return translated_doc
170
-
171
-
172
- def llm_translate(to_translate: str) -> tuple[str, str]:
173
- anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
174
- aws_bearer_token_bedrock = os.environ.get("AWS_BEARER_TOKEN_BEDROCK")
175
-
176
- if anthropic_api_key:
177
- # Use Anthropic API Key
178
- model = ChatAnthropic(
179
- model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
180
- )
181
- ai_message = model.invoke(to_translate)
182
- cb = "Anthropic API Key used"
183
- return str(cb), ai_message.content
184
-
185
- elif aws_bearer_token_bedrock:
186
- # Use AWS Bedrock with bearer token (assuming standard AWS credential chain is configured)
187
- # Note: boto3 does not directly use a 'bearer_token' named environment variable for SigV4 authentication.
188
- # It relies on AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, or IAM roles.
189
- # If AWS_BEARER_TOKEN_BEDROCK is meant to be one of these, it should be renamed accordingly.
190
- # For now, we proceed assuming standard AWS credential chain is configured to pick up credentials.
191
- client = boto3.client("bedrock-runtime", region_name="eu-north-1")
192
-
193
- body = {
194
- "messages": [
195
- {"role": "user", "content": to_translate}
196
- ],
197
- "max_tokens": 128000,
198
- "anthropic_version": "bedrock-2023-05-31"
199
- }
200
-
201
- response = client.invoke_model(
202
- modelId="arn:aws:bedrock:eu-north-1:235729104418:inference-profile/eu.anthropic.claude-3-7-sonnet-20250219-v1:0",
203
- contentType="application/json",
204
- accept="application/json",
205
- body=json.dumps(body),
206
- )
207
- result = json.loads(response["body"].read())
208
- cb = result["usage"]
209
- ai_message = result["content"][0]["text"]
210
-
211
- return str(cb), ai_message
212
-
213
- else:
214
- raise ValueError("No API key found for translation. Please set ANTHROPIC_API_KEY or AWS_BEARER_TOKEN_BEDROCK environment variable.")
 
1
+ import os
2
+ import re
3
+ import string
4
+
5
+ import requests
6
+ from langchain.callbacks import get_openai_callback
7
+ from langchain_anthropic import ChatAnthropic
8
+ import boto3
9
+ import json
10
+
11
+ from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
12
+ from translator.project_config import get_project_config
13
+
14
+
15
+ def get_content(filepath: str, project: str = "transformers") -> str:
16
+ if filepath == "":
17
+ raise ValueError("No files selected for translation.")
18
+
19
+ config = get_project_config(project)
20
+ # Extract repo path from repo_url (e.g., "huggingface/transformers")
21
+ repo_path = config.repo_url.replace("https://github.com/", "")
22
+
23
+ url = f"https://raw.githubusercontent.com/{repo_path}/main/{filepath}"
24
+ response = requests.get(url)
25
+ if response.status_code == 200:
26
+ content = response.text
27
+ return content
28
+ else:
29
+ raise ValueError("Failed to retrieve content from the URL.", url)
30
+
31
+
32
+ def preprocess_content(content: str) -> str:
33
+ # Extract text to translate from document
34
+
35
+ ## ignore top license comment
36
+ to_translate = content[content.find("#") :]
37
+ ## remove code blocks from text
38
+ # to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
39
+ ## remove markdown tables from text
40
+ # to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
41
+ ## remove empty lines from text
42
+ to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
43
+ return to_translate
44
+
45
+
46
+ def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
47
+ base_prompt = string.Template(
48
+ "What do these sentences about Hugging Face Transformers "
49
+ "(a machine learning library) mean in $language? "
50
+ "Please do not translate the word after a 🤗 emoji "
51
+ "as it is a product name. Output the complete markdown file**, with prose translated and all other content intact"
52
+ "No explanations or extras—only the translated markdown. Also translate all comments within code blocks as well."
53
+ ).safe_substitute(language=language)
54
+
55
+ base_prompt += "\n\n```md"
56
+
57
+ full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
58
+
59
+ if additional_instruction.strip():
60
+ full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"
61
+
62
+ return full_prompt
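A hedged usage sketch for the two helpers above (the sample document is made up; `PROMPT_WITH_GLOSSARY` is appended by `get_full_prompt` itself):

```python
# Assumes this module is importable as translator.content, as in this repository.
from translator.content import get_full_prompt, preprocess_content

sample_doc = "<!--Copyright ... license header ...-->\n\n# Quicktour\n\nSome prose to translate.\n"
to_translate = preprocess_content(sample_doc)  # drops everything before the first '#'
prompt = get_full_prompt(
    "Korean",
    to_translate,
    additional_instruction="Keep code blocks untouched.",
)
print(prompt[:300])  # the request header followed by the fenced markdown source
```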
63
+
64
+
65
+ def split_markdown_sections(markdown: str) -> list:
66
+ # Find all titles using regular expressions
67
+ return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
68
+ # format is like [level, title, content, level, title, content, ...]
69
+
70
+
71
+ def get_anchors(divided: list) -> list:
72
+ anchors = []
73
+ # from https://github.com/huggingface/doc-builder/blob/01b262bae90d66e1150cdbf58c83c02733ed4366/src/doc_builder/build_doc.py#L300-L302
74
+ for title in divided[1::3]:
75
+ anchor = re.sub(r"[^a-z0-9\s]+", "", title.lower())
76
+ anchor = re.sub(r"\s{2,}", " ", anchor.strip()).replace(" ", "-")
77
+ anchors.append(f"[[{anchor}]]")
78
+ return anchors
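A quick illustration of the anchor slugs produced by the two helpers above (hedged; the import path assumes this repository layout):

```python
from translator.content import get_anchors, split_markdown_sections

md = "# Accelerator selection\n\nIntro.\n\n## Number of accelerators\n\nDetails.\n"
divided = split_markdown_sections(md)
print(divided[1::3])   # ['Accelerator selection', 'Number of accelerators']
print(get_anchors(divided))
# ['[[accelerator-selection]]', '[[number-of-accelerators]]']
```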
79
+
80
+
81
+ def make_scaffold(content: str, to_translate: str) -> string.Template:
82
+ scaffold = content
83
+ for i, text in enumerate(to_translate.split("\n\n")):
84
+ scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
85
+ print("inner scaffold:")
86
+ print(scaffold)
87
+ return string.Template(scaffold)
88
+
89
+
90
+ def is_in_code_block(text: str, position: int) -> bool:
91
+ """Check if a position in text is inside a code block"""
92
+ text_before = text[:position]
93
+ code_block_starts = text_before.count("```")
94
+ return code_block_starts % 2 == 1
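And the fence-parity check in action (hedged sketch, same assumed import path):

```python
from translator.content import is_in_code_block

text = "intro\n```py\nprint('hi')\n```\noutro\n"
print(is_in_code_block(text, text.index("print")))  # True: one ``` precedes it
print(is_in_code_block(text, text.index("outro")))  # False: two ``` precede it
```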
95
+
96
+
97
+ def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
98
+ scaffold = make_scaffold(content, to_translate)
99
+ print("scaffold:")
100
+ print(scaffold.template)
101
+
102
+ # Get original text sections to maintain structure
103
+ original_sections = to_translate.split("\n\n")
104
+
105
+ # Split markdown sections to get headers and anchors
106
+ divided = split_markdown_sections(to_translate)
107
+ print("divided:")
108
+ print(divided)
109
+ anchors = get_anchors(divided)
110
+
111
+ # Split translated content by markdown sections
112
+ translated_divided = split_markdown_sections(translated)
113
+ print("translated divided:")
114
+ print(translated_divided)
115
+
116
+ # Ensure we have the same number of headers as the original
117
+ if len(translated_divided[1::3]) != len(anchors):
118
+ print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
119
+ # Adjust anchors list to match translated headers
120
+ if len(translated_divided[1::3]) < len(anchors):
121
+ anchors = anchors[:len(translated_divided[1::3])]
122
+ else:
123
+ # Add empty anchors for extra headers
124
+ anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))
125
+
126
+ # Add anchors to translated headers only if they're not in code blocks
127
+ for i, korean_title in enumerate(translated_divided[1::3]):
128
+ if i < len(anchors):
129
+ # Find the position of this header in the original translated text
130
+ header_pos = translated.find(korean_title.strip())
131
+ if header_pos != -1 and not is_in_code_block(translated, header_pos):
132
+ translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
133
+ else:
134
+ translated_divided[1 + i * 3] = korean_title
135
+
136
+ # Reconstruct translated content with proper structure
137
+ reconstructed_translated = "".join([
138
+ "".join(translated_divided[i * 3 : i * 3 + 3])
139
+ for i in range(len(translated_divided) // 3)
140
+ ])
141
+
142
+ # Split by double newlines to match original structure
143
+ translated_sections = reconstructed_translated.split("\n\n")
144
+
145
+ print("scaffold template count:")
146
+ print(scaffold.template.count("$hf_i18n_placeholder"))
147
+ print("original sections length:")
148
+ print(len(original_sections))
149
+ print("translated sections length:")
150
+ print(len(translated_sections))
151
+
152
+ # Ensure section counts match
153
+ placeholder_count = scaffold.template.count("$hf_i18n_placeholder")
154
+
155
+ if len(translated_sections) < placeholder_count:
156
+ # Add empty sections if translated has fewer sections
157
+ translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
158
+ elif len(translated_sections) > placeholder_count:
159
+ # Truncate if translated has more sections
160
+ translated_sections = translated_sections[:placeholder_count]
161
+
162
+ # Final check
163
+ if len(translated_sections) != placeholder_count:
164
+ return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"
165
+
166
+ translated_doc = scaffold.safe_substitute(
167
+ {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
168
+ )
169
+ return translated_doc
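To make the placeholder mechanism concrete, here is a minimal, self-contained round trip in the style of `make_scaffold` and `fill_scaffold` (toy strings, not a real document):

```python
import string

content = "LICENSE HEADER\n\n# Title\n\nHello world.\n"
to_translate = "# Title\n\nHello world.\n"

# Swap each translatable paragraph for a $hf_i18n_placeholderN slot.
scaffold = content
for i, chunk in enumerate(to_translate.split("\n\n")):
    scaffold = scaffold.replace(chunk, f"$hf_i18n_placeholder{i}", 1)
template = string.Template(scaffold)

# Substitute the translated paragraphs back into the untouched skeleton.
translated_chunks = ["# 제목", "안녕하세요.\n"]
print(template.safe_substitute(
    {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_chunks)}
))
```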
170
+
171
+
172
+ def llm_translate(to_translate: str) -> tuple[str, str]:
173
+ anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")
174
+ aws_bearer_token_bedrock = os.environ.get("AWS_BEARER_TOKEN_BEDROCK")
175
+
176
+ if anthropic_api_key:
177
+ # Use Anthropic API Key
178
+ model = ChatAnthropic(
179
+ model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
180
+ )
181
+ ai_message = model.invoke(to_translate)
182
+ cb = "Anthropic API Key used"
183
+ return str(cb), ai_message.content
184
+
185
+ elif aws_bearer_token_bedrock:
186
+ # Use AWS Bedrock with bearer token (assuming standard AWS credential chain is configured)
187
+ # Note: boto3 does not directly use a 'bearer_token' named environment variable for SigV4 authentication.
188
+ # It relies on AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_SESSION_TOKEN, or IAM roles.
189
+ # If AWS_BEARER_TOKEN_BEDROCK is meant to be one of these, it should be renamed accordingly.
190
+ # For now, we proceed assuming standard AWS credential chain is configured to pick up credentials.
191
+ client = boto3.client("bedrock-runtime", region_name="eu-north-1")
192
+
193
+ body = {
194
+ "messages": [
195
+ {"role": "user", "content": to_translate}
196
+ ],
197
+ "max_tokens": 128000,
198
+ "anthropic_version": "bedrock-2023-05-31"
199
+ }
200
+
201
+ response = client.invoke_model(
202
+ modelId="arn:aws:bedrock:eu-north-1:235729104418:inference-profile/eu.anthropic.claude-3-7-sonnet-20250219-v1:0",
203
+ contentType="application/json",
204
+ accept="application/json",
205
+ body=json.dumps(body),
206
+ )
207
+ result = json.loads(response["body"].read())
208
+ cb = result["usage"]
209
+ ai_message = result["content"][0]["text"]
210
+
211
+ return str(cb), ai_message
212
+
213
+ else:
214
+ raise ValueError("No API key found for translation. Please set ANTHROPIC_API_KEY or AWS_BEARER_TOKEN_BEDROCK environment variable.")
translator/model.py CHANGED
@@ -1,70 +1,70 @@
1
- from enum import Enum, unique
2
-
3
- from pydantic import BaseModel, computed_field
4
-
5
-
6
- @unique
7
- class Languages(Enum):
8
- az = "az"
9
- bn = "bn"
10
- de = "de"
11
- em = "em"
12
- es = "es"
13
- fa = "fa"
14
- fr = "fr"
15
- he = "he"
16
- hu = "hu"
17
- id = "id"
18
- it = "it"
19
- ja = "ja"
20
- ko = "ko"
21
- pl = "pl"
22
- pt = "pt"
23
- ru = "ru"
24
- tr = "tr"
25
- uk = "uk"
26
- ur = "ur"
27
- vi = "vi"
28
- yo = "yo"
29
- zh = "zh"
30
- zh_hant = "zh-hant"
31
-
32
-
33
- class TranslationDoc(BaseModel):
34
- official_lang: str = "en"
35
- translation_lang: str
36
- original_file: str
37
- translation_file: str | None = None
38
- translation_exists: bool
39
-
40
-
41
- class Summary(BaseModel):
42
- lang: str
43
- files_analyzed: int = 0
44
- files_translated: int = 0
45
- files_outdated: int = 0
46
- files_missing_translation: int = 0
47
- files: list[TranslationDoc] = []
48
-
49
- @computed_field # type: ignore
50
- @property
51
- def percentage_missing_translation(self) -> float:
52
- try:
53
- return (
54
- 100 * float(self.files_missing_translation) / float(self.files_analyzed)
55
- )
56
- except Exception:
57
- return 0.0
58
-
59
- def append_file(self, doc: TranslationDoc) -> None:
60
- self.files.append(doc)
61
- self.files_analyzed += 1
62
-
63
- if doc.translation_exists:
64
- self.files_translated += 1
65
-
66
- if not doc.translation_exists:
67
- self.files_missing_translation += 1
68
-
69
- def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
70
- return list(filter(lambda d: not d.translation_exists, self.files))[:length]
 
1
+ from enum import Enum, unique
2
+
3
+ from pydantic import BaseModel, computed_field
4
+
5
+
6
+ @unique
7
+ class Languages(Enum):
8
+ az = "az"
9
+ bn = "bn"
10
+ de = "de"
11
+ em = "em"
12
+ es = "es"
13
+ fa = "fa"
14
+ fr = "fr"
15
+ he = "he"
16
+ hu = "hu"
17
+ id = "id"
18
+ it = "it"
19
+ ja = "ja"
20
+ ko = "ko"
21
+ pl = "pl"
22
+ pt = "pt"
23
+ ru = "ru"
24
+ tr = "tr"
25
+ uk = "uk"
26
+ ur = "ur"
27
+ vi = "vi"
28
+ yo = "yo"
29
+ zh = "zh"
30
+ zh_hant = "zh-hant"
31
+
32
+
33
+ class TranslationDoc(BaseModel):
34
+ official_lang: str = "en"
35
+ translation_lang: str
36
+ original_file: str
37
+ translation_file: str | None = None
38
+ translation_exists: bool
39
+
40
+
41
+ class Summary(BaseModel):
42
+ lang: str
43
+ files_analyzed: int = 0
44
+ files_translated: int = 0
45
+ files_outdated: int = 0
46
+ files_missing_translation: int = 0
47
+ files: list[TranslationDoc] = []
48
+
49
+ @computed_field # type: ignore
50
+ @property
51
+ def percentage_missing_translation(self) -> float:
52
+ try:
53
+ return (
54
+ 100 * float(self.files_missing_translation) / float(self.files_analyzed)
55
+ )
56
+ except Exception:
57
+ return 0.0
58
+
59
+ def append_file(self, doc: TranslationDoc) -> None:
60
+ self.files.append(doc)
61
+ self.files_analyzed += 1
62
+
63
+ if doc.translation_exists:
64
+ self.files_translated += 1
65
+
66
+ if not doc.translation_exists:
67
+ self.files_missing_translation += 1
68
+
69
+ def first_missing_translation_files(self, length: int = 10) -> list[TranslationDoc]:
70
+ return list(filter(lambda d: not d.translation_exists, self.files))[:length]
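A brief, hedged example of how these models are intended to be combined (the file paths and counts are made up):

```python
from translator.model import Summary, TranslationDoc  # assumes this repository layout

summary = Summary(lang="ko")
summary.append_file(TranslationDoc(
    translation_lang="ko",
    original_file="docs/source/en/quicktour.md",
    translation_file="docs/source/ko/quicktour.md",
    translation_exists=True,
))
summary.append_file(TranslationDoc(
    translation_lang="ko",
    original_file="docs/source/en/accelerator_selection.md",
    translation_exists=False,
))

print(summary.percentage_missing_translation)      # 50.0
print(summary.first_missing_translation_files(1))  # the one untranslated doc
# computed_field values are included when the model is serialised:
print(summary.model_dump()["percentage_missing_translation"])
```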
translator/project_config.py CHANGED
@@ -1,48 +1,48 @@
1
- """Project configuration for different HuggingFace repositories."""
2
-
3
- from dataclasses import dataclass
4
- from typing import Dict
5
-
6
-
7
- @dataclass
8
- class ProjectConfig:
9
- """Configuration for a specific HuggingFace project."""
10
- name: str
11
- repo_url: str
12
- api_url: str
13
- docs_path: str
14
- github_issues: Dict[str, str] # language -> issue_id
15
- reference_pr_url: str
16
-
17
-
18
- # Project configurations
19
- PROJECTS = {
20
- "transformers": ProjectConfig(
21
- name="Transformers",
22
- repo_url="https://github.com/huggingface/transformers",
23
- api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
24
- docs_path="docs/source",
25
- github_issues={"ko": "20179"},
26
- reference_pr_url="https://github.com/huggingface/transformers/pull/24968"
27
- ),
28
- "smolagents": ProjectConfig(
29
- name="SmolAgents",
30
- repo_url="https://github.com/huggingface/smolagents",
31
- api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
32
- docs_path="docs/source",
33
- github_issues={"ko": "20179"}, # To be filled when issue is created
34
- reference_pr_url="https://github.com/huggingface/smolagents/pull/1581" # To be filled with actual PR URL
35
- )
36
- }
37
-
38
-
39
- def get_project_config(project_key: str) -> ProjectConfig:
40
- """Get project configuration by key."""
41
- if project_key not in PROJECTS:
42
- raise ValueError(f"Unknown project: {project_key}. Available: {list(PROJECTS.keys())}")
43
- return PROJECTS[project_key]
44
-
45
-
46
- def get_available_projects() -> list[str]:
47
- """Get list of available project keys."""
48
  return list(PROJECTS.keys())
 
1
+ """Project configuration for different HuggingFace repositories."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict
5
+
6
+
7
+ @dataclass
8
+ class ProjectConfig:
9
+ """Configuration for a specific HuggingFace project."""
10
+ name: str
11
+ repo_url: str
12
+ api_url: str
13
+ docs_path: str
14
+ github_issues: Dict[str, str] # language -> issue_id
15
+ reference_pr_url: str
16
+
17
+
18
+ # Project configurations
19
+ PROJECTS = {
20
+ "transformers": ProjectConfig(
21
+ name="Transformers",
22
+ repo_url="https://github.com/huggingface/transformers",
23
+ api_url="https://api.github.com/repos/huggingface/transformers/git/trees/main?recursive=1",
24
+ docs_path="docs/source",
25
+ github_issues={"ko": "20179"},
26
+ reference_pr_url="https://github.com/huggingface/transformers/pull/24968"
27
+ ),
28
+ "smolagents": ProjectConfig(
29
+ name="SmolAgents",
30
+ repo_url="https://github.com/huggingface/smolagents",
31
+ api_url="https://api.github.com/repos/huggingface/smolagents/git/trees/main?recursive=1",
32
+ docs_path="docs/source",
33
+ github_issues={"ko": "20179"}, # To be filled when issue is created
34
+ reference_pr_url="https://github.com/huggingface/smolagents/pull/1581" # To be filled with actual PR URL
35
+ )
36
+ }
37
+
38
+
39
+ def get_project_config(project_key: str) -> ProjectConfig:
40
+ """Get project configuration by key."""
41
+ if project_key not in PROJECTS:
42
+ raise ValueError(f"Unknown project: {project_key}. Available: {list(PROJECTS.keys())}")
43
+ return PROJECTS[project_key]
44
+
45
+
46
+ def get_available_projects() -> list[str]:
47
+ """Get list of available project keys."""
48
  return list(PROJECTS.keys())
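For reference, configurations are looked up like this (hedged sketch; the import path assumes this repository layout):

```python
from translator.project_config import get_available_projects, get_project_config

print(get_available_projects())  # ['transformers', 'smolagents']

cfg = get_project_config("transformers")
print(cfg.docs_path)   # 'docs/source'
print(cfg.repo_url)    # 'https://github.com/huggingface/transformers'

# Unknown keys raise ValueError and list the valid choices.
try:
    get_project_config("diffusers")
except ValueError as err:
    print(err)
```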
translator/prompt_glossary.py CHANGED
@@ -1,126 +1,126 @@
+ PROMPT_WITH_GLOSSARY = """
+ You have a glossary of terms with their Korean translations. When translating a sentence, you need to check if any of the words in the sentence are in the glossary, and if so, translate them according to the provided Korean terms. Here is the glossary:
+
+ 🔹 Glossary (English → Korean):
+ - revision: 개정
+ - method: 메소드
+ - secrets: 비밀값
+ - search helper: 검색 헬퍼
+ - logging level: 로그 레벨
+ - workflow: 워크플로우
+ - corner case: 코너 케이스
+ - tokenization: 토큰화
+ - architecture: 아키텍처
+ - attention mask: 어텐션 마스크
+ - backbone: 백본
+ - argmax: argmax
+ - beam search: 빔 서치
+ - clustering: 군집화
+ - configuration: 구성
+ - context: 문맥
+ - cross entropy: 교차 엔트로피
+ - cross-attention: 크로스 어텐션
+ - dictionary: 딕셔너리
+ - entry: 엔트리
+ - few shot: 퓨샷
+ - flatten: 평탄화
+ - ground truth: 정답
+ - head: 헤드
+ - helper function: 헬퍼 함수
+ - image captioning: 이미지 캡셔닝
+ - image patch: 이미지 패치
+ - inference: 추론
+ - instance: 인스턴스
+ - Instantiate: 인스턴스화
+ - knowledge distillation: 지식 증류
+ - labels: 레이블
+ - large language models (LLM): 대규모 언어 모델
+ - layer: 레이어
+ - learning rate scheduler: Learning Rate Scheduler
+ - localization: 로컬리제이션
+ - log mel-filter bank: 로그 멜 필터 뱅크
+ - look-up table: 룩업 테이블
+ - loss function: 손실 함수
+ - machine learning: 머신 러닝
+ - mapping: 매핑
+ - masked language modeling (MLM): 마스크드 언어 모델
+ - malware: 악성코드
+ - metric: 지표
+ - mixed precision: 혼합 정밀도
+ - modality: 모달리티
+ - monolingual model: 단일 언어 모델
+ - multi gpu: 다중 GPU
+ - multilingual model: 다국어 모델
+ - parsing: 파싱
+ - perplexity (PPL): 펄플렉서티(Perplexity)
+ - pipeline: 파이프라인
+ - pixel values: 픽셀 값
+ - pooling: 풀링
+ - position IDs: 위치 ID
+ - preprocessing: 전처리
+ - prompt: 프롬프트
+ - pythonic: 파이써닉
+ - query: 쿼리
+ - question answering: 질의 응답
+ - raw audio waveform: 원시 오디오 파형
+ - recurrent neural network (RNN): 순환 신경망
+ - accelerator: 가속기
+ - Accelerate: Accelerate
+ - architecture: 아키텍처
+ - arguments: 인수
+ - attention mask: 어텐션 마스크
+ - augmentation: 증강
+ - autoencoding models: 오토인코딩 모델
+ - autoregressive models: 자기회귀 모델
+ - backward: 역방향
+ - bounding box: 바운딩 박스
+ - causal language modeling: 인과적 언어 모델링(causal language modeling)
+ - channel: 채널
+ - checkpoint: 체크포인트(checkpoint)
+ - chunk: 묶음
+ - computer vision: 컴퓨터 비전
+ - convolution: 합성곱
+ - crop: 자르기
+ - custom: 사용자 정의
+ - customize: 맞춤 설정하다
+ - data collator: 데이터 콜레이터
+ - dataset: 데이터 세트
+ - decoder input IDs: 디코더 입력 ID
+ - decoder models: 디코더 모델
+ - deep learning (DL): 딥러닝
+ - directory: 디렉터리
+ - distributed training: 분산 학습
+ - downstream: 다운스트림
+ - encoder models: 인코더 모델
+ - entity: 개체
+ - epoch: 에폭
+ - evaluation method: 평가 방법
+ - feature extraction: 특성 추출
+ - feature matrix: 특성 행렬(feature matrix)
+ - fine-tuning: 미세 조정
+ - finetuned models: 미세 조정 모델
+ - hidden state: 은닉 상태
+ - hyperparameter: 하이퍼파라미터
+ - learning: 학습
+ - load: 가져오다
+ - method: 메소드
+ - optimizer: 옵티마이저
+ - pad (padding): 패드 (패딩)
+ - parameter: 매개변수
+ - pretrained model: 사전훈련된 모델
+ - separator (* [SEP]를 부르는 이름): 분할 토큰
+ - sequence: 시퀀스
+ - silent error: 조용한 오류
+ - token: 토큰
+ - tokenizer: 토크나이저
+ - training: 훈련
+ - workflow: 워크플로우
+
+ 📌 Instructions:
+ 1. Whenever a source term from the glossary appears **in any form** (full match or partial match within a larger phrase), **replace it with the exact Korean translation** from the glossary, keeping the rest of the phrase in Korean.
+ - Example: “Attention Interface” → “어텐션 인터페이스”
+ - Example: “Architecture details” → “아키텍처 상세”
+ 2. Non-glossary words should be translated naturally, respecting context and technical nuance.
+
+ Please revise the translated sentences accordingly using the terms provided in this glossary.
+ """
translator/retriever.py CHANGED
@@ -1,199 +1,199 @@
+ import re
+ import os
+ from pathlib import Path
+
+ import requests
+
+ from .model import Languages, Summary, TranslationDoc
+ from .project_config import get_project_config
+
+
+ def get_github_repo_files(project: str = "transformers"):
+     """
+     Get the list of documentation files in the project's GitHub repo.
+     """
+     config = get_project_config(project)
+
+     # Add GitHub token if available to avoid rate limiting (optional)
+     headers = {}
+     github_token = os.environ.get("GITHUB_TOKEN")
+     if github_token:
+         headers["Authorization"] = f"token {github_token}"
+
+     response = requests.get(config.api_url, headers=headers)
+
+     # Handle rate limit with helpful message
+     if response.status_code == 403 and "rate limit" in response.text.lower():
+         raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
+
+     data = response.json()
+     all_items = data.get("tree", [])
+
+     file_paths = [
+         item["path"]
+         for item in all_items
+         if item["type"] == "blob" and (item["path"].startswith("docs"))
+     ]
+     return file_paths
+
+
+ def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
+     """
+     Get open PRs in the GitHub repo, filtered by title containing '[i18n-KO]'.
+     """
+     config = get_project_config(project)
+     issue_id = config.github_issues.get(lang)
+
+     # Projects without a registered GitHub translation issue are not supported
+     if not issue_id:
+         raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
+
+     # Require all_files parameter
+     if all_files is None:
+         raise ValueError("Repository file list must be provided")
+
+     headers = {
+         "Accept": "application/vnd.github+json",
+     }
+
+     # Add GitHub token if available to avoid rate limiting (optional)
+     github_token = os.environ.get("GITHUB_TOKEN")
+     if github_token:
+         headers["Authorization"] = f"token {github_token}"
+
+     all_open_prs = []
+     page = 1
+     per_page = 100  # Maximum allowed by GitHub API
+
+     while True:
+         repo_path = config.repo_url.replace("https://github.com/", "")
+         url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
+         response = requests.get(url, headers=headers)
+
+         if response.status_code == 403 and "rate limit" in response.text.lower():
+             raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
+         elif response.status_code != 200:
+             raise Exception(f"GitHub API error: {response.status_code} {response.text}")
+
+         page_prs = response.json()
+         if not page_prs:  # No more PRs
+             break
+
+         all_open_prs.extend(page_prs)
+         page += 1
+
+         # Break if we got less than per_page results (last page)
+         if len(page_prs) < per_page:
+             break
+
+     filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]
+
+     # Pattern to match filenames after "Translated" keyword
+     pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")
+
+     def find_original_file_path(filename_from_title, all_files):
+         """Find the exact file path from repo files by matching filename"""
+         if not filename_from_title:
+             return None
+
+         # Remove .md extension for matching
+         base_name = filename_from_title.replace('.md', '')
+
+         # Look for exact matches in repo files
+         for file_path in all_files:
+             if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
+                 file_base = file_path.split("/")[-1].replace('.md', '')
+                 if file_base == base_name:
+                     return file_path
+
+         # If no exact match, fallback to simple path
+         return f"docs/source/en/{filename_from_title}"
+
+     filenames = []
+     pr_info_list = []
+
+     for pr in filtered_prs:
+         match = pattern.search(pr["title"])
+         if match:
+             # Use group 1 (with backticks) or group 2 (without backticks)
+             filename = match.group(1) or match.group(2)
+             # Add .md extension if not present
+             if not filename.endswith('.md'):
+                 filename += '.md'
+
+             # Find the correct file path by matching filename
+             correct_path = None
+             if filename:
+                 # Remove .md extension for matching
+                 base_name = filename.replace('.md', '')
+
+                 # Look for exact matches in repo files
+                 for file_path in all_files:
+                     if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
+                         file_base = file_path.split("/")[-1].replace('.md', '')
+                         if file_base == base_name:
+                             correct_path = file_path
+                             break
+
+                 # If no exact match, fallback to simple path
+                 if not correct_path:
+                     correct_path = f"docs/source/en/{filename}"
+             if correct_path:
+                 filenames.append(correct_path)
+                 pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
+     return filenames, pr_info_list
+
+
+ def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
+     """
+     Build the missing-translation report and return the first missing docs.
+     """
+
+     report = f"""
+ | Item | Count | Percentage |
+ |------|-------|------------|
+ | 📂 Hugging Face docs | {summary.files_analyzed} | - |
+ | 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
+ """
+     print(report)
+     first_missing_docs = list()
+     for file in summary.first_missing_translation_files(table_size):
+         first_missing_docs.append(file.original_file)
+
+     print(first_missing_docs)
+     return report, first_missing_docs
+
+
+ def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
+     """
+     Generate a translation-status report for the project docs.
+     """
+     if docs_file is None:
+         raise ValueError("Repository file list must be provided")
+
+     base_docs_path = Path("docs/source")
+     en_docs_path = Path("docs/source/en")
+
+     lang = Languages[target_lang]
+     summary = Summary(lang=lang.value)
+
+     for file in docs_file:
+         if file.endswith(".md"):
+             try:
+                 file_relative_path = Path(file).relative_to(en_docs_path)
+             except ValueError:
+                 continue
+
+             translated_path = os.path.join(
+                 base_docs_path, lang.value, file_relative_path
+             )
+             translation_exists = translated_path in docs_file
+
+             doc = TranslationDoc(
+                 translation_lang=lang.value,
+                 original_file=file,
+                 translation_file=translated_path,
+                 translation_exists=translation_exists,
+             )
+             summary.append_file(doc)
+     return retrieve(summary, top_k)
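Usage sketch (illustrative, not part of the commit): wiring the retriever helpers together end to end. It assumes network access to the GitHub API, that `GITHUB_TOKEN` may optionally be exported to avoid rate limits, and that `"ko"` is a member of the `Languages` enum imported at the top of retriever.py.

    # Hypothetical driver script; the function signatures follow translator/retriever.py above.
    from translator.retriever import get_github_repo_files, get_github_issue_open_pr, report

    # Optionally export GITHUB_TOKEN in the environment to avoid GitHub API rate limits.
    files = get_github_repo_files(project="transformers")

    # Markdown summary table plus the first five untranslated English docs.
    summary_table, missing_docs = report(
        project="transformers", target_lang="ko", top_k=5, docs_file=files
    )
    print(summary_table)
    print(missing_docs)

    # Open PRs whose titles contain "[i18n-KO]", mapped back to the English source files.
    in_progress_files, pr_urls = get_github_issue_open_pr(
        project="transformers", lang="ko", all_files=files
    )
    print(list(zip(in_progress_files, pr_urls)))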