File size: 7,250 Bytes
159b6fa
42d1e42
 
 
 
 
 
7e4dd83
42d1e42
 
7e4dd83
42d1e42
 
 
7e4dd83
 
859119b
7e4dd83
 
 
 
 
 
859119b
 
 
 
42d1e42
 
 
 
 
 
 
 
 
 
 
 
1b1c0d8
159b6fa
7e4dd83
159b6fa
7e4dd83
 
 
 
 
 
159b6fa
1b1c0d8
 
 
 
159b6fa
 
 
ef58ab4
859119b
7e4dd83
 
 
 
ef58ab4
 
 
 
 
7e4dd83
 
ef58ab4
 
859119b
 
 
ef58ab4
 
 
 
 
 
 
 
 
 
 
 
159b6fa
7e4dd83
159b6fa
1b1c0d8
 
159b6fa
1b1c0d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e4dd83
1b1c0d8
 
7e4dd83
 
 
 
 
1b1c0d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159b6fa
 
 
42d1e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b1c0d8
42d1e42
 
 
1b1c0d8
 
42d1e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import re
import os
from pathlib import Path

import requests

from .model import Languages, Summary, TranslationDoc
from .project_config import get_project_config


def get_github_repo_files(project: str = "transformers"):
    """
    List the documentation file paths of a project's GitHub repository.

    Requests ``config.api_url`` for the given project, reads the ``tree``
    entry of the JSON payload and keeps every ``blob`` item whose path
    starts with ``docs``.

    Args:
        project: Project key handed to ``get_project_config``.

    Returns:
        A list of path strings taken from the ``path`` field of the
        matching tree items.

    Raises:
        Exception: If GitHub reports a rate limit (HTTP 403 mentioning
            "rate limit") or answers with any other non-200 status.
    """
    config = get_project_config(project)

    # A token is optional; when GITHUB_TOKEN is set it is sent to avoid
    # the low unauthenticated rate limit.
    headers = {}
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(config.api_url, headers=headers, timeout=30)

    # Handle rate limit with a helpful message
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
    # Any other failure must not be mistaken for "repository has no docs":
    # an error payload has no "tree" key and would yield an empty list.
    if response.status_code != 200:
        raise Exception(f"GitHub API error: {response.status_code} {response.text}")

    data = response.json()
    all_items = data.get("tree", [])

    return [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]


def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
    """
    Collect open translation PRs and the English doc files they cover.

    Pages through the repository's open pull requests, keeps those whose
    title contains the ``[i18n-<LANG>]`` tag (``[i18n-KO]`` for the default
    ``lang="ko"``) and extracts the file name that follows the word
    ``Translated`` in the title. Each file name is resolved to a path under
    ``docs/source/en/`` by matching it against ``all_files``.

    Args:
        project: Project key handed to ``get_project_config``.
        lang: Language code; used to look up the tracking issue in
            ``config.github_issues`` and to build the title tag.
        all_files: Repository file paths used to resolve file names.

    Returns:
        A ``(filenames, pr_info_list)`` tuple of two parallel lists: the
        resolved English doc paths and the matching pull-request URLs.

    Raises:
        ValueError: If no issue is registered for ``lang`` or ``all_files``
            is ``None``.
        Exception: If GitHub reports a rate limit or any non-200 status.
    """
    config = get_project_config(project)
    issue_id = config.github_issues.get(lang)

    # The issue id itself is not used below; it only gates the search, so
    # languages without a registered tracking issue are rejected up front.
    if not issue_id:
        raise ValueError(f"⚠️ No GitHub issue registered for {project}.")

    # Require all_files parameter
    if all_files is None:
        raise ValueError("Repository file list must be provided")

    headers = {
        "Accept": "application/vnd.github+json",
    }

    # Add GitHub token if available to avoid rate limiting (optional)
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # Loop-invariant values are computed once, outside the paging loop.
    repo_path = config.repo_url.replace("https://github.com/", "")
    per_page = 100  # Maximum allowed by GitHub API

    all_open_prs = []
    page = 1
    while True:
        url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code == 403 and "rate limit" in response.text.lower():
            raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
        if response.status_code != 200:
            raise Exception(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        all_open_prs.extend(page_prs)

        # A short (or empty) page is the last one.
        if len(page_prs) < per_page:
            break
        page += 1

    # Title tag follows the requested language instead of being fixed to KO.
    title_tag = f"[i18n-{lang.upper()}]"

    # Pattern to match filenames after "Translated" keyword
    pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")

    md_suffix = ".md"
    en_prefix = "docs/source/en/"

    # Index the English docs by bare file name once, instead of rescanning
    # the whole file list for every PR. setdefault keeps the first match,
    # which is the path a linear scan would have returned.
    en_docs_by_name = {}
    for file_path in all_files:
        if file_path.startswith(en_prefix) and file_path.endswith(md_suffix):
            name = file_path.rsplit("/", 1)[-1][: -len(md_suffix)]
            en_docs_by_name.setdefault(name, file_path)

    filenames = []
    pr_info_list = []

    for pr in all_open_prs:
        title = pr["title"]
        if title_tag not in title:
            continue
        match = pattern.search(title)
        if not match:
            continue

        # Use group 1 (with backticks) or group 2 (without backticks)
        filename = match.group(1) or match.group(2)
        # Add .md extension if not present
        if not filename.endswith(md_suffix):
            filename += md_suffix

        # Strip only the trailing extension; names containing ".md"
        # elsewhere stay intact.
        base_name = filename[: -len(md_suffix)]

        # If no exact match, fall back to a path directly under the English docs
        filenames.append(en_docs_by_name.get(base_name, f"{en_prefix}{filename}"))

        # The last segment of the API URL is the pull-request number.
        pr_number = pr["url"].rstrip("/").split("/")[-1]
        pr_info_list.append(f"{config.repo_url}/pull/{pr_number}")

    return filenames, pr_info_list


def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """
    Render the translation summary table and list the first missing docs.

    The markdown table and the list of original file paths are both printed
    and returned.

    Args:
        summary: Object exposing ``files_analyzed``,
            ``files_missing_translation``, ``percentage_missing_translation``
            and ``first_missing_translation_files(n)``.
        table_size: Value passed to ``first_missing_translation_files``.

    Returns:
        A ``(table, missing_originals)`` tuple: the markdown table text and
        the ``original_file`` of each entry returned by
        ``first_missing_translation_files``.
    """
    # The empty first and last entries give the table its leading and
    # trailing newline once the rows are joined.
    rows = [
        "",
        "| Item | Count | Percentage |",
        "|------|-------|------------|",
        f"| 📂 HuggingFaces docs | {summary.files_analyzed} | - |",
        f"| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |",
        "",
    ]
    table = "\n".join(rows)
    print(table)

    missing_originals = [
        doc.original_file
        for doc in summary.first_missing_translation_files(table_size)
    ]
    print(missing_originals)

    return table, missing_originals


def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
    """
    Generate a report for the translated docs.

    Every ``.md`` file under ``docs/source/en`` in ``docs_file`` is paired
    with the path its translation would have under
    ``docs/source/<language>``, and the translation counts as existing when
    that path is itself present in ``docs_file``.

    Args:
        project: Project name; not read by this function.
        target_lang: Member name looked up in ``Languages``.
        top_k: Passed to ``retrieve`` as its table size.
        docs_file: Repository file paths to analyse.

    Returns:
        Whatever ``retrieve`` returns for the collected summary.

    Raises:
        ValueError: If ``docs_file`` is ``None``.
        KeyError: If ``target_lang`` is not a member name of ``Languages``.
    """
    if docs_file is None:
        raise ValueError("Repository file list must be provided")

    base_docs_path = Path("docs/source")
    en_docs_path = base_docs_path / "en"

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    # Built once so each existence check is a hash lookup rather than a
    # scan of the whole list for every English doc.
    known_files = set(docs_file)

    for file in docs_file:
        if not file.endswith(".md"):
            continue
        try:
            file_relative_path = Path(file).relative_to(en_docs_path)
        except ValueError:
            # Not under the English docs directory.
            continue

        # Repository paths always use "/", so the candidate is rendered in
        # POSIX form; an OS-specific join would never match on Windows.
        translated_path = (base_docs_path / lang.value / file_relative_path).as_posix()

        doc = TranslationDoc(
            translation_lang=lang.value,
            original_file=file,
            translation_file=translated_path,
            translation_exists=translated_path in known_files,
        )
        summary.append_file(doc)
    return retrieve(summary, top_k)