Spaces:
Sleeping
Sleeping
File size: 7,250 Bytes
159b6fa 42d1e42 7e4dd83 42d1e42 7e4dd83 42d1e42 7e4dd83 859119b 7e4dd83 859119b 42d1e42 1b1c0d8 159b6fa 7e4dd83 159b6fa 7e4dd83 159b6fa 1b1c0d8 159b6fa ef58ab4 859119b 7e4dd83 ef58ab4 7e4dd83 ef58ab4 859119b ef58ab4 159b6fa 7e4dd83 159b6fa 1b1c0d8 159b6fa 1b1c0d8 7e4dd83 1b1c0d8 7e4dd83 1b1c0d8 159b6fa 42d1e42 1b1c0d8 42d1e42 1b1c0d8 42d1e42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
import re
import os
from pathlib import Path
import requests
from .model import Languages, Summary, TranslationDoc
from .project_config import get_project_config
def get_github_repo_files(project: str = "transformers"):
    """
    Fetch the list of documentation file paths from the project's GitHub repo.

    Queries the Git tree API endpoint configured for the project and keeps only
    blob entries whose path starts with ``docs``.

    Args:
        project: Project key understood by ``get_project_config``.

    Returns:
        list[str]: Repo-relative paths of files under ``docs``.

    Raises:
        Exception: On rate limiting or any other non-200 GitHub API response.
    """
    config = get_project_config(project)

    # Add GitHub token if available to avoid rate limiting (optional)
    headers = {}
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    response = requests.get(config.api_url, headers=headers)

    # Handle rate limit with helpful message
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
    # Fail loudly on any other API error instead of silently returning [] —
    # consistent with the error handling in get_github_issue_open_pr.
    elif response.status_code != 200:
        raise Exception(f"GitHub API error: {response.status_code} {response.text}")

    data = response.json()
    all_items = data.get("tree", [])
    file_paths = [
        item["path"]
        for item in all_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]
    return file_paths
def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
    """
    Collect open PRs whose title contains '[i18n-KO]' and map them to doc paths.

    Pages through all open pull requests of the project's repo, keeps the ones
    tagged '[i18n-KO]', extracts the translated filename from each title
    (pattern: ``Translated `<file>` to ...``), and resolves it to the matching
    path under ``docs/source/en/`` in ``all_files``.

    Args:
        project: Project key understood by ``get_project_config``.
        lang: Language code used to look up the tracking issue.
        all_files: Full list of repo file paths (required).

    Returns:
        tuple[list[str], list[str]]: (resolved original doc paths, PR URLs).

    Raises:
        ValueError: If no tracking issue is registered for ``lang`` or
            ``all_files`` is None.
        Exception: On GitHub API errors (rate limit or other non-200).
    """
    config = get_project_config(project)
    issue_id = config.github_issues.get(lang)
    # PR discovery is keyed off the per-language tracking issue; without one
    # there is nothing to search for.
    if not issue_id:
        raise ValueError(f"⚠️ No GitHub issue registered for {project}.")
    # Require all_files parameter
    if all_files is None:
        raise ValueError("Repository file list must be provided")

    headers = {
        "Accept": "application/vnd.github+json",
    }
    # Add GitHub token if available to avoid rate limiting (optional)
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    all_open_prs = []
    page = 1
    per_page = 100  # Maximum allowed by GitHub API
    # Loop-invariant "owner/repo" slug derived from the configured repo URL.
    repo_path = config.repo_url.replace("https://github.com/", "")
    while True:
        url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
        response = requests.get(url, headers=headers)
        if response.status_code == 403 and "rate limit" in response.text.lower():
            raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
        elif response.status_code != 200:
            raise Exception(f"GitHub API error: {response.status_code} {response.text}")
        page_prs = response.json()
        if not page_prs:  # No more PRs
            break
        all_open_prs.extend(page_prs)
        page += 1
        # Break if we got less than per_page results (last page)
        if len(page_prs) < per_page:
            break

    filtered_prs = [pr for pr in all_open_prs if "[i18n-KO]" in pr["title"]]

    # Pattern to match filenames after "Translated" keyword,
    # with (group 1) or without (group 2) backticks.
    pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")

    def find_original_file_path(filename_from_title, all_files):
        """Find the exact file path from repo files by matching filename."""
        if not filename_from_title:
            return None
        # Remove .md extension for matching
        base_name = filename_from_title.replace('.md', '')
        # Look for exact matches in repo files
        for file_path in all_files:
            if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
                file_base = file_path.split("/")[-1].replace('.md', '')
                if file_base == base_name:
                    return file_path
        # If no exact match, fall back to the flat docs/source/en/ path.
        return f"docs/source/en/{filename_from_title}"

    filenames = []
    pr_info_list = []
    for pr in filtered_prs:
        match = pattern.search(pr["title"])
        if not match:
            continue
        # Use group 1 (with backticks) or group 2 (without backticks)
        filename = match.group(1) or match.group(2)
        # Add .md extension if not present
        if not filename.endswith('.md'):
            filename += '.md'
        # BUG FIX: the previous inline duplicate of this lookup fell back to
        # the literal "docs/source/en/(unknown)" instead of using the filename;
        # reuse the helper, whose fallback interpolates the filename correctly.
        correct_path = find_original_file_path(filename, all_files)
        if correct_path:
            filenames.append(correct_path)
            # pr['url'] ends with the PR number; rebuild a human-facing link.
            pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")

    return filenames, pr_info_list
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """
    Build a markdown summary table and list the first missing translations.

    Args:
        summary: Aggregated translation statistics for one language.
        table_size: Maximum number of missing-translation files to return.

    Returns:
        tuple[str, list[str]]: The markdown report and the original-file
        paths of the first ``table_size`` docs lacking a translation.
    """
    report = f"""
| Item | Count | Percentage |
|------|-------|------------|
| 📂 HuggingFaces docs | {summary.files_analyzed} | - |
| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |
"""
    print(report)

    first_missing_docs = [
        doc.original_file
        for doc in summary.first_missing_translation_files(table_size)
    ]
    print(first_missing_docs)

    return report, first_missing_docs
def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
    """
    Generate a translation-status report for the given project and language.

    Args:
        project: Project key (unused beyond signature compatibility here).
        target_lang: Key into ``Languages`` selecting the target language.
        top_k: Number of missing docs to surface in the result.
        docs_file: Full list of repo file paths (required).

    Returns:
        tuple[str, list[str]]: Markdown report and first missing doc paths.

    Raises:
        ValueError: If ``docs_file`` is not provided.
    """
    if docs_file is None:
        raise ValueError("Repository file list must be provided")

    base_docs_path = Path("docs/source")
    en_docs_path = Path("docs/source/en")
    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    for original in docs_file:
        if not original.endswith(".md"):
            continue
        try:
            relative = Path(original).relative_to(en_docs_path)
        except ValueError:
            # Not under docs/source/en/ — skip non-English sources.
            continue
        translated_path = os.path.join(base_docs_path, lang.value, relative)
        summary.append_file(
            TranslationDoc(
                translation_lang=lang.value,
                original_file=original,
                translation_file=translated_path,
                # A translation exists iff its path appears in the repo tree.
                translation_exists=translated_path in docs_file,
            )
        )

    return retrieve(summary, top_k)
|