# Source: Hugging Face Space huggingface-KREW/i18n-agent (status: Sleeping; file size: 7,250 bytes)

import re
import os
from pathlib import Path

import requests

from .model import Languages, Summary, TranslationDoc
from .project_config import get_project_config


def get_github_repo_files(project: str = "transformers"):
    """
    List the documentation file paths of a project's GitHub repository.

    Requests ``config.api_url`` for the given project and reads the ``tree``
    array of the JSON response (presumably the GitHub Git Trees API — confirm
    against the project config).

    Args:
        project: Project key passed to ``get_project_config``.

    Returns:
        Paths of every ``blob`` entry whose path starts with ``docs``.

    Raises:
        Exception: If GitHub reports a rate limit (HTTP 403) or answers with
            any other non-200 status.
    """
    config = get_project_config(project)

    # Add GitHub token if available to avoid rate limiting (optional)
    headers = {}
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(config.api_url, headers=headers, timeout=30)

    # Handle rate limit with helpful message
    if response.status_code == 403 and "rate limit" in response.text.lower():
        raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
    # Any other failure used to fall through and look like an empty repository,
    # because the error payload has no "tree" key. Fail loudly instead.
    if response.status_code != 200:
        raise Exception(f"GitHub API error: {response.status_code} {response.text}")

    tree_items = response.json().get("tree", [])

    return [
        item["path"]
        for item in tree_items
        if item["type"] == "blob" and item["path"].startswith("docs")
    ]


def get_github_issue_open_pr(project: str = "transformers", lang: str = "ko", all_files: list = None):
    """
    Collect the docs that open '[i18n-<lang>]' pull requests are translating.

    Pages through every open pull request of the project's repository, keeps
    those whose title carries the ``[i18n-<lang>]`` tag (compared
    case-insensitively, so ``lang="ko"`` matches ``[i18n-KO]``), and extracts
    the file name that follows the word "Translated" in the title.

    Args:
        project: Project key passed to ``get_project_config``.
        lang: Language code; selects the tracking issue and the title tag.
        all_files: Repository file paths used to resolve a bare file name from
            a PR title to its full path under ``docs/source/en/``.

    Returns:
        A ``(filenames, pr_info_list)`` tuple of parallel lists: the resolved
        English doc path for each matching PR, and that PR's web URL.

    Raises:
        ValueError: If no issue is registered for ``lang`` or ``all_files`` is
            ``None``.
        Exception: If GitHub reports a rate limit or any non-200 status.
    """
    config = get_project_config(project)
    issue_id = config.github_issues.get(lang)

    # A tracking issue registered for this language is a precondition; without
    # one the PR search is not attempted.
    if not issue_id:
        raise ValueError(f"⚠️ No GitHub issue registered for {project}.")

    # Require all_files parameter
    if all_files is None:
        raise ValueError("Repository file list must be provided")

    headers = {
        "Accept": "application/vnd.github+json",
    }

    # Add GitHub token if available to avoid rate limiting (optional)
    github_token = os.environ.get("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # Loop-invariant: computed once rather than on every page.
    repo_path = config.repo_url.replace("https://github.com/", "")
    per_page = 100  # Maximum allowed by GitHub API

    all_open_prs = []
    page = 1
    while True:
        url = f"https://api.github.com/repos/{repo_path}/pulls?state=open&page={page}&per_page={per_page}"
        # Without a timeout, requests waits indefinitely on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code == 403 and "rate limit" in response.text.lower():
            raise Exception(f"GitHub API rate limit exceeded. To avoid this, set GITHUB_TOKEN in your environment or provide a GitHub token in the UI. Details: {response.text}")
        if response.status_code != 200:
            raise Exception(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        all_open_prs.extend(page_prs)

        # A short (or empty) page is the last page.
        if len(page_prs) < per_page:
            break
        page += 1

    # Honour the requested language instead of a hard-coded "[i18n-KO]".
    title_tag = f"[i18n-{lang}]".lower()
    filtered_prs = [pr for pr in all_open_prs if title_tag in pr["title"].lower()]

    # Pattern to match filenames after "Translated" keyword
    pattern = re.compile(r"Translated\s+(?:`([^`]+)`|(\S+))\s+to")

    # Index English docs by bare file name once, so each PR is resolved with a
    # dict lookup. setdefault keeps the first path seen for a duplicated name,
    # matching a first-match linear scan.
    en_docs_by_stem = {}
    for file_path in all_files:
        if file_path.startswith("docs/source/en/") and file_path.endswith(".md"):
            stem = file_path.split("/")[-1].removesuffix(".md")
            en_docs_by_stem.setdefault(stem, file_path)

    filenames = []
    pr_info_list = []

    for pr in filtered_prs:
        match = pattern.search(pr["title"])
        if not match:
            continue

        # Use group 1 (with backticks) or group 2 (without backticks)
        filename = match.group(1) or match.group(2)
        # Add .md extension if not present
        if not filename.endswith(".md"):
            filename += ".md"

        # Resolve by bare name; fall back to a path directly under the
        # English docs root when the repository has no file of that name.
        correct_path = en_docs_by_stem.get(
            filename.removesuffix(".md"), f"docs/source/en/{filename}"
        )

        filenames.append(correct_path)
        # The last segment of the API URL is the PR number.
        pr_info_list.append(f"{config.repo_url}/pull/{pr['url'].rstrip('/').split('/')[-1]}")
    return filenames, pr_info_list


def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
    """
    Build the missing-translation report for a summary.

    Renders a small Markdown table from the summary's counters, gathers the
    ``original_file`` of each entry returned by
    ``summary.first_missing_translation_files(table_size)``, prints both, and
    returns them.

    Args:
        summary: Object exposing ``files_analyzed``,
            ``files_missing_translation``, ``percentage_missing_translation``
            and ``first_missing_translation_files``.
        table_size: Argument forwarded to ``first_missing_translation_files``.

    Returns:
        A ``(report, first_missing_docs)`` tuple: the Markdown table and the
        list of original file paths.
    """
    # The empty first and last rows give the table its leading and trailing
    # newline once joined.
    table_rows = (
        "",
        "| Item | Count | Percentage |",
        "|------|-------|------------|",
        f"| 📂 HuggingFaces docs | {summary.files_analyzed} | - |",
        f"| 🪹 Missing translations | {summary.files_missing_translation} | {summary.percentage_missing_translation:.2f}% |",
        "",
    )
    markdown_table = "\n".join(table_rows)
    print(markdown_table)

    missing_originals = [
        doc.original_file
        for doc in summary.first_missing_translation_files(table_size)
    ]
    print(missing_originals)

    return markdown_table, missing_originals


def report(project: str, target_lang: str, top_k: int = 1, docs_file: list = None) -> tuple[str, list[str]]:
    """
    Generate a report for the translated docs

    Every ``.md`` file under ``docs/source/en`` in ``docs_file`` is paired with
    its expected counterpart under ``docs/source/<lang>``; the translation is
    considered present when that counterpart path is itself in ``docs_file``.

    Args:
        project: Project key (not read by this function).
        target_lang: Member name looked up in ``Languages``.
        top_k: Forwarded to ``retrieve`` as its table size.
        docs_file: Repository file paths using forward slashes.

    Returns:
        The ``(report, first_missing_docs)`` tuple produced by ``retrieve``.

    Raises:
        ValueError: If ``docs_file`` is ``None``.
    """
    if docs_file is None:
        raise ValueError("Repository file list must be provided")

    base_docs_path = Path("docs/source")
    en_docs_path = Path("docs/source/en")

    lang = Languages[target_lang]
    summary = Summary(lang=lang.value)

    # Build the membership set once: a list lookup inside the loop makes the
    # whole scan quadratic in the number of repository files.
    known_files = set(docs_file)

    for file in docs_file:
        if not file.endswith(".md"):
            continue
        try:
            file_relative_path = Path(file).relative_to(en_docs_path)
        except ValueError:
            # Not under the English docs root.
            continue

        # Repository paths always use forward slashes. as_posix() keeps the
        # candidate comparable on Windows, where os.path.join would emit
        # backslashes and no translation would ever be found.
        translated_path = (base_docs_path / lang.value / file_relative_path).as_posix()
        translation_exists = translated_path in known_files

        doc = TranslationDoc(
            translation_lang=lang.value,
            original_file=file,
            translation_file=translated_path,
            translation_exists=translation_exists,
        )
        summary.append_file(doc)
    return retrieve(summary, top_k)