"""Gradio app: convert a Git repository (cloned from a URL or uploaded as a
ZIP) into a single Markdown document describing its directory structure and,
optionally, file contents, honoring gitignore-style ignore patterns."""

import gradio as gr
import os
import subprocess
import tempfile
import zipfile
import pathlib
import shutil
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern
import traceback  # Import traceback for better error logging

# --- Configuration ---
DEFAULT_IGNORE_PATTERNS = """
# Default Ignore Patterns (Gitignore Syntax)
/.git/
/.hg/
/.svn/
/.vscode/
/.idea/
/node_modules/
/vendor/
/build/
/dist/
/target/
*.pyc
*.log
*.swp
*~
__pycache__/
.DS_Store
"""
MAX_OUTPUT_LINES = 10000  # Limit potential output size in display
INDENT_CHAR = "    "  # 4 spaces for indentation
FOLDER_ICON = "📁"
FILE_ICON = "📄"


# --- Core Logic ---
def get_repo_path(source_type, repo_url, branch_tag, zip_file_obj, progress=gr.Progress()):
    """Clone (URL) or extract (ZIP) the repository into a fresh temp dir.

    Args:
        source_type: "URL" or "Upload ZIP".
        repo_url: Git repository URL (used when source_type == "URL").
        branch_tag: Optional branch or tag name to clone.
        zip_file_obj: Gradio file object for an uploaded ZIP (may be None).
        progress: Gradio progress callback.

    Returns:
        (repo_path, temp_dir): the repository content root as a
        ``pathlib.Path`` and the parent temp directory the caller must
        clean up.

    Raises:
        ValueError: on missing input, clone failure, or an unusable archive.
        Any other exception from subprocess/zipfile is re-raised after the
        temp directory is removed.
    """
    temp_dir = tempfile.mkdtemp()
    repo_path = None
    try:
        if source_type == "URL":
            if not repo_url:
                raise ValueError("GitHub Repository URL is required.")
            progress(0.1, desc="Cloning repository...")
            git_command = ["git", "clone", "--depth", "1"]  # Shallow clone for speed
            if branch_tag:
                git_command.extend(["--branch", branch_tag])
            git_command.extend([repo_url, temp_dir])
            print(f"Running command: {' '.join(git_command)}")  # For debugging
            result = subprocess.run(git_command, capture_output=True, text=True, check=False)
            if result.returncode != 0:
                # Attempt clone without branch if specific one failed (might be default branch)
                if branch_tag:
                    progress(0.2, desc=f"Branch '{branch_tag}' not found or clone failed, trying default branch...")
                    git_command = ["git", "clone", "--depth", "1", repo_url, temp_dir]
                    print(f"Running command: {' '.join(git_command)}")  # For debugging
                    result = subprocess.run(git_command, capture_output=True, text=True, check=False)
                if result.returncode != 0:
                    error_message = f"Git clone failed:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
                    print(error_message)  # Log detailed error
                    # Try to extract a user-friendly message
                    if "Authentication failed" in result.stderr:
                        raise ValueError("Authentication failed. Private repositories require different handling (e.g., tokens) which is not supported here.")
                    elif "not found" in result.stderr:
                        raise ValueError(f"Repository or branch '{branch_tag or 'default'}' not found at URL: {repo_url}")
                    else:
                        raise ValueError(f"Git clone failed. Check URL and branch/tag. Error: {result.stderr.splitlines()[-1] if result.stderr else 'Unknown error'}")
            repo_path = pathlib.Path(temp_dir)
            progress(0.5, desc="Repository cloned.")
            print(f"Cloned repo to: {repo_path}")  # Debugging

        elif source_type == "Upload ZIP":
            if zip_file_obj is None:
                raise ValueError("ZIP file upload is required.")
            progress(0.1, desc="Extracting ZIP file...")
            zip_path = zip_file_obj.name  # Gradio provides a temp file path
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Check for common zip structure (single top-level dir)
                top_level_dirs = list(set(p.split('/')[0] for p in zip_ref.namelist() if '/' in p and p.split('/')[0]))
                extract_target = temp_dir
                potential_repo_root = temp_dir
                if len(top_level_dirs) == 1:
                    # If zip contains repo-main/file structure, extract *into* temp_dir.
                    # The actual repo content will be inside temp_dir/repo-main/
                    zip_ref.extractall(extract_target)
                    potential_repo_root = os.path.join(temp_dir, top_level_dirs[0])
                    print(f"ZIP has single top-level dir: {top_level_dirs[0]}. Potential root: {potential_repo_root}")
                else:
                    # Otherwise, extract directly into temp_dir
                    zip_ref.extractall(extract_target)
                    print(f"ZIP structure seems flat or multi-root. Using extract target as root: {extract_target}")
            # Basic check if potential_repo_root looks like a valid directory
            if os.path.isdir(potential_repo_root):
                repo_path = pathlib.Path(potential_repo_root)
            else:
                # Fallback if single dir logic failed or wasn't applicable
                repo_path = pathlib.Path(extract_target)
            progress(0.5, desc="ZIP extracted.")
            print(f"Extracted ZIP to: {repo_path}")  # Debugging

        else:
            raise ValueError("Invalid source type selected.")

        if not repo_path or not repo_path.is_dir():
            # Add more specific debugging info here
            print(f"Debug Info: Temp dir content: {list(os.listdir(temp_dir))}")
            if 'potential_repo_root' in locals() and potential_repo_root != temp_dir:
                print(f"Debug Info: Potential repo root '{potential_repo_root}' exists: {os.path.exists(potential_repo_root)}, is dir: {os.path.isdir(potential_repo_root)}")
            raise ValueError(f"Could not determine repository root directory within: {temp_dir}")

        # Return both the repo content path and the parent temp dir for cleanup
        return repo_path, temp_dir

    except Exception as e:
        # Clean up the temporary directory on error before re-raising
        shutil.rmtree(temp_dir, ignore_errors=True)
        print(f"Error in get_repo_path: {e}")  # Log error
        traceback.print_exc()  # Print full traceback for debugging get_repo_path issues
        # Fix: bare `raise` preserves the original traceback (was `raise e`)
        raise


def generate_markdown_structure(
    repo_root_path: pathlib.Path,
    include_content: bool,
    max_size_kb: int,
    ignore_patterns_str: str,
    progress=gr.Progress(),
) -> str:
    """Generate the Markdown string from the repository structure.

    Args:
        repo_root_path: Root directory of the repository content.
        include_content: Whether to inline file contents in fenced blocks.
        max_size_kb: Max file size (KB) for content inclusion; 0 disables
            content inclusion even when ``include_content`` is True.
        ignore_patterns_str: User ignore patterns (gitignore syntax),
            appended after ``DEFAULT_IGNORE_PATTERNS``.
        progress: Gradio progress callback.

    Returns:
        The full Markdown document as a single string.
    """
    repo_root_path = pathlib.Path(repo_root_path)  # Ensure it's a Path object
    markdown_lines = []
    max_file_size_bytes = max_size_kb * 1024 if max_size_kb > 0 else 0

    # --- Prepare ignore patterns ---
    # Combine default and user patterns
    full_ignore_patterns = DEFAULT_IGNORE_PATTERNS.strip() + "\n" + ignore_patterns_str.strip()
    # Filter out empty lines and comments
    patterns = [line for line in full_ignore_patterns.splitlines()
                if line.strip() and not line.strip().startswith('#')]
    # Create unique list while preserving order (important if later patterns override earlier ones)
    seen = set()
    unique_patterns = [x for x in patterns if not (x in seen or seen.add(x))]
    spec = PathSpec.from_lines(GitWildMatchPattern, unique_patterns)
    print(f"Using unique ignore patterns: {unique_patterns}")  # Debugging

    # --- Add header ---
    repo_name = repo_root_path.name
    markdown_lines.append(f"# {FOLDER_ICON} {repo_name}")
    markdown_lines.append("")

    # --- Walk through the directory ---
    progress(0.6, desc="Scanning repository structure...")
    files_processed = 0
    # Need to iterate through items relative to the root for pathspec matching
    all_items = sorted(list(repo_root_path.rglob('*')))
    total_items_estimate = len(all_items)  # More accurate estimate
    items_scanned = 0

    for item_path in all_items:
        items_scanned += 1
        if items_scanned % 50 == 0:  # Update progress periodically
            progress(0.6 + (0.3 * (items_scanned / max(1, total_items_estimate))),
                     desc=f"Scanning: {item_path.name}")

        relative_path = item_path.relative_to(repo_root_path)
        # Pathspec matches against the path string relative to the root where
        # .gitignore would be. Patterns like '/node_modules/' only match at the root.
        path_str_for_match = str(relative_path)

        # Check if the path itself should be ignored. Pathspec automatically
        # handles directory patterns (e.g., node_modules/ matches files and dirs inside).
        if spec.match_file(path_str_for_match):
            print(f"Ignoring: {relative_path}")  # Debugging
            # rglob already gave us all paths, so no recursion to prune here;
            # we just skip processing this specific path. (With os.walk we
            # would modify the dirs list instead.)
            continue

        # Calculate depth and indentation
        depth = len(relative_path.parts) - 1  # 0-based depth relative to root content
        indent = INDENT_CHAR * depth

        # Add entry to Markdown
        if item_path.is_dir():
            # Empty-dir detection after ignores is complex with rglob;
            # always list the directory for now.
            markdown_lines.append(f"{indent}{FOLDER_ICON} **{item_path.name}/**")
        elif item_path.is_file():
            markdown_lines.append(f"{indent}{FILE_ICON} {item_path.name}")
            files_processed += 1
            # Include file content if requested and within limits
            if include_content and max_size_kb > 0:  # Check > 0 explicitly
                try:
                    file_size = item_path.stat().st_size
                    if file_size == 0:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Empty File]")
                        markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                    elif file_size <= max_file_size_bytes:
                        try:
                            # Attempt to detect binary files heuristically before reading large ones
                            is_binary = False
                            try:
                                # Read a small chunk to check for null bytes
                                with open(item_path, 'rb') as bf:
                                    chunk = bf.read(1024)
                                if b'\x00' in chunk:
                                    is_binary = True
                            except Exception:
                                # Ignore errors during binary check, proceed as text
                                pass
                            if is_binary:
                                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Likely a binary file (Size: {file_size} bytes)]")
                            else:
                                # Replace errors instead of failing on bad bytes
                                content = item_path.read_text(encoding='utf-8', errors='replace')
                                lang = item_path.suffix.lstrip('.')  # Simple lang detection, can be expanded
                                if not lang:
                                    lang = "text"
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```{lang}")
                                # Indent content lines. Limit output lines displayed in
                                # Markdown preview if necessary; the downloaded file
                                # carries whatever is emitted here.
                                content_lines = content.splitlines()
                                display_lines = content_lines[:MAX_OUTPUT_LINES]
                                for line in display_lines:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}{line}")
                                if len(content_lines) > MAX_OUTPUT_LINES:
                                    markdown_lines.append(f"{indent}{INDENT_CHAR}[... content truncated in preview ...]")
                                markdown_lines.append(f"{indent}{INDENT_CHAR}```")
                        except UnicodeDecodeError:
                            # Should be less common now with errors='replace'
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error decoding file as UTF-8 (Size: {file_size} bytes)]")
                        except Exception as read_err:
                            markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error reading file - {read_err}]")
                    else:
                        markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: File size ({file_size:,} bytes) exceeds limit ({max_file_size_bytes:,} bytes)]")
                except OSError as stat_err:
                    markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Error accessing file stats - {stat_err}]")
            elif include_content and max_size_kb == 0:
                # Content inclusion checked, but 0 size limit
                markdown_lines.append(f"{indent}{INDENT_CHAR}[Content omitted: Max file size set to 0 KB]")

        # Blank line for separation between file/dir entries
        markdown_lines.append("")

    progress(0.95, desc="Formatting output...")
    final_markdown = "\n".join(markdown_lines)
    print(f"Processed {files_processed} files.")  # Debugging
    return final_markdown


# --- Gradio Interface ---
def process_repo(
    source_type, repo_url, branch_tag, zip_file_obj,
    include_content, max_size_kb, ignore_patterns,
    progress=gr.Progress(track_tqdm=True),
):
    """Main generator called by the Gradio button.

    Yields (status, markdown_preview, download_file_update) tuples so the UI
    updates progressively. Always cleans up the temp directory in ``finally``.
    """
    status = ""
    output_markdown = ""
    output_file_path = None
    repo_root_path = None
    temp_dir_to_clean = None

    # Ensure max_size_kb is treated as a number
    try:
        max_size_kb_int = int(max_size_kb) if max_size_kb is not None else 0
    except ValueError:
        yield "Error: Max File Size must be a number.", "", gr.update(value=None, visible=False)
        return

    try:
        progress(0, desc="Starting...")
        # Initial state update for all outputs
        yield "Preparing...", "", gr.update(value=None, visible=False)

        # 1. Get Repository Path
        yield "Fetching repository...", "", gr.update(value=None, visible=False)
        repo_root_path, temp_dir_to_clean = get_repo_path(
            source_type, repo_url, branch_tag, zip_file_obj, progress=progress
        )
        # Check if path finding was successful before proceeding
        if not repo_root_path:
            # Error should have been raised in get_repo_path, but double-check
            raise ValueError("Failed to obtain repository path.")
        yield f"Repository ready at: {repo_root_path.name}", "", gr.update(value=None, visible=False)

        # 2. Generate Markdown
        yield "Generating Markdown structure...", "", gr.update(value=None, visible=False)
        markdown_content = generate_markdown_structure(
            repo_root_path, include_content, max_size_kb_int, ignore_patterns, progress=progress
        )

        # Limit preview size robustly
        preview_limit = 3000
        markdown_preview = markdown_content[:preview_limit]
        if len(markdown_content) > preview_limit:
            markdown_preview += "\n\n[... Output truncated in preview ...]"

        # 3. Prepare Output File
        yield "Saving Markdown to file...", markdown_preview, gr.update(value=None, visible=False)
        output_filename = f"{repo_root_path.name}_structure.md"
        # Sanitize filename slightly (replace spaces, etc.) - less critical in temp file context
        output_filename = "".join(c if c.isalnum() or c in ('_', '-', '.') else '_' for c in output_filename)
        # Save the file in a place Gradio can access (it manages temp files)
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".md", encoding='utf-8') as temp_file:
            temp_file.write(markdown_content)
            output_file_path = temp_file.name  # Gradio needs the path to this file

        yield (f"Done. Output file '{output_filename}' ready for download.",
               markdown_preview,
               gr.update(value=output_file_path, visible=True, label=f"Download {output_filename}"))

    except ValueError as ve:
        print(f"Value Error: {ve}")  # Log error
        traceback.print_exc()
        yield f"Error: {ve}", "", gr.update(value=None, visible=False)
    except subprocess.CalledProcessError as cpe:
        error_detail = cpe.stderr or cpe.stdout or "Unknown git error"
        print(f"Git Error: {error_detail}")  # Log error
        traceback.print_exc()
        yield f"Git command failed: {error_detail}", "", gr.update(value=None, visible=False)
    except Exception as e:
        print(f"Unexpected Error: {e}")  # Log error
        traceback.print_exc()  # Print full traceback to logs
        yield f"An unexpected error occurred: {e}", "", gr.update(value=None, visible=False)
    finally:
        # 4. Cleanup
        if temp_dir_to_clean:
            print(f"Cleaning up temporary directory: {temp_dir_to_clean}")
            shutil.rmtree(temp_dir_to_clean, ignore_errors=True)
            print("Cleanup complete.")


# --- Build Gradio UI ---
with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="cyan")) as demo:
    gr.Markdown("# GitHub Repository to Markdown Converter")
    gr.Markdown("Convert a GitHub repository's structure (and optionally content) into a single Markdown file.")

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("## Input Source")
            input_source = gr.Radio(
                ["URL", "Upload ZIP"], label="Select Source Type", value="URL"
            )
            url_input_group = gr.Group(visible=True)  # Show URL by default
            with url_input_group:
                repo_url_input = gr.Textbox(label="Git Repository URL", placeholder="https://github.com/user/repo.git")
                branch_tag_input = gr.Textbox(label="Branch / Tag (Optional)", placeholder="main")
            zip_input_group = gr.Group(visible=False)  # Hide ZIP by default
            with zip_input_group:
                zip_file_input = gr.File(label="Upload Repository ZIP", file_types=[".zip"])

            # --- Configuration Options ---
            gr.Markdown("## Configuration")
            include_content_checkbox = gr.Checkbox(label="Include File Content in Output", value=False)
            max_size_input = gr.Number(
                label="Max File Size for Content (KB)", value=100, minimum=0, step=10,
                info="Files larger than this won't have content included. Set to 0 to disable content inclusion entirely, even if checked above."
            )
            ignore_patterns_input = gr.Textbox(
                label="Ignore Patterns (comma-separated or newline, gitignore style)",
                info="Uses .gitignore syntax. Add / for directories. Default patterns provided.",
                lines=5,
                value=DEFAULT_IGNORE_PATTERNS.strip()
            )
            generate_button = gr.Button("Generate Markdown", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("## Status & Output")
            status_output = gr.Textbox(label="Current Status", interactive=False, lines=2)
            # Use a Textbox for preview initially, as Markdown rendering can be slow/heavy
            markdown_preview_output = gr.Textbox(label="Markdown Preview (Truncated)", interactive=False, lines=20)
            # Use gr.File for the final download link
            download_output = gr.File(label="Download Markdown File", visible=False, interactive=False)

    # --- Event Handlers ---
    def toggle_input_visibility(choice):
        """Show the URL inputs or the ZIP upload depending on the radio choice."""
        if choice == "URL":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # ZIP
            return gr.update(visible=False), gr.update(visible=True)

    input_source.change(
        fn=toggle_input_visibility,
        inputs=input_source,
        outputs=[url_input_group, zip_input_group],
    )

    generate_button.click(
        fn=process_repo,
        inputs=[
            input_source, repo_url_input, branch_tag_input, zip_file_input,
            include_content_checkbox, max_size_input, ignore_patterns_input
        ],
        outputs=[status_output, markdown_preview_output, download_output],
        # api_name="generate_markdown"  # Optional: for API access
    )

# --- Launch the App ---
if __name__ == "__main__":
    # Ensure queue is enabled for HF Spaces deployment.
    # debug=True is useful for local testing; might remove/set to False for production space.
    demo.queue().launch(debug=True)