Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| from pydantic import BaseModel, Field | |
| from typing import Optional, Any | |
| # Import statements that should only run once | |
| if gr.NO_RELOAD: | |
| import random | |
| import os | |
| from datetime import datetime | |
| from huggingface_hub import HfApi | |
| from typing import Optional | |
| from PIL import Image # Needed for working with PIL images | |
| import datasets | |
| import numpy as np # Added to help handle numpy array images | |
| import pandas as pd # Added for pandas DataFrame | |
| import cv2 # Added for OpenCV | |
| # Load environment variables from .env if available. | |
| from dotenv import load_dotenv | |
| load_dotenv() | |
# Prompt corpus: a short history of handwriting OCR, one sentence per entry.
# OCRDataCollector.get_random_text_block() joins runs of 1-5 *consecutive*
# entries into the text a contributor is asked to handwrite, so the order of
# this list matters.
sentences = [
    "Optical character recognition (OCR) is the process of converting images of text into machine-readable data.",
    "When applied to handwriting, OCR faces additional challenges because of the natural variability in individual penmanship.",
    "Over the last century, advances in computer vision and machine learning have transformed handwriting OCR from bulky, specialized hardware into highly accurate, software-driven systems.",
    "The origins of OCR date back to the early 20th century.",
    "Early pioneers explored how machines might read text.",
    "In the 1920s, inventors such as Emanuel Goldberg developed early devices that could capture printed characters by converting them into telegraph codes.",
    "Around the same time, Gustav Tauschek created the Reading Machine using template-matching methods to detect letters in images.",
    "These devices were designed for printed text and depended on fixed, machine-friendly fonts rather than natural handwriting.",
    "In the 1950s, systems like David Shepard's GISMO emerged to begin automating the conversion of paper records into digital form.",
    "Although these early OCR systems were limited in scope and accuracy, they laid the groundwork for later innovations.",
    "The 1960s saw OCR technology being applied to real-world tasks.",
    "In 1965, American inventor Jacob Rabinow developed an OCR machine specifically aimed at sorting mail by reading addresses.",
    "This was a critical step for the U.S. Postal Service.",
    "Soon after, research groups, including those at IBM, began developing machines such as the IBM 1287, which was capable of reading handprinted numbers on envelopes to facilitate automated mail processing.",
    "These systems marked the first attempts to apply computer vision to handwritten data on a large scale.",
    "By the late 1980s and early 1990s, researchers such as Yann LeCun and his colleagues developed neural network architectures to recognize handwritten digits.",
    "Their work, initially applied to reading ZIP codes on mail, demonstrated that carefully designed, constrained neural networks could achieve error rates as low as about 1% on USPS data.",
    "Sargur Srihari and his team at the Center of Excellence for Document Analysis and Recognition extended these ideas to develop complete handwritten address interpretation systems.",
    "These systems, deployed by the USPS and postal agencies worldwide, helped automate the routing of mail and revolutionized the sorting process.",
    "The development and evaluation of handwriting OCR have been driven in part by standard benchmark datasets.",
    "The MNIST dataset, introduced in the 1990s, consists of 70,000 images of handwritten digits and became the de facto benchmark for handwritten digit recognition.",
    "Complementing MNIST is the USPS dataset, which provides images of hand‐written digits derived from actual envelopes and captures real-world variability.",
    "Handwriting OCR entered a new era with the introduction of neural network models.",
    "In 1989, LeCun et al. applied backpropagation to a convolutional neural network tailored for handwritten digit recognition, an innovation that evolved into the LeNet series.",
    "By automatically learning features rather than relying on hand-designed templates, these networks drastically improved recognition performance.",
    "As computational power increased and large labeled datasets became available, deep learning models, particularly convolutional neural networks and recurrent neural networks, pushed the accuracy of handwriting OCR to near-human levels.",
    "Modern systems can handle both printed and cursive text, automatically segmenting and recognizing characters in complex handwritten documents.",
    "Cursive handwriting presents a classic challenge known as Sayre's paradox, where word recognition requires letter segmentation and letter segmentation requires word recognition.",
    "Contemporary approaches use implicit segmentation methods, often combined with hidden Markov models or end-to-end neural networks, to circumvent this paradox.",
    "Today's handwriting OCR systems are highly accurate and widely deployed.",
    "Modern systems combine OCR with artificial intelligence to not only recognize text but also extract meaning, verify data, and integrate into larger enterprise workflows.",
    "Projects such as In Codice Ratio use deep convolutional networks to transcribe historical handwritten documents, further expanding OCR applications.",
    "Despite impressive advances, handwriting OCR continues to face challenges with highly variable or degraded handwriting.",
    "Ongoing research aims to improve recognition accuracy, particularly for cursive and unconstrained handwriting, and to extend support across languages and historical scripts.",
    "With improvements in deep learning architectures, increased computing power, and large annotated datasets, future OCR systems are expected to become even more robust, handling real-world handwriting in diverse applications from postal services to archival digitization.",
    "Today's research in handwriting OCR benefits from a wide array of well-established datasets and ongoing evaluation challenges.",
    "These resources help drive the development of increasingly robust systems for both digit and full-text recognition.",
    "For handwritten digit recognition, the MNIST dataset remains the most widely used benchmark thanks to its simplicity and broad adoption.",
    "Complementing MNIST is the USPS dataset, which is derived from actual mail envelopes and provides additional challenges with real-world variability.",
    "The IAM Handwriting Database is one of the most popular datasets for unconstrained offline handwriting recognition and includes scanned pages of handwritten English text with corresponding transcriptions.",
    "It is frequently used to train and evaluate models that work on full-line or full-page recognition tasks.",
    "For systems designed to capture the dynamic aspects of handwriting, such as pen stroke trajectories, the IAM On-Line Handwriting Database offers valuable data.",
    "The CVL dataset provides multi-writer handwritten texts with a range of writing styles, making it useful for assessing the generalization capabilities of OCR systems across diverse handwriting samples.",
    "The RIMES dataset, developed for French handwriting recognition, contains scanned documents and is a key resource for evaluating systems in multilingual settings.",
    "Various ICDAR competitions, such as ICDAR 2013 and ICDAR 2017, have released datasets that reflect the complexities of real-world handwriting, including historical documents and unconstrained writing.",
    "For Arabic handwriting recognition, the KHATT dataset offers a collection of handwritten texts that capture the unique challenges of cursive and context-dependent scripts.",
    "These datasets, along with continual evaluation efforts through competitions hosted at ICDAR and ICFHR, ensure that the field keeps pushing toward higher accuracy, better robustness, and broader language coverage.",
    "Emerging benchmarks, often tailored to specific scripts, historical documents, or noisy real-world data, will further refine the state-of-the-art in handwriting OCR.",
    "This array of resources continues to shape the development of handwriting OCR systems today.",
    "This additional section outlines today's most influential datasets and benchmarks, highlighting how they continue to shape the development of handwriting OCR systems."
]
class SubmissionData(BaseModel):
    """Pydantic schema describing a single handwriting submission.

    NOTE(review): nothing in the visible part of this file instantiates this
    model; confirm whether it is still used before relying on it.
    """
    # Prompt text the contributor was asked to handwrite (required).
    text: str = Field(..., description="Text to be handwritten")
    # OAuth profile object; typed as Any, so pydantic does not validate it.
    profile: Any = Field(..., description="Gradio OAuth profile")
    # Uploaded photo/scan; optional and defaults to None.
    image: Optional[Image.Image] = Field(None, description="Uploaded handwritten image")
    # Word cap for the prompt; the 1..201 bounds mirror the UI slider range.
    max_words: int = Field(..., ge=1, le=201, description="Maximum number of words")
    # Whether the submission should also go to the shared public dataset.
    public_checkbox: bool = Field(..., description="Submit to public dataset")
    model_config = {
        "arbitrary_types_allowed": True # Allow PIL.Image.Image type
    }
class OCRDataCollector:
    """Serve handwriting prompts and build the contributor leaderboard.

    Prompts are runs of 1-5 consecutive entries of the module-level
    ``sentences`` list, truncated to a caller-supplied word budget.
    """

    def __init__(self):
        self.collected_pairs = []
        self.last_text_block = None
        # Index into ``sentences`` where the last served block started. The
        # fallback path advances from here instead of re-parsing the text.
        self._last_start_index = None
        self.current_text_block = self.get_random_text_block(201)  # Default max words
        self.hf_api = HfApi()

    @staticmethod
    def _truncate_to_words(block, max_words):
        """Return ``block`` cut down to at most ``max_words`` whitespace-separated words."""
        words = block.split()
        if len(words) > max_words:
            return " ".join(words[:max_words])
        return block

    def get_random_text_block(self, max_words: int):
        """Return a random prompt, avoiding an immediate repeat where possible.

        Args:
            max_words: Maximum number of words in the returned prompt.

        Returns:
            The prompt text. It is also remembered as ``last_text_block``.

        Raises:
            ValueError: If ``sentences`` is empty.
        """
        if not sentences:
            raise ValueError("No sentences available to build a text block")
        # UI sliders may hand over a float; slicing needs an int.
        max_words = int(max_words)
        # Never ask for more consecutive sentences than the corpus holds.
        longest_block = min(5, len(sentences))

        for _ in range(10):  # Bounded retries: tiny corpora may always repeat.
            block_length = random.randint(1, longest_block)
            start_index = random.randint(0, len(sentences) - block_length)
            block = self._truncate_to_words(
                " ".join(sentences[start_index:start_index + block_length]),
                max_words,
            )
            if block != self.last_text_block:
                self._last_start_index = start_index
                self.last_text_block = block
                return block

        # Every retry reproduced the previous block: step to the sentence after
        # the one the previous block started with. The start index is tracked
        # explicitly because recovering it from the text (splitting on '.')
        # fails for sentences containing interior periods such as "U.S." and
        # for blocks that were truncated.
        previous_start = getattr(self, "_last_start_index", None)
        if previous_start is None:
            previous_start = 0
        next_start = (previous_start + 1) % len(sentences)
        block = self._truncate_to_words(sentences[next_start], max_words)
        self._last_start_index = next_start
        self.last_text_block = block
        return block

    def submit_image(self, image, text_block, username: Optional[str] = None):
        """Record an in-memory (text, image) pair and return the next prompt.

        The pair is stored only when both an image and a truthy username are
        given; a new prompt (201-word cap) is returned either way.
        """
        if image is not None and username:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            self.collected_pairs.append({
                "text": text_block,
                "image": image,
                "timestamp": timestamp,
                "username": username
            })
        return self.get_random_text_block(201)

    def skip_text(self, text_block, username: Optional[str] = None):
        """Discard the current prompt and return a new one (201-word cap)."""
        return self.get_random_text_block(201)

    @staticmethod
    def _rank_contributors(user_counts):
        """Turn ``{username: count}`` into a DataFrame ranked by count, highest first."""
        df = pd.DataFrame(list(user_counts.items()), columns=['Username', 'Contributions'])
        # Sort before numbering so Rank/Medal reflect contribution counts;
        # a stable sort keeps first-seen order between tied users.
        df = df.sort_values('Contributions', ascending=False, kind='stable').reset_index(drop=True)
        df['Rank'] = range(1, len(df) + 1)
        medals = {1: "🏆", 2: "🥈", 3: "🥉"}
        df['Medal'] = df['Rank'].map(lambda rank: medals.get(rank, "👏"))
        return df[['Rank', 'Medal', 'Username', 'Contributions']]

    @staticmethod
    def _style_leaderboard(df):
        """Apply the leaderboard's table styling and return the pandas Styler."""
        return df.style\
            .set_properties(**{
                'text-align': 'center',
                'font-size': '16px',
                'padding': '10px',
                'border': '1px solid #ddd'
            })\
            .set_table_styles([
                {'selector': 'th', 'props': [
                    ('background-color', '#f4f4f4'),
                    ('color', '#333'),
                    ('font-weight', 'bold'),
                    ('text-align', 'center'),
                    ('padding', '12px'),
                    ('border', '1px solid #ddd')
                ]},
                {'selector': 'tr:nth-of-type(odd)', 'props': [
                    ('background-color', '#f9f9f9')
                ]},
                {'selector': 'tr:hover', 'props': [
                    ('background-color', '#f5f5f5')
                ]}
            ])

    def get_leaderboard(self):
        """Return the styled contributor leaderboard for the public dataset.

        Entries whose ``user`` is ``'anonymous'`` are excluded. On any failure
        the error is printed and an empty DataFrame with the leaderboard
        columns is returned, so the UI can still render.
        """
        try:
            dataset = datasets.load_dataset("rawwerks/handwriting-ocr-all", split="train")
            # NOTE(review): iterating whole rows presumably also materialises
            # the image column; reading only the 'user' column would be cheaper.
            user_counts = {}
            for item in dataset:
                if item['user'] != 'anonymous':
                    user_counts[item['user']] = user_counts.get(item['user'], 0) + 1
            return self._style_leaderboard(self._rank_contributors(user_counts))
        except Exception as e:
            # Best-effort boundary: the page must load even if the Hub is down.
            print(f"Error fetching leaderboard: {e}")
            return pd.DataFrame(columns=['Rank', 'Medal', 'Username', 'Contributions'])
def strip_metadata(image: Image.Image) -> Image.Image:
    """Return a copy of ``image`` that carries pixel data only.

    The copy is rebuilt from the raw pixel buffer, so nothing attached to the
    source image object (EXIF, text chunks, other ``info`` entries) is carried
    over.

    Args:
        image: The PIL image to sanitise.

    Returns:
        A new image with the same mode, size and pixels.

    Raises:
        gr.Error: If ``image`` is None.
    """
    if image is None:
        raise gr.Error("No valid image provided")
    # Copy the packed pixel buffer in one go. Building a Python list with one
    # entry per pixel is extremely memory-hungry for camera-sized photos.
    stripped_image = Image.frombytes(image.mode, image.size, image.tobytes())
    # Palette images store colour indices; without the palette the copy would
    # render with the wrong colours.
    if image.mode == "P":
        palette = image.getpalette()
        if palette is not None:
            stripped_image.putpalette(palette)
    return stripped_image
def transform_webcam(image: np.ndarray) -> np.ndarray:
    """Un-mirror a webcam frame so handwritten text reads left to right.

    Returns None when no frame is supplied; otherwise the frame flipped
    around its vertical axis (cv2 flip code 1).
    """
    return None if image is None else cv2.flip(image, 1)
class UserState:
    """Login status derived from a Gradio OAuth profile."""

    def __init__(self):
        self.username = None
        self.is_logged_in = False

    def update_from_profile(self, profile: gr.OAuthProfile | None) -> None:
        """Refresh ``is_logged_in`` and ``username`` from ``profile``.

        A user counts as logged in only when a profile is present and its
        ``username`` attribute is not None.
        """
        name = None if profile is None else getattr(profile, "username", None)
        if name is None:
            self.is_logged_in = False
            self.username = None
        else:
            self.is_logged_in = True
            self.username = name
def _append_submission(repo_id, record, features, image, image_filename, token, *, private):
    """Append one record and its image to a Hub dataset, creating the repo if missing.

    Args:
        repo_id: Target dataset repository (``owner/name``).
        record: Values for every non-image column named in ``features``.
        features: ``datasets.Features`` describing the dataset columns.
        image: PIL image to store in the ``image`` column.
        image_filename: File name used for the temporary PNG on disk.
        token: Hub token used for every call made here.
        private: Whether the repository is created/pushed as private.
    """
    temp_dir = "temp"
    os.makedirs(temp_dir, exist_ok=True)
    temp_path = os.path.join(temp_dir, image_filename)
    image.save(temp_path)
    try:
        hf_api = HfApi(token=token)
        try:
            hf_api.dataset_info(repo_id)
        except Exception:
            # Repo lookup failed: assume it does not exist yet and create it.
            hf_api.create_repo(repo_id=repo_id, repo_type="dataset", private=private, token=token)
        try:
            dataset = datasets.load_dataset(repo_id, split="train", token=token)
        except Exception:
            # NOTE(review): any load failure (including a transient network
            # error) lands here, and the push below would then replace the
            # split with a single row. Consider narrowing this except.
            dataset = datasets.Dataset.from_dict({name: [] for name in features}, features=features)
        # Build the row in feature order; the image column gets the file path.
        row = {name: (temp_path if name == "image" else record[name]) for name in features}
        dataset = dataset.add_item(row)
        dataset.push_to_hub(repo_id, split="train", token=token, private=private)
    finally:
        # Always clean up, including when the upload fails.
        if os.path.exists(temp_path):
            os.remove(temp_path)


def create_gradio_interface():
    """Build and return the Gradio Blocks app for collecting handwriting samples.

    The page shows a leaderboard, a Hugging Face login button, a random text
    prompt, and (once logged in) an upload form that appends the submission to
    the public dataset and/or the user's private dataset.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    collector = OCRDataCollector()

    with gr.Blocks() as demo:
        gr.Markdown("# Handwriting OCR Dataset Creator")
        gr.Markdown("## After almost 100 years of research, handwriting recognition still sucks. Together, we can change that.")

        # Leaderboard section at the top, centered via empty side columns.
        gr.Markdown("### 🏆 Top Contributors", show_label=False)
        with gr.Row():
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=2, min_width=400):
                leaderboard = gr.Dataframe(
                    value=collector.get_leaderboard(),
                    elem_id="leaderboard",
                    visible=True,
                    interactive=False,
                    show_label=False
                )
            with gr.Column(scale=1):
                pass

        gr.Markdown("### Step 1: Log in with your Hugging Face account to use this app.")
        # Login section - centered
        with gr.Row():
            with gr.Column(scale=1):
                pass
            with gr.Column(scale=2, min_width=200):
                login_btn = gr.LoginButton(elem_id="login_btn")
                # Activate the login button so OAuth is correctly initialized.
                login_btn.activate()
                user_info = gr.Markdown(
                    value="<center>Please log in with your Hugging Face account to contribute to the dataset.</center>",
                    elem_id="user_info"
                )
                # Hidden state component that stores the OAuth profile.
                profile_state = gr.State()
            with gr.Column(scale=1):
                pass

        # Update user info based on the OAuth profile (injected by Gradio).
        def update_user_info(profile: gr.OAuthProfile | None) -> str:
            if profile and getattr(profile, "username", None):
                return f"<center>Logged in as: {profile.username}</center>"
            else:
                return "<center>Please log in with your Hugging Face account to contribute to the dataset.</center>"

        demo.load(update_user_info, inputs=None, outputs=user_info)

        # Store the OAuth profile in the hidden state.
        def store_profile(profile: gr.OAuthProfile | None) -> gr.OAuthProfile | None:
            return profile

        demo.load(store_profile, inputs=None, outputs=profile_state)

        gr.Markdown(
            "### Step 2: Read the text. "
            "You will be shown between 1 and 5 consecutive sentences. Please handwrite them on paper and upload an image of your handwriting. "
            "You can change the maximum number of words you are willing to write by using the slider below. "
            "If you wish to skip the current text, click 'Skip'."
        )
        text_box = gr.Textbox(
            value=collector.current_text_block,
            label="Text to Handwrite",
            interactive=False,
            lines=10,
            show_copy_button=True,
            visible=True,
            elem_id="text_box"
        )
        max_words_slider = gr.Slider(
            1, 201, step=5, value=201,
            label="Maximum Number of Words",
            interactive=True,
            visible=True,
            elem_id="max_words_slider"
        )
        regenerate_btn = gr.Button(
            "Regenerate Text",
            visible=True,
            elem_id="regenerate_btn"
        )

        gr.Markdown("### Step 3: Upload an image of your handwritten version of the text")
        upload_info = gr.Markdown(
            value="You must be logged in to do this, to help us prevent spam submissions",
            elem_id="upload_info"
        )
        image_input = gr.Image(
            type="pil",
            label="Upload Handwritten Image",
            sources=["upload", "webcam"],
            mirror_webcam=False,  # Explicitly set to false to ensure text is readable
            visible=False,
            elem_id="image_input"
        )
        with gr.Column(visible=False) as dataset_options:
            private_checkbox = gr.Checkbox(
                value=True,
                label="Private",
                interactive=True,
                elem_id="private_cb"
            )
            private_explanation = gr.Markdown(
                "*Private: Creates a new dataset on your account named '/handwriting-ocr-private' and appends data there.*",
                elem_id="private_exp"
            )
            public_checkbox = gr.Checkbox(
                value=True,
                label="Public",
                interactive=True,
                elem_id="public_cb"
            )
            public_explanation = gr.Markdown(
                "*Public: Will be added to our [public Handwriting OCR dataset](https://huggingface.co/datasets/rawwerks/handwriting-ocr-all). By submitting, you are giving permission to be added to the dataset.*",
                elem_id="public_exp"
            )
            anonymous_checkbox = gr.Checkbox(
                value=False,
                label="Submit Anonymously",
                interactive=True,
                elem_id="anonymous_cb"
            )
            anonymous_explanation = gr.Markdown(
                "*If un-checked, your HF username will be appended next to your submission and you will be added to the leaderboard. If checked, your submission will be anonymous in the public dataset.*",
                elem_id="anonymous_exp"
            )
        with gr.Row(visible=False) as button_row:
            submit_btn = gr.Button("Submit", elem_id="submit_btn")

        # Show/hide the upload form depending on the caller's own profile.
        def update_user_state(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None = None, *args):
            # Login state is computed per call: one object shared by the whole
            # app would leak the last visitor's login to every other session.
            state = UserState()
            state.update_from_profile(profile)
            is_logged_in = state.is_logged_in
            message = "Please upload your handwritten image of the text below." if is_logged_in else "You must be logged in to do this, to help us prevent spam submissions"
            return {
                upload_info: gr.update(value=message),
                image_input: gr.update(visible=is_logged_in),
                dataset_options: gr.update(visible=is_logged_in),
                button_row: gr.update(visible=is_logged_in)
            }

        # Load initial state and update UI visibility
        demo.load(update_user_state, inputs=profile_state, outputs=[upload_info, image_input, dataset_options, button_row])
        # Also load leaderboard on page load
        demo.load(fn=lambda: collector.get_leaderboard(), outputs=leaderboard)

        def handle_submit(
            text: str,
            upload_image: Image.Image,
            max_words: int,
            save_public: bool,
            save_private: bool,
            anonymous: bool,
            profile: gr.OAuthProfile | None = None,
            oauth_token: gr.OAuthToken | None = None,
        ):
            """Handle submission using separate credentials:
            - For public dataset updates, the master token is loaded from .env.
            - For private dataset updates, the user's OAuth token is used.

            ``profile`` and ``oauth_token`` are injected by Gradio from their
            type hints and are therefore not listed in the click inputs.
            Returns (cleared image, next prompt text, refreshed leaderboard).
            """
            print(f"Debug - Initial params:")
            print(f"Text: {text[:50]}")
            print(f"Image type: {type(upload_image)}")
            print(f"Max words: {max_words}")
            print(f"Public checkbox: {save_public}")
            print(f"Private checkbox: {save_private}")
            print(f"Anonymous checkbox: {anonymous}")

            # Validate everything up front so nothing is uploaded for a
            # request that is going to be rejected.
            state = UserState()
            state.update_from_profile(profile)
            if not state.is_logged_in:
                raise gr.Error("Please log in to use this application")
            if not isinstance(upload_image, Image.Image):
                raise gr.Error("Please upload a valid image before submitting")
            if not save_public and not save_private:
                raise gr.Error("Please select at least one dataset (public or private) to save to.")

            master_token = None
            if save_public:
                master_token = os.getenv("PUBLIC_DATASET_TOKEN")
                if not master_token:
                    raise gr.Error("Master token for public dataset not configured in .env")
            if save_private:
                if oauth_token is None:
                    raise gr.Error("Authentication token is missing. Please log in again.")
                if not hasattr(oauth_token, 'token') or not oauth_token.token:
                    raise gr.Error("Invalid OAuth token. Please log in again with the required scopes (write-repos, manage-repos).")

            # Strip metadata from validated image
            stripped_image = strip_metadata(upload_image)
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Public dataset submission using master credentials from .env
            if save_public:
                public_features = datasets.Features({
                    'text': datasets.Value('string'),
                    'image': datasets.Image(),
                    'timestamp': datasets.Value('string'),
                    'user': datasets.Value('string')
                })
                try:
                    _append_submission(
                        "rawwerks/handwriting-ocr-all",
                        {
                            'text': text,
                            'timestamp': timestamp,
                            'user': 'anonymous' if anonymous else state.username
                        },
                        public_features,
                        stripped_image,
                        f"{timestamp}_public.png",
                        master_token,
                        private=False,
                    )
                except Exception as e:
                    raise gr.Error(f"Failed to save to public dataset: {str(e)}") from e

            # Private dataset submission using user's OAuth token
            if save_private:
                private_features = datasets.Features({
                    'text': datasets.Value('string'),
                    'image': datasets.Image(),
                    'timestamp': datasets.Value('string')
                })
                try:
                    _append_submission(
                        f"{state.username}/handwriting-ocr-private",
                        {'text': text, 'timestamp': timestamp},
                        private_features,
                        stripped_image,
                        f"{timestamp}_private.png",
                        oauth_token.token,
                        private=True,
                    )
                except Exception as e:
                    raise gr.Error(f"Failed to save to private dataset: {str(e)}") from e

            new_text = collector.get_random_text_block(max_words)
            return None, new_text, collector.get_leaderboard()

        # Only real components are listed; Gradio supplies profile and token.
        submit_btn.click(
            fn=handle_submit,
            inputs=[
                text_box,           # Text to handwrite
                image_input,        # Uploaded image
                max_words_slider,   # Max words
                public_checkbox,    # Public dataset option
                private_checkbox,   # Private dataset option
                anonymous_checkbox  # Anonymous attribution option
            ],
            outputs=[image_input, text_box, leaderboard]
        )

        def handle_regenerate(text, max_words):
            # Allow anyone to regenerate text regardless of login status.
            return collector.get_random_text_block(max_words)

        regenerate_btn.click(
            fn=handle_regenerate,
            inputs=[text_box, max_words_slider],
            outputs=text_box
        )

    return demo
# Script entry point: build the Blocks app and start the Gradio server.
# Nothing is launched when this module is merely imported.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()