Spaces:

rockerritesh
/

preeti-unicode

Sleeping

App Files Files Community

rockerritesh committed on Nov 19, 2024

Commit

2d857e8

verified ·

1 Parent(s): 93294e9

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -154

app.py CHANGED Viewed

@@ -4,7 +4,7 @@ import io
 import os
 import re
-# Existing mapping dictionaries
 unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
 unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
 unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
@@ -17,182 +17,159 @@ symbolsDict = {
     "å": "द्व", "÷": "/"
 }
-def is_preeti_text(text):
     """
-    Check if text segment is likely to be Preeti-encoded Nepali.
-    Returns True if the text contains common Preeti patterns.
     """
-    preeti_patterns = [
-        r'cf', r'qm', r'If', r'0f', r'km', r'f]',  # Common Preeti combinations
-        r'[a-zA-Z]{2,}[\\|\[\]{}]',  # Preeti vowel signs and consonants
-    ]
-    return any(re.search(pattern, text) for pattern in preeti_patterns)
def normalizePreeti(preetitxt):
    """
    Reorder Preeti keystrokes into the logical order used for conversion.

    Steps:
      1. Replace a fixed set of two-key Preeti combinations.
      2. Move a '{' key in front of the character (or character plus
         'f' / 'ो') that it follows in the typed text.
      3. Move an 'l' key behind the character that follows it.

    Args:
        preetitxt: Raw Preeti-encoded text.

    Returns:
        The normalized text. An 'l' that is the last character of the input
        is kept at the end instead of being dropped.
    """
    # Common Preeti substitutions, applied in this order
    replacements = {
        'qm': 's|',
        'f]': 'ो',
        'km': 'फ',
        '0f': 'ण',
        'If': 'क्ष',
        'if': 'ष',
        'cf': 'आ'
    }
    for old, new in replacements.items():
        preetitxt = preetitxt.replace(old, new)

    pieces = []
    pending = ''  # an 'l' waiting to be placed after the next character
    length = len(preetitxt)
    index = 0
    while index < length:
        character = preetitxt[index]

        # "<char><f|ो>{"  ->  "{<char><f|ो>"
        if (index + 2 < length and preetitxt[index + 2] == '{'
                and preetitxt[index + 1] in ('f', 'ो')):
            pieces.append('{' + character + preetitxt[index + 1])
            index += 3
            continue

        # "<char>{"  ->  "{<char>"   (not when <char> is 'f')
        if (index + 1 < length and preetitxt[index + 1] == '{'
                and character != 'f'):
            pieces.append('{' + character)
            index += 2
            continue

        if character == 'l':
            pending = 'l'
        else:
            pieces.append(character + pending)
            pending = ''
        index += 1

    # A trailing 'l' has no following character to attach to; emit it as-is
    # rather than losing it.
    pieces.append(pending)
    return ''.join(pieces)
def convert_preeti_segment(preeti):
    """Convert a single Preeti segment to Unicode.

    The segment is normalized with ``normalizePreeti`` and then mapped one
    character at a time: lowercase letters through ``unicodeatoz``, uppercase
    letters through ``unicodeAtoZ``, digits through ``unicode0to9`` and
    everything else through ``symbolsDict`` (unknown characters are kept).
    """
    pieces = []
    for glyph in normalizePreeti(preeti):
        try:
            if 'a' <= glyph <= 'z':
                mapped = unicodeatoz[ord(glyph) - ord('a')]
            elif 'A' <= glyph <= 'Z':
                mapped = unicodeAtoZ[ord(glyph) - ord('A')]
            elif '0' <= glyph <= '9':
                mapped = unicode0to9[ord(glyph) - ord('0')]
            else:
                mapped = symbolsDict.get(glyph, glyph)
        except (KeyError, IndexError):
            # Fall back to the untouched character if a table lookup fails
            mapped = glyph
        pieces.append(mapped)
    return ''.join(pieces)
def smart_convert(text):
    """
    Convert text while preserving English segments.

    Emails, URLs, dates, English words of three or more letters and numbers
    with units are located with regular expressions and copied through
    unchanged. The text between those matches is converted with
    ``convert_preeti_segment`` when ``is_preeti_text`` recognises it, and
    copied through otherwise.

    Whitespace-only gaps between matches are kept, so spaces and line breaks
    between preserved tokens survive the conversion.

    Args:
        text: Mixed Preeti / English input.

    Returns:
        The converted text.
    """
    # Patterns to identify segments that must not be converted
    patterns = [
        # Email addresses
        r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
        # URLs
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        # Date patterns
        r'\b\d{1,4}[-/]\d{1,2}[-/]\d{1,4}\b',
        # Common English words (3 or more characters)
        r'\b[A-Za-z]{3,}\b',
        # Numbers with units
        r'\b\d+\s*[A-Za-z]+\b',
    ]
    combined_pattern = '|'.join(patterns)

    pieces = []

    def emit_gap(segment):
        # Text between preserved matches: convert only if it looks like Preeti
        if is_preeti_text(segment):
            pieces.append(convert_preeti_segment(segment))
        else:
            pieces.append(segment)

    last_end = 0
    for match in re.finditer(combined_pattern, text):
        start, end = match.span()
        if start > last_end:
            emit_gap(text[last_end:start])
        # Matched text is preserved verbatim
        pieces.append(match.group())
        last_end = end

    # Remaining text after the last match
    if last_end < len(text):
        emit_gap(text[last_end:])

    return ''.join(pieces)
def extract_text_from_pdf(pdf_file):
    """Read a PDF from disk and return the concatenated text of its pages.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. Any exception is reported through ``st.error`` and an empty
    string is returned instead.
    """
    try:
        with open(pdf_file, 'rb') as handle:
            page_texts = [
                page.extract_text() or ''
                for page in PyPDF2.PdfReader(handle).pages
            ]
    except Exception as exc:
        st.error(f"Error reading PDF: {str(exc)}")
        return ''
    return ''.join(page_texts)
 def main():
-    st.title("Smart Preeti to Unicode Converter")
-    st.write("This converter preserves English text while converting Preeti to Unicode")
-    uploaded_file = st.file_uploader("Choose a PDF or TXT file", type=["pdf", "txt"])
-    if uploaded_file is not None:
-        try:
-            if uploaded_file.name.lower().endswith('.pdf'):
-                pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))
-                text = ""
-                for page in pdf_reader.pages:
-                    text += page.extract_text() or ''
-            else:  # .txt file
-                text = uploaded_file.getvalue().decode("utf-8")
-            converted_text = smart_convert(text)
-            col1, col2 = st.columns(2)
-            with col1:
-                st.subheader("Original Text")
-                st.text_area("", value=text, height=300)
-            with col2:
-                st.subheader("Converted Text")
-                st.text_area("", value=converted_text, height=300)
             st.download_button(
                 label="Download Converted Text",
                 data=converted_text.encode("utf-8"),
@@ -200,8 +177,5 @@ def main():
                 mime="text/plain"
             )
-        except Exception as e:
-            st.error(f"An error occurred: {str(e)}")
 if __name__ == "__main__":
     main()

 import os
 import re
+# Existing mapping dictionaries remain the same
 unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
 unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
 unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
     "å": "द्व", "÷": "/"
 }
# Multi-key Preeti sequences that are handled as single units.
# Insertion order is kept as-is (for example "6«" comes before the bare "«").
preeti_compounds = {
    "qm": "s|",
    "f]": "ो",
    "km": "फ",
    "0f": "ण",
    "If": "क्ष",
    "if": "ष",
    "cf": "आ",
    "6«": "ट्र",
    "g]": "ने",
    "8f": "डा",
    "«": "्र",
    "j|m": "क्र",
    ";+": "सं",
}
def is_nepali_unicode(char):
    """Check if character is already in the Nepali (Devanagari) Unicode range.

    Args:
        char: Normally a single character. Longer strings are accepted and
            count as Nepali only when every character is in range.

    Returns:
        True when ``char`` is non-empty and all of its characters lie in
        U+0900..U+097F, otherwise False.
    """
    # Test each code point: comparing a whole multi-character string against
    # the range bounds is lexicographic and would only look at its start.
    return bool(char) and all('\u0900' <= c <= '\u097F' for c in char)
def get_preeti_segment(text, start_idx):
    """
    Extract a complete Preeti segment starting from given index.

    The segment runs from ``start_idx`` up to (not including) the first
    whitespace character or Nepali Unicode character. Sequences listed in
    ``preeti_compounds`` are consumed as whole units.

    Args:
        text: The full input text.
        start_idx: Index at which the segment starts.

    Returns:
        A ``(segment, end_idx)`` tuple, where ``end_idx`` is the index of the
        first character after the segment. If ``start_idx`` is at or past the
        end of ``text`` the result is ``("", start_idx)``.
    """
    text_len = len(text)
    if start_idx >= text_len:
        return "", start_idx

    # Longest compounds first so a longer key wins over a shorter one that
    # starts at the same position. Sorted once, not once per character.
    compounds = sorted(preeti_compounds, key=len, reverse=True)

    current_idx = start_idx
    while current_idx < text_len:
        for compound in compounds:
            # Offset form of startswith avoids copying the rest of the text
            if text.startswith(compound, current_idx):
                current_idx += len(compound)
                break
        else:
            char = text[current_idx]
            if char.isspace() or is_nepali_unicode(char):
                break  # leaves the while loop: the segment ends here
            current_idx += 1

    # The segment is always the contiguous slice that was scanned
    return text[start_idx:current_idx], current_idx
def normalize_preeti(preetitxt):
    """Normalize Preeti text with improved compound handling.

    First every key of ``preeti_compounds`` is replaced by its value, in the
    dictionary's insertion order. Then each 'l' key that has a following
    character is turned into the Unicode sign 'ि' and moved behind the
    consonant cluster that follows it.

    Args:
        preetitxt: Preeti-encoded text.

    Returns:
        The normalized text. An 'l' that is the last character is left
        unchanged.
    """
    # First handle the compound characters
    for old, new in preeti_compounds.items():
        preetitxt = preetitxt.replace(old, new)

    virama = '\u094d'
    pieces = []
    length = len(preetitxt)
    idx = 0
    while idx < length:
        char = preetitxt[idx]
        if char == 'l' and idx + 1 < length:
            # Take the next character, then any "virama + consonant" pairs
            # after it. A compound replaced above may have produced a whole
            # conjunct (e.g. 'क्ष'), and the vowel sign belongs after the
            # complete cluster, not after its first code point.
            end = idx + 2
            while (end + 1 < length
                   and preetitxt[end] == virama
                   and '\u0915' <= preetitxt[end + 1] <= '\u0939'):
                end += 2
            pieces.append(preetitxt[idx + 1:end] + 'ि')
            idx = end
        else:
            pieces.append(char)
            idx += 1
    return ''.join(pieces)
def convert_segment(segment):
    """Convert a single Preeti segment to Unicode.

    Whitespace-only segments and segments whose non-whitespace characters
    are all Nepali Unicode are returned unchanged. Otherwise the segment is
    passed through ``normalize_preeti`` and mapped character by character:

      * Nepali Unicode characters are kept,
      * a-z, A-Z and 0-9 go through ``unicodeatoz``, ``unicodeAtoZ`` and
        ``unicode0to9``,
      * every other character (ASCII or not) goes through ``symbolsDict``
        and is kept when it has no entry there.

    Args:
        segment: Text segment to convert.

    Returns:
        The converted segment.
    """
    if not segment.strip():
        return segment

    # If already in Nepali Unicode, return as is
    if all(is_nepali_unicode(char) for char in segment if char.strip()):
        return segment

    pieces = []
    for char in normalize_preeti(segment):
        if is_nepali_unicode(char):
            pieces.append(char)
        elif 'a' <= char <= 'z':
            pieces.append(unicodeatoz[ord(char) - ord('a')])
        elif 'A' <= char <= 'Z':
            pieces.append(unicodeAtoZ[ord(char) - ord('A')])
        elif '0' <= char <= '9':
            pieces.append(unicode0to9[ord(char) - ord('0')])
        else:
            # symbolsDict also has non-ASCII keys (e.g. 'å', '÷'), so this
            # lookup must not be limited to ASCII characters.
            pieces.append(symbolsDict.get(char, char))
    return ''.join(pieces)
def smart_convert_mixed(text):
    """
    Convert text while handling mixed Preeti and Nepali Unicode.

    The text is scanned left to right. Whitespace and characters that are
    already Nepali Unicode are copied through. Any other character starts a
    segment, which is obtained from ``get_preeti_segment`` and converted with
    ``convert_segment``. Segments may start with a non-ASCII Preeti glyph
    (such as '«'), not only with an ASCII key.

    Args:
        text: Input text.

    Returns:
        The converted text.
    """
    pieces = []
    text_len = len(text)
    idx = 0
    while idx < text_len:
        char = text[idx]

        # Preserve whitespace and existing Nepali Unicode untouched
        if char.isspace() or is_nepali_unicode(char):
            pieces.append(char)
            idx += 1
            continue

        preeti_segment, new_idx = get_preeti_segment(text, idx)
        if preeti_segment and new_idx > idx:
            pieces.append(convert_segment(preeti_segment))
            idx = new_idx
        else:
            # Nothing was consumed: keep the character so the scan advances
            pieces.append(char)
            idx += 1
    return ''.join(pieces)
 def main():
+    st.title("Advanced Mixed Text Converter")
+    st.write("Converts Preeti text while preserving existing Nepali Unicode and English")
+    # Input area
+    input_text = st.text_area("Enter text to convert", height=200)
+    if st.button("Convert"):
+        if input_text:
+            converted_text = smart_convert_mixed(input_text)
+            st.subheader("Converted Text")
+            st.text_area("", value=converted_text, height=200)
             st.download_button(
                 label="Download Converted Text",
                 data=converted_text.encode("utf-8"),
                 mime="text/plain"
             )
 if __name__ == "__main__":
     main()