Spaces:
Sleeping
Sleeping
used nltk
Browse files
app.py
CHANGED
|
@@ -2,8 +2,17 @@ import streamlit as st
|
|
| 2 |
import PyPDF2
|
| 3 |
import io
|
| 4 |
import os
|
|
|
|
|
|
|
|
|
|
| 5 |
|
|
|
|
|
|
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
|
| 8 |
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
|
| 9 |
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
|
|
@@ -81,22 +90,43 @@ def normalizePreeti(preetitxt):
|
|
| 81 |
previoussymbol = ''
|
| 82 |
return normalized
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def convert(preeti):
|
| 85 |
converted = ''
|
| 86 |
normalizedpreeti = normalizePreeti(preeti)
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
| 95 |
else:
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
return converted
|
| 101 |
|
| 102 |
def extract_text_from_pdf(pdf_file):
|
|
@@ -149,4 +179,4 @@ def main():
|
|
| 149 |
)
|
| 150 |
|
| 151 |
if __name__ == "__main__":
|
| 152 |
-
main()
|
|
|
|
| 2 |
import PyPDF2
|
| 3 |
import io
|
| 4 |
import os
|
| 5 |
+
import re
|
| 6 |
+
import nltk
|
| 7 |
+
from nltk.corpus import words
|
| 8 |
|
| 9 |
+
# Download the words corpus if not already downloaded
|
| 10 |
+
nltk.download('words')
|
| 11 |
|
| 12 |
+
# Create a set of English words for quick lookup
|
| 13 |
+
english_words_set = set(words.words())
|
| 14 |
+
|
| 15 |
+
# Your existing mappings
|
| 16 |
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
|
| 17 |
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
|
| 18 |
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
|
|
|
|
| 90 |
previoussymbol = ''
|
| 91 |
return normalized
|
| 92 |
|
| 93 |
+
def is_english_word(word):
    """Check whether a token is a plain English word.

    Non-word characters are stripped from the token and the remainder is
    lowercased before looking it up in the NLTK English vocabulary set
    (``english_words_set``, built at module load).
    """
    stripped = re.sub(r'\W+', '', word)
    return stripped.lower() in english_words_set
|
| 97 |
+
|
| 98 |
def convert(preeti):
    """Convert Preeti-encoded text to Unicode Devanagari.

    The input is first normalized via ``normalizePreeti``, then split into
    alternating word / non-word tokens. Detected English words and
    non-word tokens (whitespace, punctuation) pass through unchanged;
    every other word is transliterated character-by-character through the
    Preeti lookup tables.

    Args:
        preeti: Raw Preeti-encoded input string.

    Returns:
        The converted Unicode string.
    """
    normalizedpreeti = normalizePreeti(preeti)

    # findall with `\w+|\W+` partitions the whole string, so joining the
    # tokens back together reproduces the input exactly.
    tokens = re.findall(r'\w+|\W+', normalizedpreeti)

    # Accumulate pieces in a list and join once at the end — repeated
    # `converted += ...` is quadratic on large extracted PDFs.
    pieces = []
    for token in tokens:
        if not re.match(r'\w+', token):
            # Non-word token (punctuation, whitespace): pass through.
            pieces.append(token)
        elif is_english_word(token):
            # Recognized English word: skip Preeti conversion.
            pieces.append(token)
        else:
            # Preeti-encoded word: map each ASCII letter/digit to its
            # Devanagari counterpart; anything else goes through the
            # symbol table, falling back to the raw character.
            for character in token:
                if 'a' <= character <= 'z':
                    pieces.append(unicodeatoz[ord(character) - ord('a')])
                elif 'A' <= character <= 'Z':
                    pieces.append(unicodeAtoZ[ord(character) - ord('A')])
                elif '0' <= character <= '9':
                    pieces.append(unicode0to9[ord(character) - ord('0')])
                else:
                    # Only the symbolsDict lookup could raise KeyError in
                    # the original try/except; .get keeps that fallback
                    # without a try block around the whole chain.
                    pieces.append(symbolsDict.get(character, character))
    return ''.join(pieces)
|
| 131 |
|
| 132 |
def extract_text_from_pdf(pdf_file):
|
|
|
|
| 179 |
)
|
| 180 |
|
| 181 |
if __name__ == "__main__":
|
| 182 |
+
main()
|