Spaces:
Sleeping
Sleeping
used nltk
Browse files
app.py
CHANGED
@@ -2,8 +2,17 @@ import streamlit as st
|
|
2 |
import PyPDF2
|
3 |
import io
|
4 |
import os
|
|
|
|
|
|
|
5 |
|
|
|
|
|
6 |
|
|
|
|
|
|
|
|
|
7 |
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
|
8 |
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
|
9 |
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
|
@@ -81,22 +90,43 @@ def normalizePreeti(preetitxt):
|
|
81 |
previoussymbol = ''
|
82 |
return normalized
|
83 |
|
|
|
|
|
|
|
|
|
|
|
84 |
def convert(preeti):
|
85 |
converted = ''
|
86 |
normalizedpreeti = normalizePreeti(preeti)
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
|
|
|
|
95 |
else:
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
return converted
|
101 |
|
102 |
def extract_text_from_pdf(pdf_file):
|
@@ -149,4 +179,4 @@ def main():
|
|
149 |
)
|
150 |
|
151 |
if __name__ == "__main__":
|
152 |
-
main()
|
|
|
2 |
import io
import os
import re

import nltk
import PyPDF2
from nltk.corpus import words

# Ensure the NLTK word list is available without re-downloading on every
# app start: nltk.download() performs a network freshness check each call,
# while nltk.data.find() is a purely local lookup.
try:
    nltk.data.find('corpora/words')
except LookupError:
    nltk.download('words')

# Set of English words for O(1) membership tests (see is_english_word).
english_words_set = set(words.words())

# Preeti -> Unicode Devanagari lookup tables, indexed by ASCII offset:
#   unicodeatoz[i] maps chr(97 + i)  (a-z)
#   unicodeAtoZ[i] maps chr(65 + i)  (A-Z)
#   unicode0to9[i] maps chr(48 + i)  (0-9)
unicodeatoz = ["ब", "द", "अ", "म", "भ", "ा", "न", "ज", "ष्", "व", "प", "ि", "फ", "ल", "य", "उ", "त्र", "च", "क", "त", "ग", "ख", "ध", "ह", "थ", "श"]
unicodeAtoZ = ["ब्", "ध", "ऋ", "म्", "भ्", "ँ", "न्", "ज्", "क्ष्", "व्", "प्", "ी", "ः", "ल्", "इ", "ए", "त्त", "च्", "क्", "त्", "ग्", "ख्", "ध्", "ह्", "थ्", "श्"]
unicode0to9 = ["ण्", "ज्ञ", "द्द", "घ", "द्ध", "छ", "ट", "ठ", "ड", "ढ"]
|
|
|
90 |
previoussymbol = ''
|
91 |
return normalized
|
92 |
|
93 |
+
def is_english_word(word):
    """Return True if *word* looks like an English dictionary word.

    Non-word characters (punctuation etc.) are stripped and the result is
    lowercased before checking membership in the NLTK word set.
    """
    stripped = re.sub(r'\W+', '', word)
    return stripped.lower() in english_words_set
|
97 |
+
|
98 |
def convert(preeti):
    """Convert Preeti-encoded text to Unicode Devanagari.

    The input is first normalized via normalizePreeti, then split into
    alternating word / non-word tokens. Tokens recognized as English words
    pass through unchanged; every other word token is remapped character
    by character through the Preeti lookup tables. Punctuation and
    whitespace tokens are copied verbatim.

    Args:
        preeti: Raw Preeti-encoded input string.

    Returns:
        The converted Unicode string.
    """
    converted = ''
    normalizedpreeti = normalizePreeti(preeti)

    # Word and non-word runs alternate, so concatenating the processed
    # tokens reconstructs the full text with spacing intact.
    tokens = re.findall(r'\w+|\W+', normalizedpreeti)

    for token in tokens:
        if re.match(r'\w+', token):
            if is_english_word(token):
                # English word, skip conversion
                converted += token
            else:
                converted += _convert_preeti_word(token)
        else:
            # Non-word token (punctuation, whitespace)
            converted += token
    return converted


def _convert_preeti_word(token):
    """Remap a single word token through the Preeti -> Unicode tables.

    ASCII letters and digits index into the module-level mapping lists;
    anything else is looked up in symbolsDict, falling back to the
    character itself when no mapping exists.
    """
    pieces = []
    for character in token:
        code = ord(character)
        if 97 <= code <= 122:        # a-z
            pieces.append(unicodeatoz[code - 97])
        elif 65 <= code <= 90:       # A-Z
            pieces.append(unicodeAtoZ[code - 65])
        elif 48 <= code <= 57:       # 0-9
            pieces.append(unicode0to9[code - 48])
        else:
            # Equivalent to the original try/except KeyError: unmapped
            # symbols are kept as-is.
            pieces.append(symbolsDict.get(character, character))
    return ''.join(pieces)
|
131 |
|
132 |
def extract_text_from_pdf(pdf_file):
|
|
|
179 |
)
|
180 |
|
181 |
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()
|