Spaces:
Running
Running
Update main.py
Browse files
main.py
CHANGED
@@ -3,9 +3,9 @@ from fastapi.responses import HTMLResponse
|
|
3 |
from fastapi.staticfiles import StaticFiles
|
4 |
from transformers import pipeline
|
5 |
import textwrap
|
6 |
-
import fitz # PyMuPDF for
|
7 |
from docx import Document
|
8 |
-
import openpyxl # For Excel
|
9 |
from pptx import Presentation
|
10 |
from fastapi.middleware.cors import CORSMiddleware
|
11 |
from functools import lru_cache
|
@@ -15,23 +15,23 @@ from io import BytesIO
|
|
15 |
# Initialize FastAPI app
app = FastAPI()

# Enable CORS so browser clients on other origins can call this API.
# NOTE(review): wildcard origins together with allow_credentials=True is the
# most permissive configuration possible; FastAPI's CORS documentation says
# origins must be listed explicitly when credentials are enabled. Values are
# left unchanged here to avoid breaking existing clients - TODO restrict
# allow_origins to the real frontend origin(s).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory (relative to the working directory) holding the static frontend.
STATIC_DIR = "static"

# Create the directory before mounting it. exist_ok=True replaces the former
# exists()-then-makedirs() pair, which could raise FileExistsError if another
# process created the directory between the check and the call.
os.makedirs(STATIC_DIR, exist_ok=True)

# Serve the static frontend under /static; html=True enables HTML mode
# (index.html is served for directory requests).
app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
|
36 |
|
37 |
@app.get("/", response_class=HTMLResponse)
|
@@ -43,7 +43,7 @@ async def read_root():
|
|
43 |
except FileNotFoundError:
|
44 |
raise HTTPException(status_code=404, detail="index.html not found in static folder.")
|
45 |
|
46 |
-
# Supported
|
47 |
LANGUAGE_CODES = {
|
48 |
"Anglais": "en",
|
49 |
"Francais": "fr",
|
@@ -61,7 +61,7 @@ AVAILABLE_MODELS = {
|
|
61 |
"en-es": "Helsinki-NLP/opus-mt-en-es",
|
62 |
}
|
63 |
|
64 |
-
# Cache models
|
65 |
@lru_cache(maxsize=10)
|
66 |
def load_translator(src_code: str, tgt_code: str):
|
67 |
model_key = f"{src_code}-{tgt_code}"
|
@@ -78,31 +78,29 @@ def load_translator(src_code: str, tgt_code: str):
|
|
78 |
else:
|
79 |
raise ValueError(f"No model available for {src_code} -> {tgt_code}")
|
80 |
|
81 |
-
#
|
82 |
def chunk_text(text, max_length=400):
    """Split *text* into pieces of at most *max_length* characters.

    The splitting is delegated to ``textwrap``: line breaks in the input are
    treated as ordinary whitespace, pieces are cut at whitespace where
    possible, and a single word longer than *max_length* is broken up.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per piece (default 400).

    Returns:
        A list of strings; empty when *text* has no non-whitespace content.
    """
    wrapper = textwrap.TextWrapper(width=max_length)
    return wrapper.wrap(text)
|
84 |
|
85 |
-
#
|
86 |
def extract_text(file: UploadFile):
|
87 |
try:
|
88 |
-
file_bytes = file.file.read()
|
89 |
-
file_stream = BytesIO(file_bytes)
|
90 |
|
91 |
if file.filename.endswith(".txt"):
|
92 |
return file_bytes.decode("utf-8")
|
93 |
|
94 |
elif file.filename.endswith(".pdf"):
|
95 |
-
doc = fitz.open(stream=
|
96 |
return "\n".join([page.get_text() for page in doc])
|
97 |
|
98 |
elif file.filename.endswith(".docx"):
|
99 |
-
file_stream.seek(0) # Reset cursor position
|
100 |
doc = Document(file_stream)
|
101 |
return "\n".join([para.text for para in doc.paragraphs])
|
102 |
|
103 |
elif file.filename.endswith(".xlsx"):
|
104 |
-
|
105 |
-
wb = openpyxl.load_workbook(file_stream, data_only=True)
|
106 |
text = ""
|
107 |
for sheet in wb.sheetnames:
|
108 |
ws = wb[sheet]
|
@@ -111,7 +109,6 @@ def extract_text(file: UploadFile):
|
|
111 |
return text
|
112 |
|
113 |
elif file.filename.endswith(".pptx"):
|
114 |
-
file_stream.seek(0)
|
115 |
prs = Presentation(file_stream)
|
116 |
text = ""
|
117 |
for slide in prs.slides:
|
@@ -126,7 +123,7 @@ def extract_text(file: UploadFile):
|
|
126 |
except Exception as e:
|
127 |
raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
|
128 |
|
129 |
-
#
|
130 |
@app.post("/upload/")
|
131 |
async def upload_file(
|
132 |
file: UploadFile = File(...),
|
@@ -136,7 +133,7 @@ async def upload_file(
|
|
136 |
text = extract_text(file)
|
137 |
|
138 |
if not text.strip():
|
139 |
-
raise HTTPException(status_code=400, detail="No text extracted from file.")
|
140 |
|
141 |
src_code = LANGUAGE_CODES.get(src_lang)
|
142 |
tgt_code = LANGUAGE_CODES.get(tgt_lang)
|
@@ -145,15 +142,12 @@ async def upload_file(
|
|
145 |
raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
|
146 |
|
147 |
try:
|
148 |
-
# Load translation model
|
149 |
translator = load_translator(src_code, tgt_code)
|
150 |
|
151 |
-
# If translation goes through English as an intermediate step
|
152 |
if isinstance(translator, tuple):
|
153 |
translator1, translator2 = translator
|
154 |
intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
155 |
translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
|
156 |
-
|
157 |
else:
|
158 |
translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
159 |
|
|
|
3 |
from fastapi.staticfiles import StaticFiles
|
4 |
from transformers import pipeline
|
5 |
import textwrap
|
6 |
+
import fitz # PyMuPDF for PDF handling
|
7 |
from docx import Document
|
8 |
+
import openpyxl # For Excel
|
9 |
from pptx import Presentation
|
10 |
from fastapi.middleware.cors import CORSMiddleware
|
11 |
from functools import lru_cache
|
|
|
15 |
# Initialize FastAPI app
app = FastAPI()

# Enable CORS to allow frontend communication from other origins.
# NOTE(review): wildcard origins together with allow_credentials=True is the
# most permissive configuration possible; FastAPI's CORS documentation says
# origins must be listed explicitly when credentials are enabled. Values are
# left unchanged here to avoid breaking existing clients - TODO restrict
# allow_origins to the real frontend origin(s).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Directory (relative to the working directory) holding the static frontend.
STATIC_DIR = "static"

# Ensure the directory exists before mounting it. exist_ok=True replaces the
# former exists()-then-makedirs() pair, which could raise FileExistsError if
# another process created the directory between the check and the call.
os.makedirs(STATIC_DIR, exist_ok=True)

# Serve the static frontend under /static; html=True enables HTML mode
# (index.html is served for directory requests).
app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
|
36 |
|
37 |
@app.get("/", response_class=HTMLResponse)
|
|
|
43 |
except FileNotFoundError:
|
44 |
raise HTTPException(status_code=404, detail="index.html not found in static folder.")
|
45 |
|
46 |
+
# Supported languages
|
47 |
LANGUAGE_CODES = {
|
48 |
"Anglais": "en",
|
49 |
"Francais": "fr",
|
|
|
61 |
"en-es": "Helsinki-NLP/opus-mt-en-es",
|
62 |
}
|
63 |
|
64 |
+
# Cache models for better performance
|
65 |
@lru_cache(maxsize=10)
|
66 |
def load_translator(src_code: str, tgt_code: str):
|
67 |
model_key = f"{src_code}-{tgt_code}"
|
|
|
78 |
else:
|
79 |
raise ValueError(f"No model available for {src_code} -> {tgt_code}")
|
80 |
|
81 |
+
# Function to split text into chunks
|
82 |
def chunk_text(text, max_length=400):
    """Split *text* into pieces of at most *max_length* characters.

    The splitting is delegated to ``textwrap``: line breaks in the input are
    treated as ordinary whitespace, pieces are cut at whitespace where
    possible, and a single word longer than *max_length* is broken up.

    Args:
        text: The string to split.
        max_length: Maximum number of characters per piece (default 400).

    Returns:
        A list of strings; empty when *text* has no non-whitespace content.
    """
    wrapper = textwrap.TextWrapper(width=max_length)
    return wrapper.wrap(text)
|
84 |
|
85 |
+
# Function to extract text from files
|
86 |
def extract_text(file: UploadFile):
|
87 |
try:
|
88 |
+
file_bytes = file.file.read()
|
89 |
+
file_stream = BytesIO(file_bytes)
|
90 |
|
91 |
if file.filename.endswith(".txt"):
|
92 |
return file_bytes.decode("utf-8")
|
93 |
|
94 |
elif file.filename.endswith(".pdf"):
|
95 |
+
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
96 |
return "\n".join([page.get_text() for page in doc])
|
97 |
|
98 |
elif file.filename.endswith(".docx"):
|
|
|
99 |
doc = Document(file_stream)
|
100 |
return "\n".join([para.text for para in doc.paragraphs])
|
101 |
|
102 |
elif file.filename.endswith(".xlsx"):
|
103 |
+
wb = openpyxl.load_workbook(file_stream)
|
|
|
104 |
text = ""
|
105 |
for sheet in wb.sheetnames:
|
106 |
ws = wb[sheet]
|
|
|
109 |
return text
|
110 |
|
111 |
elif file.filename.endswith(".pptx"):
|
|
|
112 |
prs = Presentation(file_stream)
|
113 |
text = ""
|
114 |
for slide in prs.slides:
|
|
|
123 |
except Exception as e:
|
124 |
raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
|
125 |
|
126 |
+
# Correctly defined POST route for file upload
|
127 |
@app.post("/upload/")
|
128 |
async def upload_file(
|
129 |
file: UploadFile = File(...),
|
|
|
133 |
text = extract_text(file)
|
134 |
|
135 |
if not text.strip():
|
136 |
+
raise HTTPException(status_code=400, detail="No text extracted from the file.")
|
137 |
|
138 |
src_code = LANGUAGE_CODES.get(src_lang)
|
139 |
tgt_code = LANGUAGE_CODES.get(tgt_lang)
|
|
|
142 |
raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
|
143 |
|
144 |
try:
|
|
|
145 |
translator = load_translator(src_code, tgt_code)
|
146 |
|
|
|
147 |
if isinstance(translator, tuple):
|
148 |
translator1, translator2 = translator
|
149 |
intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
150 |
translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
|
|
|
151 |
else:
|
152 |
translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
153 |
|