Spaces:

rayhane123
/

Ai-traduction

Running

App Files Files Community

rayhane123 committed on Mar 24

Commit

272b484

verified ·

1 Parent(s): 8db0473

Update main.py

Browse files

Files changed (1) hide show

main.py +19 -25

main.py CHANGED Viewed

@@ -3,9 +3,9 @@ from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from transformers import pipeline
 import textwrap
-import fitz  # PyMuPDF for PDFs
 from docx import Document
-import openpyxl  # For Excel files
 from pptx import Presentation
 from fastapi.middleware.cors import CORSMiddleware
 from functools import lru_cache
@@ -15,23 +15,23 @@ from io import BytesIO
 # Initialize FastAPI app
 app = FastAPI()
-# Enable CORS
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
     allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
 )
-# Define static files directory
 STATIC_DIR = "static"
-# Ensure the static directory exists
 if not os.path.exists(STATIC_DIR):
     os.makedirs(STATIC_DIR)
-# Mount static files correctly
 app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
 @app.get("/", response_class=HTMLResponse)
@@ -43,7 +43,7 @@ async def read_root():
     except FileNotFoundError:
         raise HTTPException(status_code=404, detail="index.html not found in static folder.")
-# Supported language codes
 LANGUAGE_CODES = {
     "Anglais": "en",
     "Francais": "fr",
@@ -61,7 +61,7 @@ AVAILABLE_MODELS = {
     "en-es": "Helsinki-NLP/opus-mt-en-es",
 }
-# Cache models to improve performance
 @lru_cache(maxsize=10)
 def load_translator(src_code: str, tgt_code: str):
     model_key = f"{src_code}-{tgt_code}"
@@ -78,31 +78,29 @@ def load_translator(src_code: str, tgt_code: str):
     else:
         raise ValueError(f"No model available for {src_code} -> {tgt_code}")
-# Split text into chunks
 def chunk_text(text, max_length=400):
     """Break ``text`` into pieces of at most ``max_length`` characters.
     Wrapping uses ``textwrap`` with its default settings, so newlines in the
     input are not preserved; each one is treated like a space between words.
     Returns a list of strings; an empty ``text`` yields an empty list.
     """
     wrapper = textwrap.TextWrapper(width=max_length)
     return wrapper.wrap(text)
-# Extract text from different file types
 def extract_text(file: UploadFile):
     try:
-        file_bytes = file.file.read()  # Read file content
-        file_stream = BytesIO(file_bytes)  # Convert to binary stream
         if file.filename.endswith(".txt"):
             return file_bytes.decode("utf-8")
         elif file.filename.endswith(".pdf"):
-            doc = fitz.open(stream=file_stream, filetype="pdf")
             return "\n".join([page.get_text() for page in doc])
         elif file.filename.endswith(".docx"):
-            file_stream.seek(0)  # Reset cursor position
             doc = Document(file_stream)
             return "\n".join([para.text for para in doc.paragraphs])
         elif file.filename.endswith(".xlsx"):
-            file_stream.seek(0)
-            wb = openpyxl.load_workbook(file_stream, data_only=True)
             text = ""
             for sheet in wb.sheetnames:
                 ws = wb[sheet]
@@ -111,7 +109,6 @@ def extract_text(file: UploadFile):
             return text
         elif file.filename.endswith(".pptx"):
-            file_stream.seek(0)
             prs = Presentation(file_stream)
             text = ""
             for slide in prs.slides:
@@ -126,7 +123,7 @@ def extract_text(file: UploadFile):
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
-# Upload and translate file
 @app.post("/upload/")
 async def upload_file(
     file: UploadFile = File(...),
@@ -136,7 +133,7 @@ async def upload_file(
     text = extract_text(file)
     if not text.strip():
-        raise HTTPException(status_code=400, detail="No text extracted from file.")
     src_code = LANGUAGE_CODES.get(src_lang)
     tgt_code = LANGUAGE_CODES.get(tgt_lang)
@@ -145,15 +142,12 @@ async def upload_file(
         raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
     try:
-        # Load translation model
         translator = load_translator(src_code, tgt_code)
-        # If translation goes through English as an intermediate step
         if isinstance(translator, tuple):
             translator1, translator2 = translator
             intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
             translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
         else:
             translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])

 from fastapi.staticfiles import StaticFiles
 from transformers import pipeline
 import textwrap
+import fitz  # PyMuPDF for PDF handling
 from docx import Document
+import openpyxl  # For Excel
 from pptx import Presentation
 from fastapi.middleware.cors import CORSMiddleware
 from functools import lru_cache
 # Initialize FastAPI app
 app = FastAPI()
+# Enable CORS to allow frontend communication
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
+# Directory for static files
 STATIC_DIR = "static"
+# Ensure the directory exists
 if not os.path.exists(STATIC_DIR):
     os.makedirs(STATIC_DIR)
+# Serve static files correctly
 app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
 @app.get("/", response_class=HTMLResponse)
     except FileNotFoundError:
         raise HTTPException(status_code=404, detail="index.html not found in static folder.")
+# Supported languages
 LANGUAGE_CODES = {
     "Anglais": "en",
     "Francais": "fr",
     "en-es": "Helsinki-NLP/opus-mt-en-es",
 }
+# Cache models for better performance
 @lru_cache(maxsize=10)
 def load_translator(src_code: str, tgt_code: str):
     model_key = f"{src_code}-{tgt_code}"
     else:
         raise ValueError(f"No model available for {src_code} -> {tgt_code}")
+# Function to split text into chunks
 def chunk_text(text, max_length=400):
     """Break ``text`` into pieces of at most ``max_length`` characters.
     Wrapping uses ``textwrap`` with its default settings, so newlines in the
     input are not preserved; each one is treated like a space between words.
     Returns a list of strings; an empty ``text`` yields an empty list.
     """
     wrapper = textwrap.TextWrapper(width=max_length)
     return wrapper.wrap(text)
+# Function to extract text from files
 def extract_text(file: UploadFile):
     try:
+        file_bytes = file.file.read()
+        file_stream = BytesIO(file_bytes)
         if file.filename.endswith(".txt"):
             return file_bytes.decode("utf-8")
         elif file.filename.endswith(".pdf"):
+            doc = fitz.open(stream=file_bytes, filetype="pdf")
             return "\n".join([page.get_text() for page in doc])
         elif file.filename.endswith(".docx"):
             doc = Document(file_stream)
             return "\n".join([para.text for para in doc.paragraphs])
         elif file.filename.endswith(".xlsx"):
+            wb = openpyxl.load_workbook(file_stream)
             text = ""
             for sheet in wb.sheetnames:
                 ws = wb[sheet]
             return text
         elif file.filename.endswith(".pptx"):
             prs = Presentation(file_stream)
             text = ""
             for slide in prs.slides:
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
+# Correctly defined POST route for file upload
 @app.post("/upload/")
 async def upload_file(
     file: UploadFile = File(...),
     text = extract_text(file)
     if not text.strip():
+        raise HTTPException(status_code=400, detail="No text extracted from the file.")
     src_code = LANGUAGE_CODES.get(src_lang)
     tgt_code = LANGUAGE_CODES.get(tgt_lang)
         raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
     try:
         translator = load_translator(src_code, tgt_code)
         if isinstance(translator, tuple):
             translator1, translator2 = translator
             intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
             translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
         else:
             translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])