rayhane123 committed on
Commit
272b484
·
verified ·
1 Parent(s): 8db0473

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +19 -25
main.py CHANGED
@@ -3,9 +3,9 @@ from fastapi.responses import HTMLResponse
3
  from fastapi.staticfiles import StaticFiles
4
  from transformers import pipeline
5
  import textwrap
6
- import fitz # PyMuPDF for PDFs
7
  from docx import Document
8
- import openpyxl # For Excel files
9
  from pptx import Presentation
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from functools import lru_cache
@@ -15,23 +15,23 @@ from io import BytesIO
15
  # Initialize FastAPI app
16
  app = FastAPI()
17
 
18
- # Enable CORS
19
  app.add_middleware(
20
  CORSMiddleware,
21
- allow_origins=["*"],
22
  allow_credentials=True,
23
- allow_methods=["*"],
24
- allow_headers=["*"],
25
  )
26
 
27
- # Define static files directory
28
  STATIC_DIR = "static"
29
 
30
- # Ensure the static directory exists
31
  if not os.path.exists(STATIC_DIR):
32
  os.makedirs(STATIC_DIR)
33
 
34
- # Mount static files correctly
35
  app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
36
 
37
  @app.get("/", response_class=HTMLResponse)
@@ -43,7 +43,7 @@ async def read_root():
43
  except FileNotFoundError:
44
  raise HTTPException(status_code=404, detail="index.html not found in static folder.")
45
 
46
- # Supported language codes
47
  LANGUAGE_CODES = {
48
  "Anglais": "en",
49
  "Francais": "fr",
@@ -61,7 +61,7 @@ AVAILABLE_MODELS = {
61
  "en-es": "Helsinki-NLP/opus-mt-en-es",
62
  }
63
 
64
- # Cache models to improve performance
65
  @lru_cache(maxsize=10)
66
  def load_translator(src_code: str, tgt_code: str):
67
  model_key = f"{src_code}-{tgt_code}"
@@ -78,31 +78,29 @@ def load_translator(src_code: str, tgt_code: str):
78
  else:
79
  raise ValueError(f"No model available for {src_code} -> {tgt_code}")
80
 
81
- # Split text into chunks
82
  def chunk_text(text, max_length=400):
83
  return textwrap.wrap(text, max_length)
84
 
85
- # Extract text from different file types
86
  def extract_text(file: UploadFile):
87
  try:
88
- file_bytes = file.file.read() # Read file content
89
- file_stream = BytesIO(file_bytes) # Convert to binary stream
90
 
91
  if file.filename.endswith(".txt"):
92
  return file_bytes.decode("utf-8")
93
 
94
  elif file.filename.endswith(".pdf"):
95
- doc = fitz.open(stream=file_stream, filetype="pdf")
96
  return "\n".join([page.get_text() for page in doc])
97
 
98
  elif file.filename.endswith(".docx"):
99
- file_stream.seek(0) # Reset cursor position
100
  doc = Document(file_stream)
101
  return "\n".join([para.text for para in doc.paragraphs])
102
 
103
  elif file.filename.endswith(".xlsx"):
104
- file_stream.seek(0)
105
- wb = openpyxl.load_workbook(file_stream, data_only=True)
106
  text = ""
107
  for sheet in wb.sheetnames:
108
  ws = wb[sheet]
@@ -111,7 +109,6 @@ def extract_text(file: UploadFile):
111
  return text
112
 
113
  elif file.filename.endswith(".pptx"):
114
- file_stream.seek(0)
115
  prs = Presentation(file_stream)
116
  text = ""
117
  for slide in prs.slides:
@@ -126,7 +123,7 @@ def extract_text(file: UploadFile):
126
  except Exception as e:
127
  raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
128
 
129
- # Upload and translate file
130
  @app.post("/upload/")
131
  async def upload_file(
132
  file: UploadFile = File(...),
@@ -136,7 +133,7 @@ async def upload_file(
136
  text = extract_text(file)
137
 
138
  if not text.strip():
139
- raise HTTPException(status_code=400, detail="No text extracted from file.")
140
 
141
  src_code = LANGUAGE_CODES.get(src_lang)
142
  tgt_code = LANGUAGE_CODES.get(tgt_lang)
@@ -145,15 +142,12 @@ async def upload_file(
145
  raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
146
 
147
  try:
148
- # Load translation model
149
  translator = load_translator(src_code, tgt_code)
150
 
151
- # If translation goes through English as an intermediate step
152
  if isinstance(translator, tuple):
153
  translator1, translator2 = translator
154
  intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
155
  translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
156
-
157
  else:
158
  translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
159
 
 
3
  from fastapi.staticfiles import StaticFiles
4
  from transformers import pipeline
5
  import textwrap
6
+ import fitz # PyMuPDF for PDF handling
7
  from docx import Document
8
+ import openpyxl # For Excel
9
  from pptx import Presentation
10
  from fastapi.middleware.cors import CORSMiddleware
11
  from functools import lru_cache
 
15
  # Initialize FastAPI app
16
  app = FastAPI()
17
 
18
+ # Enable CORS to allow frontend communication
19
  app.add_middleware(
20
  CORSMiddleware,
21
+ allow_origins=["*"],
22
  allow_credentials=True,
23
+ allow_methods=["*"],
24
+ allow_headers=["*"],
25
  )
26
 
27
+ # Directory for static files
28
  STATIC_DIR = "static"
29
 
30
+ # Ensure the directory exists
31
  if not os.path.exists(STATIC_DIR):
32
  os.makedirs(STATIC_DIR)
33
 
34
+ # Serve static files correctly
35
  app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
36
 
37
  @app.get("/", response_class=HTMLResponse)
 
43
  except FileNotFoundError:
44
  raise HTTPException(status_code=404, detail="index.html not found in static folder.")
45
 
46
+ # Supported languages
47
  LANGUAGE_CODES = {
48
  "Anglais": "en",
49
  "Francais": "fr",
 
61
  "en-es": "Helsinki-NLP/opus-mt-en-es",
62
  }
63
 
64
+ # Cache models for better performance
65
  @lru_cache(maxsize=10)
66
  def load_translator(src_code: str, tgt_code: str):
67
  model_key = f"{src_code}-{tgt_code}"
 
78
  else:
79
  raise ValueError(f"No model available for {src_code} -> {tgt_code}")
80
 
81
+ # Function to split text into chunks
82
  def chunk_text(text, max_length=400):
83
  return textwrap.wrap(text, max_length)
84
 
85
+ # Function to extract text from files
86
  def extract_text(file: UploadFile):
87
  try:
88
+ file_bytes = file.file.read()
89
+ file_stream = BytesIO(file_bytes)
90
 
91
  if file.filename.endswith(".txt"):
92
  return file_bytes.decode("utf-8")
93
 
94
  elif file.filename.endswith(".pdf"):
95
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
96
  return "\n".join([page.get_text() for page in doc])
97
 
98
  elif file.filename.endswith(".docx"):
 
99
  doc = Document(file_stream)
100
  return "\n".join([para.text for para in doc.paragraphs])
101
 
102
  elif file.filename.endswith(".xlsx"):
103
+ wb = openpyxl.load_workbook(file_stream)
 
104
  text = ""
105
  for sheet in wb.sheetnames:
106
  ws = wb[sheet]
 
109
  return text
110
 
111
  elif file.filename.endswith(".pptx"):
 
112
  prs = Presentation(file_stream)
113
  text = ""
114
  for slide in prs.slides:
 
123
  except Exception as e:
124
  raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
125
 
126
+ # Correctly defined POST route for file upload
127
  @app.post("/upload/")
128
  async def upload_file(
129
  file: UploadFile = File(...),
 
133
  text = extract_text(file)
134
 
135
  if not text.strip():
136
+ raise HTTPException(status_code=400, detail="No text extracted from the file.")
137
 
138
  src_code = LANGUAGE_CODES.get(src_lang)
139
  tgt_code = LANGUAGE_CODES.get(tgt_lang)
 
142
  raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
143
 
144
  try:
 
145
  translator = load_translator(src_code, tgt_code)
146
 
 
147
  if isinstance(translator, tuple):
148
  translator1, translator2 = translator
149
  intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
150
  translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
 
151
  else:
152
  translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
153