SMSristi committed on
Commit
433f3fd
·
1 Parent(s): 700b796
Files changed (1) hide show
  1. app.py +32 -19
app.py CHANGED
@@ -29,19 +29,19 @@ tts_models = None
29
  qa_model = None
30
  summarization_model = None
31
 
32
- # ==================== SURYA OCR MODULE ====================
33
  def load_surya_models():
34
  """Load Surya OCR models (cached)"""
35
  global ocr_models
36
  if ocr_models is None:
37
- # ✅ CORRECT imports for latest Surya
38
- from surya.model.detection.model import load_model as load_detection_model
39
- from surya.model.detection.model import load_processor as load_detection_processor
40
  from surya.model.recognition.model import load_model as load_recognition_model
41
  from surya.model.recognition.processor import load_processor as load_recognition_processor
42
 
43
- det_model = load_detection_model()
44
  det_processor = load_detection_processor()
 
45
  rec_model = load_recognition_model()
46
  rec_processor = load_recognition_processor()
47
 
@@ -54,12 +54,15 @@ def load_surya_models():
54
  return ocr_models
55
 
56
  def extract_text_with_surya(pdf_path):
57
- """Extract text from PDF using Surya OCR"""
58
- from surya.ocr import run_ocr
 
 
 
59
 
60
  models = load_surya_models()
61
 
62
- # Read PDF file from path
63
  if isinstance(pdf_path, str):
64
  with open(pdf_path, 'rb') as f:
65
  pdf_bytes = f.read()
@@ -71,25 +74,35 @@ def extract_text_with_surya(pdf_path):
71
  # Convert PDF to images
72
  images = convert_from_bytes(pdf_bytes)
73
 
74
- # Prepare language list (one list per image)
75
- langs = [["bn"]] * len(images) # Must be list of lists
76
 
77
- # Run OCR on all images
78
- predictions = run_ocr(
 
 
 
79
  images,
80
- langs,
81
  models['det_model'],
82
- models['det_processor'],
 
 
 
 
 
 
83
  models['rec_model'],
84
- models['rec_processor']
 
85
  )
86
 
87
- # Extract text from predictions
88
  full_text = ""
89
- for page_result in predictions:
90
  page_text = ""
91
- for text_line in page_result.text_lines:
92
- page_text += text_line.text + " "
 
93
  full_text += page_text.strip() + "\n\n"
94
 
95
  return full_text.strip()
 
29
  qa_model = None
30
  summarization_model = None
31
 
32
+ # ==================== SURYA OCR MODULE (v0.17.0) ====================
33
  def load_surya_models():
34
  """Load Surya OCR models (cached)"""
35
  global ocr_models
36
  if ocr_models is None:
37
+ # ✅ CORRECT imports for Surya 0.17.0
38
+ from surya.model.detection.segformer import load_model as load_detection_model
39
+ from surya.model.detection.segformer import load_processor as load_detection_processor
40
  from surya.model.recognition.model import load_model as load_recognition_model
41
  from surya.model.recognition.processor import load_processor as load_recognition_processor
42
 
 
43
  det_processor = load_detection_processor()
44
+ det_model = load_detection_model()
45
  rec_model = load_recognition_model()
46
  rec_processor = load_recognition_processor()
47
 
 
54
  return ocr_models
55
 
56
  def extract_text_with_surya(pdf_path):
57
+ """Extract text from PDF using Surya OCR v0.17.0"""
58
+ # ✅ Import the actual OCR function from correct location
59
+ from surya.recognition import batch_recognition
60
+ from surya.detection import batch_detection
61
+ from surya.input.processing import convert_if_not_rgb
62
 
63
  models = load_surya_models()
64
 
65
+ # Read PDF file
66
  if isinstance(pdf_path, str):
67
  with open(pdf_path, 'rb') as f:
68
  pdf_bytes = f.read()
 
74
  # Convert PDF to images
75
  images = convert_from_bytes(pdf_bytes)
76
 
77
+ # Convert to RGB if needed
78
+ images = [convert_if_not_rgb(img) for img in images]
79
 
80
+ # Language for each image
81
+ langs = [["bn"]] * len(images)
82
+
83
+ # Step 1: Detect text lines
84
+ line_predictions = batch_detection(
85
  images,
 
86
  models['det_model'],
87
+ models['det_processor']
88
+ )
89
+
90
+ # Step 2: Recognize text from detected lines
91
+ text_predictions = batch_recognition(
92
+ images,
93
+ langs,
94
  models['rec_model'],
95
+ models['rec_processor'],
96
+ bboxes=line_predictions
97
  )
98
 
99
+ # Extract text
100
  full_text = ""
101
+ for page_result in text_predictions:
102
  page_text = ""
103
+ if hasattr(page_result, 'text_lines'):
104
+ for text_line in page_result.text_lines:
105
+ page_text += text_line.text + " "
106
  full_text += page_text.strip() + "\n\n"
107
 
108
  return full_text.strip()