SMSristi committed on
Commit
77655bf
·
1 Parent(s): 433f3fd
Files changed (1) hide show
  1. app.py +28 -0
app.py CHANGED
@@ -54,6 +54,34 @@ def load_surya_models():
54
  return ocr_models
55
 
56
  def extract_text_with_surya(pdf_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  """Extract text from PDF using Surya OCR v0.17.0"""
58
  # ✅ Import the actual OCR function from correct location
59
  from surya.recognition import batch_recognition
 
54
  return ocr_models
55
 
56
  def extract_text_with_surya(pdf_path):
57
+ """Extract text from PDF using Surya OCR"""
58
+ det_predictor, rec_predictor = load_surya_models()
59
+
60
+ # Read PDF file from path
61
+ if isinstance(pdf_path, str):
62
+ with open(pdf_path, 'rb') as f:
63
+ pdf_bytes = f.read()
64
+ elif hasattr(pdf_path, 'read'):
65
+ pdf_bytes = pdf_path.read()
66
+ else:
67
+ pdf_bytes = pdf_path
68
+
69
+ images = convert_from_bytes(pdf_bytes)
70
+
71
+ # ✅ CORRECT WAY: Pass detection predictor as parameter
72
+ predictions = rec_predictor(images, det_predictor=det_predictor)
73
+
74
+ # Extract text
75
+ full_text = ""
76
+ for page_result in predictions:
77
+ page_text = ""
78
+ if hasattr(page_result, 'text_lines'):
79
+ for text_line in page_result.text_lines:
80
+ page_text += text_line.text + " "
81
+ full_text += page_text.strip() + "\n\n"
82
+
83
+ return full_text.strip()
84
+
85
  """Extract text from PDF using Surya OCR v0.17.0"""
86
  # ✅ Import the actual OCR function from correct location
87
  from surya.recognition import batch_recognition