Exam_Checker_AI / transcriber.py
sharoz's picture
innit
c924c3e
import os
import base64
from openai import OpenAI
from dotenv import load_dotenv
from helper import encode_image, pdf_to_images
# from pdf_processor import pdf_to_images
from tqdm import tqdm
import json
load_dotenv()
def transcribe_image(image_path, handwritten_flag=True):
    """Transcribe the text of one image through the OpenAI chat completions API.

    The image is converted with ``encode_image`` and embedded inline in the
    request as a ``data:`` URL, then sent to the ``gpt-4o`` model together
    with a system prompt that forbids correcting mistakes in the input.

    Args:
        image_path: Path of the image, passed straight to ``encode_image``.
        handwritten_flag: When truthy, the system prompt describes the input
            as handwritten text and additionally asks for raw text output;
            otherwise the generic "text" prompt is used.

    Returns:
        The message content of the first choice in the API response.
    """
    # encode_image is a project helper (not shown here); it is expected to
    # return the image as a Base64 string.
    encoded = encode_image(image_path)
    api = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # The prompts are sent to the model verbatim.
    system_cmd = (
        "You are a professional transcriber, you will be given an input image which has handwritten text and your job is to transcribe it to the best of your ability.\n\nYou are not allowed to correct any mistakes in the imput, the output text should be exactly the same as in the image input. \nJust output raw text."
        if handwritten_flag
        else "You are a professional transcriber, you will be given an input image which has text and your job is to transcribe it to the best of your ability.\n\nYou are not allowed to correct any mistakes in the imput, the output text should be exactly the same as in the image input."
    )

    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": system_cmd}],
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Transcribe the following image to text in markdown format.",
            },
            {
                "type": "image_url",
                # NOTE(review): the data URL always declares image/jpeg,
                # whatever the actual format of image_path is.
                "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
            },
        ],
    }

    completion = api.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
    )
    return completion.choices[0].message.content
def transcribe_pdf(pdf_path, interim_files_folder_path, save_dict=False, save_dict_path=None, handwritten_flag=True):
    """Transcribe every page of a PDF and return the text keyed by page.

    The PDF is first converted to page images with ``pdf_to_images`` (called
    with ``zoom_x=2.0`` and ``zoom_y=2.0``), then each image is passed to
    ``transcribe_image`` in order while a ``tqdm`` progress bar is shown.

    Args:
        pdf_path: Path of the PDF, forwarded to ``pdf_to_images``.
        interim_files_folder_path: Folder argument forwarded to
            ``pdf_to_images`` (presumably where the page images are written;
            the helper is not shown here).
        save_dict: When truthy and ``save_dict_path`` is also truthy, the
            result is additionally written to disk as JSON.
        save_dict_path: Destination of the JSON file. If it is empty or
            ``None`` nothing is saved, even when ``save_dict`` is set.
        handwritten_flag: Forwarded to ``transcribe_image`` to choose between
            the handwritten and the generic prompt. Defaults to ``True``,
            which matches the previous hard-wired behaviour.

    Returns:
        A dict mapping ``"page_1"``, ``"page_2"``, ... to the transcription
        of the corresponding page image.
    """
    image_paths = pdf_to_images(pdf_path, interim_files_folder_path, zoom_x=2.0, zoom_y=2.0)

    transcriptions = {}
    # Pages are numbered from 1 in the output keys.
    for page_number, image_path in enumerate(tqdm(image_paths, desc="Transcribing PDF"), start=1):
        transcriptions[f"page_{page_number}"] = transcribe_image(
            image_path, handwritten_flag=handwritten_flag
        )

    if save_dict and save_dict_path:
        # Explicit encoding so the output does not depend on the platform's
        # default locale encoding.
        with open(save_dict_path, "w", encoding="utf-8") as f:
            json.dump(transcriptions, f)

    return transcriptions