File size: 3,363 Bytes
8953fd3
 
 
 
 
 
ab9f628
8953fd3
ab9f628
 
 
 
 
 
 
 
 
 
 
 
f069ded
 
 
3146891
ab9f628
f069ded
 
 
3146891
8953fd3
 
 
 
 
 
 
 
 
 
 
 
3ce84fd
 
 
8953fd3
 
 
3ce84fd
 
 
8953fd3
3ce84fd
8953fd3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3146891
659a935
f069ded
ab9f628
 
8953fd3
 
3146891
8953fd3
ab9f628
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
import gradio as gr
from transformers import ViTFeatureExtractor, ViTModel
from PIL import Image
from transformers import AutoTokenizer, AutoModel
import torch
from pdf2image import convert_from_path

# Stylesheet for the app's button look; it is passed to the Gradio interface
# through its `css` argument. Spelled out as adjacent literals so the trailing
# spaces carried by the rule stay visible.
css = (
    "\n"
    ".button { \n"
    "    padding: 10px 20px; \n"
    "    background: #007BFF; \n"
    "    color: white; \n"
    "    border: none; \n"
    "    cursor: pointer; \n"
    "    font-size: 16px; \n"
    "    margin: 10px;\n"
    "}\n"
)

# Components intended for a custom layout.
# NOTE(review): `layout` is not referenced by the code visible in this file;
# the interface defined further down builds its own components.
# gr.Row accepts keyword arguments only (children are added by using it as a
# context manager inside gr.Blocks), and `type` is not a documented gr.Button
# option, so the components are kept as a flat list rather than wrapped in rows.
layout = [
    gr.File(label="Upload PDF", type="filepath"),
    gr.Button("Generate Insights"),
    gr.Textbox("Placeholder for PDF insights", label="Insights", type="text"),
]


# Function to get image embeddings using ViT
# Loaded (feature_extractor, model) pairs keyed by checkpoint name, so the
# weights are read once per process instead of once per image.
_VIT_CACHE = {}


def _load_vit(model_name):
    """Return the cached ViT feature extractor and model for ``model_name``."""
    if model_name not in _VIT_CACHE:
        _VIT_CACHE[model_name] = (
            ViTFeatureExtractor.from_pretrained(model_name),
            ViTModel.from_pretrained(model_name),
        )
    return _VIT_CACHE[model_name]


def get_image_embeddings(image_path, model_name='google/vit-base-patch16-224'):
    """Return a mean-pooled ViT embedding for the image stored at ``image_path``.

    Args:
        image_path: Path of an image file that PIL can open.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            feature extractor and the model.

    Returns:
        The model's ``last_hidden_state`` averaged over dimension 1, computed
        with gradient tracking disabled.
    """
    feature_extractor, model = _load_vit(model_name)

    # Release the file handle as soon as the pixels are loaded, and force three
    # channels so grayscale or RGBA pages do not reach the model in another mode.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    inputs = feature_extractor(images=image, return_tensors="pt")

    # Inference only: skipping autograd avoids holding the graph in memory.
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1)  # Mean pooling

# Function to convert PDF to images
def pdf_to_images(pdf_file, img_dir):
    """Render every page of ``pdf_file`` to a PNG file inside ``img_dir``.

    Pages are written as ``page_1.png``, ``page_2.png``, ... and the directory
    is created when it does not exist yet.

    Args:
        pdf_file: Path of the PDF handed to ``convert_from_path``.
        img_dir: Directory that receives the page images.

    Returns:
        The paths of the files written, in page order, so callers do not have
        to re-list the directory to find them.
    """
    images = convert_from_path(pdf_file)

    # Create the directory if it doesn't exist
    os.makedirs(img_dir, exist_ok=True)

    saved_paths = []
    for page_number, image in enumerate(images, start=1):
        image_path = os.path.join(img_dir, f"page_{page_number}.png")
        image.save(image_path, "PNG")
        saved_paths.append(image_path)

    print(f"Converted {len(images)} pages to images and saved in {img_dir}")
    return saved_paths

# Function to get text embeddings using a transformer model
# Loaded (tokenizer, model) pairs keyed by checkpoint name, so the weights are
# read once per process instead of once per call.
_TEXT_MODEL_CACHE = {}


def _load_text_model(model_name):
    """Return the cached tokenizer and model for ``model_name``."""
    if model_name not in _TEXT_MODEL_CACHE:
        _TEXT_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _TEXT_MODEL_CACHE[model_name]


def get_text_embeddings(text, model_name='bert-base-uncased'):
    """Return mean-pooled transformer embeddings for ``text``.

    Args:
        text: The input handed to the tokenizer (tokenized with padding and
            truncation to at most 512 tokens).
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        The model's ``last_hidden_state`` averaged over dimension 1, computed
        with gradient tracking disabled. When the tokenizer supplies an
        attention mask, padded positions are left out of the average.
    """
    tokenizer, model = _load_text_model(model_name)

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Inference only: skipping autograd avoids holding the graph in memory.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    if "attention_mask" not in inputs:
        return hidden.mean(dim=1)  # Mean pooling

    # Weight each position by its mask value so padding added for batched
    # inputs does not pull the average toward the pad-token representation.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden * mask).sum(dim=1) / token_counts

# Function to process PDF and generate a response
def _page_number(filename):
    """Return the trailing integer of a ``page_<n>.png`` name (0 if absent)."""
    stem = os.path.splitext(filename)[0]
    suffix = stem.rpartition("_")[2]
    return int(suffix) if suffix.isdigit() else 0


def process_pdf_and_generate_response(pdf_file):
    """Embed the pages of an uploaded PDF and return the response text.

    The PDF is rendered to page images, each page image is embedded, a
    placeholder text is embedded, and the embeddings are concatenated. The
    response itself is still a fixed placeholder string.

    Args:
        pdf_file: Path of the uploaded PDF; may be empty or ``None`` when the
            form is submitted without a file.

    Returns:
        The response string shown to the user.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    if not pdf_file:
        return "Please upload a PDF file."

    # A fresh directory per request: a fixed shared folder would keep pages
    # from earlier uploads and mix files between concurrent requests.
    with tempfile.TemporaryDirectory(prefix="pdf_images_") as img_dir:
        # Convert PDF to images
        pdf_to_images(pdf_file, img_dir)

        # os.listdir gives no ordering guarantee, so sort by page number
        # (numeric, so page_10 comes after page_2).
        page_files = sorted(
            (name for name in os.listdir(img_dir) if name.endswith(".png")),
            key=_page_number,
        )

        # Generate embeddings for each image
        image_embeddings = [
            get_image_embeddings(os.path.join(img_dir, name)) for name in page_files
        ]

    # Perform some text analysis on the PDF content (replace with your logic)
    pdf_text = "PDF content analysis placeholder"
    text_embeddings = get_text_embeddings(pdf_text)

    # Combine image and text embeddings and generate a response (replace with your logic)
    # The combined tensor is not consumed yet; it is kept for the logic to come.
    combined_embeddings = torch.cat([*image_embeddings, text_embeddings], dim=0)
    response = "Response based on the processed PDF"
    return response

# Input and output widgets, created in the same order as before: the PDF upload
# (delivered to the handler as a file path) and the text box for the insights.
_pdf_upload = gr.File(label="Upload PDF", type="filepath")
_insights_box = gr.Textbox("Placeholder for PDF insights", label="Insights", type="text")

# Single-function app: the uploaded PDF goes in, the insights text comes out.
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=_pdf_upload,
    outputs=_insights_box,
    title="pdf-chatbot",
    description="Upload a PDF and receive insights based on its content.",
    css=css,  # custom stylesheet defined near the top of the file
)


# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()