|
import gradio as gr |
|
import pandas as pd |
|
import re |
|
import os |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
# Checkpoint identifier shared by the tokenizer and the model, so both are
# always loaded from the same pretrained weights.
MODEL_NAME = "potsawee/t5-large-generation-squad-QuestionAnswer"

# Loaded once at import time and reused by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
|
|
|
def generate_question_answer_pairs(input_text):
    """Generate one question/answer pair per sentence of ``input_text``.

    The text is split into sentences, each sentence is passed through the
    module-level ``tokenizer``/``model``, and the decoded output is split at
    its first ``?`` into a question and an answer. Outputs without a ``?``
    are skipped.

    Args:
        input_text: Free-form text to generate questions from. May be
            ``None`` or blank, in which case no generation is attempted.

    Returns:
        A ``(text, csv_path)`` tuple matching the two interface outputs:
        ``text`` is the human-readable list of pairs and ``csv_path`` is the
        path of the CSV file written by this call (``"QAPairs.csv"`` in the
        current working directory, overwritten on every call). For missing
        or blank input the tuple is ``("Please enter a text", None)``.
    """
    # An empty textbox arrives as "" rather than None, and the interface has
    # two outputs, so the guard must cover blank input and return a pair.
    if input_text is None or not input_text.strip():
        return "Please enter a text", None

    # Split on whitespace that follows sentence-ending punctuation. Requiring
    # the whitespace keeps tokens such as "3.14" intact, and the filter drops
    # empty pieces so the model is never run on an empty string.
    sentences = [
        piece.strip()
        for piece in re.split(r'(?<=[.!?])\s+', input_text)
        if piece.strip()
    ]

    rows = []
    for sentence in sentences:
        input_ids = tokenizer.encode(sentence, return_tensors="pt")
        outputs = model.generate(input_ids, max_length=100, num_return_sequences=1)
        question_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Everything up to the first "?" is the question; the remainder is
        # the answer (kept whole even if it contains another "?").
        question, separator, answer = question_answer.partition("?")
        if not separator:
            continue
        rows.append({'Question': question + "?", 'Answer': answer.strip()})

    # Build the frame once instead of concatenating inside the loop.
    df = pd.DataFrame(rows, columns=['Question', 'Answer'])
    df.to_csv("QAPairs.csv")

    result = "".join(
        f"Question: {row['Question']}\nAnswer: {row['Answer']}\n\n"
        for row in rows
    )
    return result, "QAPairs.csv"
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: one text input, two outputs (generated pairs + CSV download).
# ---------------------------------------------------------------------------
title = "Question-Answer Pairs Generation"

# Multi-line box where the user pastes the source text.
input_text = gr.Textbox(label="Text", lines=4)

# Output components; they are listed below in the same order as the values
# returned by generate_question_answer_pairs (text first, then file path).
output_file = gr.File(label="Download as csv")
output_text = gr.Textbox()

interface = gr.Interface(
    generate_question_answer_pairs,
    input_text,
    [output_text, output_file],
    title=title,
)

# Start the web app as soon as the script runs.
interface.launch()