import os

import gradio as gr
from docx import Document
from groq import Groq
from PyPDF2 import PdfReader

# Groq API key, read from the "LitReview" environment variable
GROQ_API_KEY = os.getenv("LitReview")

# Initialize the Groq client
client = Groq(api_key=GROQ_API_KEY)


def extract_text_from_file(file):
    """Extracts text from an uploaded PDF or DOCX file."""
    # Gradio may pass a file path (str) or a file-like object with a .name
    # attribute, depending on the version; normalize to a path either way.
    path = file if isinstance(file, str) else file.name
    if path.endswith(".pdf"):
        reader = PdfReader(path)
        # extract_text() can return None for image-only pages, so fall back to ""
        return "".join(page.extract_text() or "" for page in reader.pages)
    elif path.endswith(".docx"):
        doc = Document(path)
        return "\n".join(p.text for p in doc.paragraphs)
    else:
        return "Unsupported file format. Please upload a PDF or DOCX file."


def chunk_text(text, max_chars=4000):
    """
    Splits text into chunks small enough for the Llama model to process.
    Chunk size is capped at roughly `max_chars` characters, a cheap proxy
    for tokens.
    """
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        current_chunk.append(word)
        if len(" ".join(current_chunk)) > max_chars:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def analyze_chunk(chunk):
    """Analyzes a single chunk of text using the Llama model."""
    messages = [
        {
            "role": "system",
            "content": (
                "You are an experienced scholar tasked with analyzing research articles. "
                "Focus on extracting insights based on: Author (APA format with et al. if applicable), "
                "Year of publication, Title of the article; Problem addressed; "
                "Methodology (datasets, tools, techniques, algorithms); "
                "Results (specific, quantifiable metrics); and Remarks (strengths, weaknesses, improvements). "
                "Summarize only insights related to these fields and disregard irrelevant content."
            ),
        },
        {"role": "user", "content": chunk},
    ]
    try:
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False,
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"An error occurred while processing a chunk: {e}"


def analyze_document(file):
    """Processes and analyzes the uploaded document, streaming progress updates."""
    text = extract_text_from_file(file)
    if text.startswith("Unsupported file format"):
        yield f"**Error:** {text}"
        return

    chunks = chunk_text(text)
    all_insights = []
    yield "**Processing the document. Please wait...**\n"

    for i, chunk in enumerate(chunks, 1):
        yield f"**Processing chunk {i} of {len(chunks)}...**"
        result = analyze_chunk(chunk)
        if result.strip():  # Only keep non-empty results
            all_insights.append(result)

    if not all_insights:
        yield "**Error:** No valid insights were extracted from the document."
        return

    yield "**Consolidating all insights into a final summary...**"
    consolidated_summary_prompt = (
        "Below are insights extracted from multiple chunks of a document. "
        "Consolidate these insights into a single output organized as follows: "
        "Author, Year, Title; Problem addressed; Methodology; Results; and Remarks. "
        "Make the final output concise and coherent."
    )
    try:
        final_summary = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": consolidated_summary_prompt},
                {"role": "user", "content": "\n\n".join(all_insights)},
            ],
            temperature=0.7,
            max_tokens=4096,
            top_p=1,
            stream=False,
        )
        yield f"**Final Summary:**\n\n{final_summary.choices[0].message.content}"
    except Exception as e:
        yield f"**Error:** An error occurred during consolidation: {e}"


# Define the Gradio interface
interface = gr.Interface(
    fn=analyze_document,
    inputs=gr.File(label="Upload a PDF or DOCX file"),
    outputs=gr.Markdown(label="Literature Analysis"),
    title="Automated Literature Review",
    description=(
        "Upload a PDF or DOCX document, and this tool will analyze it to extract "
        "and consolidate its key insights. Processing can take a while, especially "
        "for longer documents, so smaller files are recommended."
    ),
)

# Launch the interface
if __name__ == "__main__":
    interface.launch()
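
# Example usage (a minimal sketch, assuming the script is saved as app.py and
# the Groq API key is exported under the "LitReview" environment variable that
# os.getenv() reads above; the key value shown is a placeholder):
#
#   export LitReview="gsk_..."
#   python app.py
#
# Gradio then serves the interface locally (http://127.0.0.1:7860 by default);
# pass share=True to interface.launch() if a temporary public link is needed.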