import os
import time

import numpy as np
import requests
import streamlit as st
import torch
import google.generativeai as genai
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Load SciBERT model and tokenizer once at module import.
SCIBERT_MODEL = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_MODEL)
model = AutoModel.from_pretrained(SCIBERT_MODEL)
model.eval()  # inference only — disables dropout for deterministic embeddings

# Configure the Gemini API. The original code never called genai.configure(),
# so every generate_content() call would fail. Read the key from the
# environment instead of hard-coding a secret in the source.
_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if _GEMINI_API_KEY:
    genai.configure(api_key=_GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("models/gemini-1.5-pro")


def fetch_papers(query, limit=5):
    """Fetch research papers from the Semantic Scholar Graph API.

    Args:
        query: Free-text search string.
        limit: Maximum number of papers to request.

    Returns:
        List of (title, abstract, citation) tuples. Papers whose abstract
        is missing or null are skipped.
    """
    url = (
        "https://api.semanticscholar.org/graph/v1/paper/search"
        f"?query={query}&limit={limit}"
        "&fields=title,abstract,authors,year,citationCount"
    )
    status, response = get_response(url)
    if status != 200:
        return []

    papers = []
    for paper in response.json().get("data", []):
        # The API returns "abstract": null for many records; the original
        # `"abstract" in paper` test let those through as None.
        abstract = paper.get("abstract")
        if not abstract:
            continue
        title = paper["title"]
        year = paper.get("year", "N/A")
        citation_count = paper.get("citationCount", 0)
        authors = ", ".join(
            author["name"] for author in paper.get("authors", []) if "name" in author
        )
        citation = f"{title}, {authors}, {year} (Cited by {citation_count})"
        papers.append((title, abstract, citation))
    return papers


def extract_embeddings(text):
    """Return a 1-D mean-pooled SciBERT embedding (numpy array) for *text*."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # no_grad: we only need activations — the original built a gradient
    # graph for every abstract, wasting memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    return torch.mean(outputs.last_hidden_state, dim=1).numpy().squeeze()


def get_response(url, max_retries=5, backoff_factor=2):
    """GET *url*, retrying with exponential backoff on HTTP 429.

    Args:
        url: Fully-formed URL to fetch.
        max_retries: Maximum number of attempts.
        backoff_factor: Base of the exponential backoff (seconds).

    Returns:
        (status_code, response) from the last attempt.
    """
    response = None
    for attempt in range(max_retries):
        # Explicit timeout: requests has none by default and would hang forever.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            return response.status_code, response
        if response.status_code == 429:  # rate limited — back off and retry
            time.sleep(backoff_factor ** attempt)
        else:
            break  # non-retryable error
    st.write("Server busy. Please retry later.")
    return response.status_code, response


def analyze_literature(query):
    """Fetch papers, embed abstracts, cluster them, and summarize research gaps.

    Args:
        query: Research topic entered by the user.

    Returns:
        (review_markdown, citations_markdown) strings for display.
    """
    papers = fetch_papers(query, limit=10)
    valid_papers = [(title, abstract, citation)
                    for title, abstract, citation in papers if abstract]
    if not valid_papers:
        return "No research papers found.", "No citations available."

    try:
        embeddings = np.array(
            [extract_embeddings(abstract) for _, abstract, _ in valid_papers]
        )
        if embeddings.ndim != 2:
            return "Error: Embeddings have incorrect dimensions.", "No citations available."

        # Cluster research topics. NOTE(review): the labels are currently
        # unused in the report; kept for parity with the original behavior.
        kmeans = KMeans(
            n_clusters=min(3, len(embeddings)), random_state=42
        ).fit(embeddings)
        _clusters = kmeans.labels_

        similarity_matrix = cosine_similarity(embeddings)

        # Research gaps = pairs of papers whose abstracts are dissimilar.
        # Upper triangle only (j > i): the original scanned the full matrix
        # and reported every pair twice (and i == j would never match since
        # the diagonal is 1.0).
        n = len(similarity_matrix)
        low_sim_indices = [
            (i, j)
            for i in range(n)
            for j in range(i + 1, n)
            if similarity_matrix[i][j] < 0.5
        ]

        # Summarize via Gemini using the first 300 chars of each abstract.
        paper_texts = [
            f"- {title}: {abstract[:300]}..." for title, abstract, _ in valid_papers
        ]
        paper_text_str = "\n".join(paper_texts)
        gemini_prompt = (
            f"Summarize the key findings and propose research gaps for:\n{paper_text_str}"
        )
        response = gemini_model.generate_content(gemini_prompt)
        # The response may lack `.text` (e.g. blocked content) — degrade gracefully.
        gemini_summary = getattr(response, "text", "Summary not available.")

        output = f"## Literature Review for '{query}'\n\n"
        output += gemini_summary + "\n\n"
        output += "## Research Gaps Identified:\n"
        for i, j in low_sim_indices:
            # Index into valid_papers: the embeddings (and hence i, j) were
            # built from valid_papers, not the unfiltered `papers` list the
            # original indexed — a wrong-paper bug whenever any fetched
            # paper lacked an abstract.
            output += (
                f"- **Paper:** '{valid_papers[i][0]}' has low similarity "
                f"with **'{valid_papers[j][0]}'**.\n"
            )

        citations = "\n\n".join(
            f"**{title}**\n{citation}" for title, _, citation in papers
        )
        return output, citations
    except Exception as e:
        return f"Error processing literature: {str(e)}", "No citations available."


# Streamlit UI
st.title("📚 Automated Literature Review & Research Gap Finder")
query = st.text_input("Enter a research topic:", "Capital Structure in Indian firms")
if st.button("Analyze Literature"):
    with st.spinner("Fetching and analyzing papers..."):
        review_text, citation_text = analyze_literature(query)
    st.subheader("Literature Review Summary")
    st.write(review_text)
    st.subheader("Citations & References")
    st.write(citation_text)