"""------Applied TF-IDF for better semantic search------"""

import feedparser
import urllib.parse
import yaml
from tools.final_answer import FinalAnswerTool
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gradio as gr
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
import nltk
import datetime
import requests
import pytz
from Gradio_UI import GradioUI

# Download the NLTK stop-word list used by the TF-IDF vectorizer below.
nltk.download("stopwords")
from nltk.corpus import stopwords
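
# Ranking approach used below: each paper's title + abstract is vectorized with TF-IDF
# (English stop words removed), the keyword query is projected into the same vector
# space, and papers are ordered by cosine similarity to the query before the top
# results are returned.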


@tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """Fetches and ranks arXiv papers using TF-IDF and cosine similarity.

    Args:
        keywords: List of keywords for the search.
        num_results: Number of results to return.

    Returns:
        List of the most relevant papers based on TF-IDF ranking.
    """
    try:
        print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

        # Build the arXiv API query: every keyword must match in any field ("all:"),
        # joined with AND, then URL-encoded.
        query = "+AND+".join([f"all:{kw}" for kw in keywords])
        query_encoded = urllib.parse.quote(query)
        url = f"http://export.arxiv.org/api/query?search_query={query_encoded}&start=0&max_results=50&sortBy=submittedDate&sortOrder=descending"

        print(f"DEBUG: Query URL - {url}")

        # Parse the Atom feed returned by the arXiv API.
        feed = feedparser.parse(url)
        papers = []
        for entry in feed.entries:
            papers.append({
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],
                "abstract": entry.summary,
                "link": entry.link,
            })

        if not papers:
            return [{"error": "No results found. Try different keywords."}]

        # Vectorize each paper's title + abstract with TF-IDF, dropping English stop words.
        corpus = [paper["title"] + " " + paper["abstract"] for paper in papers]
        vectorizer = TfidfVectorizer(stop_words=stopwords.words("english"))
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Project the keyword query into the same TF-IDF space.
        query_str = " ".join(keywords)
        query_vec = vectorizer.transform([query_str])

        # Rank papers by cosine similarity between the query vector and each paper vector.
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
        ranked_papers = sorted(zip(papers, similarity_scores), key=lambda x: x[1], reverse=True)

        return [paper[0] for paper in ranked_papers[:num_results]]

    except Exception as e:
        print(f"ERROR: {str(e)}")
        return [{"error": f"Error fetching research papers: {str(e)}"}]


@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.

    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        tz = pytz.timezone(timezone)
        local_time = datetime.datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"


final_answer = FinalAnswerTool()

model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)
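
# HfApiModel routes generation requests to the Hugging Face Inference API for the given
# model_id; max_tokens and temperature above bound the length and randomness of the
# agent's generated responses.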


image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# Load the agent's prompt templates from the local prompts.yaml file.
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)
agent = CodeAgent(
    model=model,
    tools=[final_answer, fetch_latest_arxiv_papers],
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="ScholarAgent",
    description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
    prompt_templates=prompt_templates,
)
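
# Note: image_generation_tool is loaded above but not registered in `tools`, so the agent
# currently exposes only final_answer and fetch_latest_arxiv_papers.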


def search_papers(user_input):
    """Split the user's comma-separated input into keywords, query arXiv, and format the results."""
    keywords = [kw.strip() for kw in user_input.split(",") if kw.strip()]
    print(f"DEBUG: Received input keywords - {keywords}")

    if not keywords:
        print("DEBUG: No valid keywords provided.")
        return "Error: Please enter at least one valid keyword."

    results = fetch_latest_arxiv_papers(keywords, num_results=3)
    print(f"DEBUG: Results received - {results}")

    # Surface tool-level errors (the tool returns a single-element list with an "error" key).
    if isinstance(results, list) and len(results) > 0 and "error" in results[0]:
        return results[0]["error"]

    # Format each paper as a Markdown block for the Gradio output panel.
    if isinstance(results, list) and results and isinstance(results[0], dict):
        formatted_results = "\n\n".join([
            f"---\n\n"
            f"**Title:** {paper['title']}\n\n"
            f"**Authors:** {paper['authors']}\n\n"
            f"**Year:** {paper['year']}\n\n"
            f"**Abstract:** {paper['abstract'][:500]}... *(truncated for readability)*\n\n"
            f"[Read Full Paper]({paper['link']})\n\n"
            for paper in results
        ])
        return formatted_results

    print("DEBUG: No results found.")
    return "No results found. Try different keywords."


# Minimal Gradio front end: a keyword box, a search button, and a Markdown results panel.
with gr.Blocks() as demo:
    gr.Markdown("# ScholarAgent")
    keyword_input = gr.Textbox(
        label="Enter keywords (comma-separated) or even full sentences",
        placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine",
    )
    output_display = gr.Markdown()
    search_button = gr.Button("Search")

    search_button.click(search_papers, inputs=[keyword_input], outputs=[output_display])

print("DEBUG: Gradio UI is running. Waiting for user input...")

demo.launch()