yuvrajpant56 committed on
Commit
bfcb442
·
verified ·
1 Parent(s): 1c27fba

Create app.py

Browse files

Takes the user's keywords,
queries the arXiv API for the latest papers,
uses TF-IDF + cosine similarity to semantically rank those papers by relevance,
and displays the top papers in a Gradio web interface.

Files changed (1) hide show
  1. app.py +176 -0
app.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """------Applied TF-IDF for better semantic search------"""
2
+ import feedparser
3
+ import urllib.parse
4
+ import yaml
5
+ from tools.final_answer import FinalAnswerTool
6
+ import numpy as np
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import gradio as gr
10
+ from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
11
+ import nltk
12
+
13
+ import datetime
14
+ import requests
15
+ import pytz
16
+ from tools.final_answer import FinalAnswerTool
17
+
18
+ from Gradio_UI import GradioUI
19
+
20
+ nltk.download("stopwords")
21
+ from nltk.corpus import stopwords
22
+
23
@tool  # Register the function as a smolagents tool
def fetch_latest_arxiv_papers(keywords: list, num_results: int = 5) -> list:
    """Fetches and ranks arXiv papers using TF-IDF and Cosine Similarity.

    Requests the 50 most recently submitted arXiv papers that match every
    keyword, then re-ranks them by the cosine similarity between the TF-IDF
    vector of the keywords and that of each paper's title plus abstract.

    Args:
        keywords: List of keywords for search.
        num_results: Number of results to return.

    Returns:
        List of the most relevant papers based on TF-IDF ranking. Each paper is
        a dict with the keys "title", "authors", "year", "abstract" and "link".
        On failure, or when nothing matches, a one-element list holding a dict
        with a single "error" key is returned instead.
    """
    try:
        print(f"DEBUG: Searching arXiv papers with keywords: {keywords}")

        # A bare string would otherwise be iterated character by character.
        if isinstance(keywords, str):
            keywords = [keywords]
        keywords = [str(kw).strip() for kw in keywords if str(kw).strip()]
        if not keywords:
            return [{"error": "Error: Please enter at least one valid keyword."}]

        # Build the query with real spaces and escape it exactly once.
        # Joining with a literal "+" and then percent-quoting turned every
        # separator into "%2B", i.e. a literal plus sign instead of a space.
        search_query = " AND ".join(f"all:{kw}" for kw in keywords)
        params = urllib.parse.urlencode(
            {
                "search_query": search_query,
                "start": 0,
                "max_results": 50,
                "sortBy": "submittedDate",
                "sortOrder": "descending",
            },
            safe=":",  # keep the "all:" field prefix readable in the URL
        )
        url = f"http://export.arxiv.org/api/query?{params}"

        print(f"DEBUG: Query URL - {url}")

        feed = feedparser.parse(url)

        if not feed.entries:
            # feedparser records fetch/parse failures in `bozo` instead of
            # raising, so separate "request failed" from "nothing matched".
            if feed.get("bozo"):
                reason = feed.get("bozo_exception", "malformed or unreachable feed")
                return [{"error": f"Error fetching research papers: {reason}"}]
            return [{"error": "No results found. Try different keywords."}]

        # Extract papers from arXiv
        papers = [
            {
                "title": entry.title,
                "authors": ", ".join(author.name for author in entry.authors),
                "year": entry.published[:4],  # leading four characters of the timestamp
                "abstract": entry.summary,
                "link": entry.link,
            }
            for entry in feed.entries
        ]

        # Prepare TF-IDF vectorization over "title abstract" of every paper
        corpus = [f"{paper['title']} {paper['abstract']}" for paper in papers]
        vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))  # Remove stopwords
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Project the query into the vocabulary learned from the corpus
        query_vec = vectorizer.transform([" ".join(keywords)])

        # One similarity score per paper, in corpus order
        similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

        # Highest similarity first; sorted() is stable, so ties keep the
        # newest-first order in which the API returned them.
        ranked_papers = sorted(
            zip(papers, similarity_scores), key=lambda pair: pair[1], reverse=True
        )

        # A negative count would slice from the end of the list; clamp to zero.
        top_n = max(num_results, 0)
        return [paper for paper, _score in ranked_papers[:top_n]]

    except Exception as e:
        # Tool boundary: report the failure as data rather than raising.
        print(f"ERROR: {str(e)}")
        return [{"error": f"Error fetching research papers: {str(e)}"}]
79
+
80
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.
    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    # NOTE(review): docstring wording is kept as-is; presumably @tool derives
    # the tool description from it - confirm against the smolagents docs.
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    try:
        # Resolve the zone name, then read the clock in that zone.
        zone = pytz.timezone(timezone)
        now_in_zone = datetime.datetime.now(zone)
        return f"The current local time in {timezone} is: {now_in_zone.strftime(timestamp_format)}"
    except Exception as e:
        # Any failure is reported as text instead of being raised.
        return f"Error fetching time for timezone '{timezone}': {str(e)}"
94
+
95
+
96
# Tool instance handed to the agent below alongside the arXiv search tool.
final_answer = FinalAnswerTool()


# AI Model
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
    custom_role_conversions=None,
)

# Import tool from Hub
# NOTE(review): this tool is loaded but not included in the agent's `tools`
# list below - confirm whether it is still needed.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)


# Load prompt templates
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("prompts.yaml", 'r', encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)

# Create the AI Agent
agent = CodeAgent(
    model=model,
    tools=[final_answer, fetch_latest_arxiv_papers],  # Add your tools here
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name="ScholarAgent",
    description="An AI agent that fetches the latest research papers from arXiv based on user-defined keywords and filters.",
    prompt_templates=prompt_templates
)
127
+
128
+
129
+
130
+ #Search Papers
131
def search_papers(user_input):
    """Search arXiv for the user's keywords and render the top papers as Markdown.

    Args:
        user_input: Comma-separated keywords typed by the user. ``None`` is
            treated like an empty string.

    Returns:
        A Markdown string with one section per paper (three are requested),
        or a plain-text message when the input is empty, the lookup reports
        an error, or nothing usable comes back.
    """
    # Split on commas and drop blank fragments such as "a, ,b" or a trailing comma.
    keywords = [kw.strip() for kw in (user_input or "").split(",") if kw.strip()]
    print(f"DEBUG: Received input keywords - {keywords}")  # Debug user input

    if not keywords:
        print("DEBUG: No valid keywords provided.")
        return "Error: Please enter at least one valid keyword."

    results = fetch_latest_arxiv_papers(keywords, num_results=3)  # Fetch 3 results
    print(f"DEBUG: Results received - {results}")  # Debug function output

    # Anything other than a non-empty list of dicts is treated as "no results".
    if not isinstance(results, list) or not results or not isinstance(results[0], dict):
        print("DEBUG: No results found.")
        return "No results found. Try different keywords."

    # The lookup signals failure as a single {"error": message} entry.
    if "error" in results[0]:
        return results[0]["error"]  # Return the error message directly

    max_abstract_chars = 500
    sections = []
    for paper in results:
        abstract = paper['abstract']
        # Only flag the abstract as truncated when text was actually cut off.
        if len(abstract) > max_abstract_chars:
            abstract = f"{abstract[:max_abstract_chars]}... *(truncated for readability)*"
        sections.append(
            f"---\n\n"
            f"📌 **Title:** {paper['title']}\n\n"
            f"👨‍🔬 **Authors:** {paper['authors']}\n\n"
            f"📅 **Year:** {paper['year']}\n\n"
            f"📖 **Abstract:** {abstract}\n\n"
            f"[🔗 Read Full Paper]({paper['link']})\n\n"
        )
    return "\n\n".join(sections)
161
+
162
+
163
+
164
+ # Create Gradio UI
165
with gr.Blocks() as demo:
    # Page layout, top to bottom: heading, keyword box, results pane, button.
    gr.Markdown("# ScholarAgent")
    keyword_input = gr.Textbox(
        label="Enter keywords(comma-separated) or even full sentences ",
        placeholder="e.g., deep learning, reinforcement learning or NLP in finance or Deep learning in Medicine",
    )
    output_display = gr.Markdown()
    search_button = gr.Button("Search")

    # A click passes the textbox contents to search_papers and shows the
    # returned string in the Markdown pane.
    search_button.click(
        search_papers,
        inputs=[keyword_input],
        outputs=[output_display],
    )

    print("DEBUG: Gradio UI is running. Waiting for user input...")

# Launch Gradio App
demo.launch()