Spaces:
Configuration error
Configuration error
Commit
·
d669120
1
Parent(s):
11e2a72
revision
Browse files- __pycache__/agent.cpython-312.pyc +0 -0
- __pycache__/knowledabase.cpython-312.pyc +0 -0
- __pycache__/knowledgebase.cpython-312.pyc +0 -0
- __pycache__/utils.cpython-312.pyc +0 -0
- agent.py +76 -0
- app.py +9 -1
- estructura.txt +39 -0
- knowledgebase.py +86 -0
- requirements.txt +0 -0
- urls.txt +9 -0
- utils.py +71 -0
__pycache__/agent.cpython-312.pyc
ADDED
|
Binary file (3.57 kB). View file
|
|
|
__pycache__/knowledabase.cpython-312.pyc
ADDED
|
Binary file (3.94 kB). View file
|
|
|
__pycache__/knowledgebase.cpython-312.pyc
ADDED
|
Binary file (3.91 kB). View file
|
|
|
__pycache__/utils.cpython-312.pyc
ADDED
|
Binary file (3.44 kB). View file
|
|
|
agent.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.agents import Tool
|
| 2 |
+
from langchain.agents import initialize_agent
|
| 3 |
+
from langchain_openai import ChatOpenAI
|
| 4 |
+
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
|
| 5 |
+
from langchain.chains import LLMChain
|
| 6 |
+
from langchain.prompts import PromptTemplate
|
| 7 |
+
from utils import get_question_context, google_search_result
|
| 8 |
+
|
| 9 |
+
# Prompt template for Marbella tourism questions. The trailing backslashes
# join the lines, so the model receives one single-line instruction block;
# {input} is filled with the user's question (plus any retrieved context).
turism_template = """You are a very experienced tourist guide specialised in recommending activities \
and things to do in Marbella, a city located in Andalusia, Spain. \
You have an excellent knowledge and understanding of restaurants, sports, activities, experiences and places to visit in the city \
specifically targeted to families, couples, friends and solo travelers. \
You have the ability to think, reflect, debate, discuss and evaluate the data stored in a knowledge base from youtube videos related to \
tourism in Marbella, and the ability to make use of it to support your explanations to the future tourists that will visit the city and ask for your advice. \
Remember: Your answer must be accurate and based on your knowledge base. \
Here is a question from a user: \
{input}"""
|
| 19 |
+
|
| 20 |
+
# Fallback prompt for questions unrelated to Marbella tourism. {input} is
# filled with the user's question followed by a summary of a web search result.
# Every continued line ends with a space before the backslash so sentences
# are not fused together when the lines are joined.
default_template = """You are a bot specialised in giving answers to questions about a wide range of topics. \
You are provided with the user question and context from the first non-sponsored URL from a Google search. \
If you don't know the answer simply say I don't know but if you do please answer the question precisely. \
Here is a question from a user and a bit of context from Google Search: \
{input}"""
|
| 25 |
+
|
| 26 |
+
def get_turism_answer(input):
    """Answer a Marbella tourism question through the tourism prompt.

    The question is first handed to ``get_question_context`` (``top_k=3``);
    the text it returns is what fills the ``{input}`` slot of
    ``turism_template``.

    Args:
        input: The user's question.

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt.
    """
    question_with_context = get_question_context(query=input, top_k=3)
    tourism_prompt = PromptTemplate.from_template(turism_template)
    tourism_chain = LLMChain(llm=llm, prompt=tourism_prompt)
    return tourism_chain.run(question_with_context)
|
| 32 |
+
|
| 33 |
+
def get_internet_answer(input):
    """Answer an off-topic question using a web-search summary as context.

    Args:
        input: The user's question.

    Returns:
        Whatever ``LLMChain.run`` returns for ``default_template`` filled with
        the question and the text produced by ``google_search_result``.
    """
    search_summary = google_search_result(input)
    # The question/context labels are runtime prompt text and stay in Spanish.
    prompt_input = f"Pregunta del usuario: {input} \n Contexto para responder a la pregunta del usuario: {search_summary}"
    fallback_prompt = PromptTemplate.from_template(default_template)
    fallback_chain = LLMChain(llm=llm, prompt=fallback_prompt)
    return fallback_chain.run(prompt_input)
|
| 40 |
+
|
| 41 |
+
# Tools offered to the agent: one backed by the tourism knowledge base and one
# backed by a web search. NOTE(review): the descriptions are presumably what
# the agent reads to decide which tool to call -- confirm against LangChain docs.
tools = [
    Tool(
        name='Turism knowledgebase tool',
        func=get_turism_answer,
        description=('Use this tool when answering questions about turism in Marbella.')
    ),
    Tool(
        name='Default knowledgebase tool',
        func=get_internet_answer,
        description=(
            'use this tool when the input question is not related to turism in Marbella.'
        )
    )
]

# Chat model shared by the agent and by both tool functions above, which look
# up the module-level name ``llm`` when they are called.
llm = ChatOpenAI(model='gpt-4',temperature=0)

# conversational memory
# NOTE(review): k=5 presumably limits the window to the last 5 exchanges -- confirm.
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)

# Agent wired with the tools, model and memory defined above.
agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory
)
|
| 74 |
+
|
| 75 |
+
def call_agent(input):
    """Run the module-level agent on ``input`` and return its ``'output'`` entry."""
    agent_result = agent(input)
    return agent_result['output']
|
app.py
CHANGED
|
@@ -1,11 +1,19 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
# Función del bot que procesa el mensaje del usuario
|
| 4 |
def chatbot(message, history=[]):
|
| 5 |
# Agregar el mensaje del usuario al historial
|
| 6 |
history.append(("Usuario:", message))
|
|
|
|
|
|
|
| 7 |
# Generar una respuesta simple del bot
|
| 8 |
-
response = f"Bot:
|
| 9 |
history.append((response,))
|
| 10 |
# Formatear el historial como un bloque de texto
|
| 11 |
chat_history = "\n".join([f"{msg[0]} {msg[1]}" if len(msg) > 1 else msg[0] for msg in history])
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from agent import call_agent
|
| 3 |
+
import os
|
| 4 |
+
from dotenv import load_dotenv, find_dotenv
|
| 5 |
+
_ = load_dotenv(find_dotenv())
|
| 6 |
+
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
|
| 7 |
+
|
| 8 |
|
| 9 |
# Función del bot que procesa el mensaje del usuario
|
| 10 |
def chatbot(message, history=[]):
|
| 11 |
# Agregar el mensaje del usuario al historial
|
| 12 |
history.append(("Usuario:", message))
|
| 13 |
+
# Consultar al agente de OpenAI
|
| 14 |
+
response = call_agent(message)
|
| 15 |
# Generar una respuesta simple del bot
|
| 16 |
+
response = f"Bot:'{response}'"
|
| 17 |
history.append((response,))
|
| 18 |
# Formatear el historial como un bloque de texto
|
| 19 |
chat_history = "\n".join([f"{msg[0]} {msg[1]}" if len(msg) > 1 else msg[0] for msg in history])
|
estructura.txt
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Primera parte (Generar la base de conocimiento):
|
| 2 |
+
-Partimos de una lista de URLs de vídeos de YouTube.
|
| 3 |
+
-Descargamos los videos (tmp).
|
| 4 |
+
-Extraer el texto de cada video (txt).
|
| 5 |
+
-Romper los textos, vectorizar y almacenar en la bbdd vectorial (Metadata: Texto, url, tema...).
|
| 6 |
+
|
| 7 |
+
Segunda parte (Generar el Router):
|
| 8 |
+
-Router que:
|
| 9 |
+
-Si la pregunta está relacionada:
|
| 10 |
+
-Entra en acción nuestro Agente*.
|
| 11 |
+
-Si no tiene nada que ver con la temática:
|
| 12 |
+
-Inicialmente cierra la conversación pero idealmente debería de hacer una búsqueda en Google.
|
| 13 |
+
|
| 14 |
+
Tercera parte (Generar el Agente)*:
|
| 15 |
+
-Localizar los fragmentos de transcripciones más relevantes.
|
| 16 |
+
-(Extra) Resume todos los fragmentos de texto relevantes.
|
| 17 |
+
-Realizar la consulta con el contexto.
|
| 18 |
+
-Devolver la respuesta.
|
| 19 |
+
|
| 20 |
+
Cuarta parte (Llevar un hilo de conversación persistente con el usuario):
|
| 21 |
+
-? IDs de usuario.
|
| 22 |
+
|
| 23 |
+
Quinta parte (Despliegue de la Interfaz):
|
| 24 |
+
-Definir lo que queremos que el usuario vea (estilo conversación).
|
| 25 |
+
-Implementarlo en Gradio.
|
| 26 |
+
-Alojar el servicio.
|
| 27 |
+
|
| 28 |
+
Sexta parte (Readme y presentación).
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
TODO:
|
| 32 |
+
webapp.py
|
| 33 |
+
-Hacer que el texto del input box se elimine tras pulsar enter.
|
| 34 |
+
|
| 35 |
+
router_chain.py
|
| 36 |
+
-Añadir al menos una cadena adicional al router
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
*PDFs, Libros...
|
knowledgebase.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#yt-dlp --write-subs --skip-download [youtube_url]
|
| 2 |
+
from pinecone import Pinecone
|
| 3 |
+
from pinecone import ServerlessSpec
|
| 4 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
| 5 |
+
import os
|
| 6 |
+
from dotenv import load_dotenv, find_dotenv
|
| 7 |
+
import torch
|
| 8 |
+
from sentence_transformers import SentenceTransformer
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
_ = load_dotenv(find_dotenv())
|
| 12 |
+
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
|
| 13 |
+
|
| 14 |
+
# Get youtube ids
|
| 15 |
+
def get_youtube_ids(route):
    """Read a text file of YouTube watch URLs and return their video ids.

    Each non-blank line is expected to be a URL carrying the video id in its
    ``v`` query parameter (``https://www.youtube.com/watch?v=<id>``). Blank
    lines are skipped and any additional query parameters (``&t=10s`` ...)
    are ignored, so they no longer end up inside the id.

    Args:
        route: Path of the text file, one URL per line.

    Returns:
        List of video ids, in file order.

    Raises:
        ValueError: If a non-blank line has no ``v`` query parameter.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import section.
    from urllib.parse import parse_qs, urlparse

    yt_ids = []
    with open(route, 'r', encoding='utf-8') as file:
        for line in file:
            url = line.strip()
            if not url:
                # Tolerate empty lines, e.g. a trailing newline at end of file.
                continue
            video_ids = parse_qs(urlparse(url).query).get('v')
            if not video_ids:
                raise ValueError(f"No video id found in URL: {url!r}")
            yt_ids.append(video_ids[0])
    return yt_ids
|
| 21 |
+
|
| 22 |
+
# Get transcriptions clean
|
| 23 |
+
def get_clean_transcriptions(yt_ids):
    """Fetch transcripts for the given video ids and flatten each to one string.

    Transcripts are requested with the language preference list
    ``['es','en']``. Only the first element of the value returned by
    ``get_transcripts`` is used; it is treated as a mapping from video id to a
    list of segments, each exposing a ``'text'`` entry.

    Args:
        yt_ids: Video ids to fetch.

    Returns:
        Dict mapping each video id to its segment texts joined by single
        spaces (an empty string when a video has no segments).
    """
    fetched = YouTubeTranscriptApi.get_transcripts(yt_ids, languages=['es','en'])
    segments_by_video = fetched[0]
    flattened = {}
    for video_id, segments in segments_by_video.items():
        flattened[video_id] = " ".join(segment['text'] for segment in segments)
    return flattened
|
| 26 |
+
|
| 27 |
+
# Create index
|
| 28 |
+
def create_index():
    """Open the "youtube-videos" Pinecone index, creating it when it is absent.

    The serverless cloud and region come from the ``PINECONE_CLOUD`` and
    ``PINECONE_REGION`` environment variables, falling back to ``'aws'`` and
    ``'us-east-1'`` when they are unset or empty.

    Returns:
        Tuple ``(pc, index)``: the Pinecone client and the index handle.
    """
    index_name = "youtube-videos"
    pc = Pinecone(api_key=PINECONE_API_KEY)
    spec = ServerlessSpec(
        cloud=os.environ.get('PINECONE_CLOUD') or 'aws',
        region=os.environ.get('PINECONE_REGION') or 'us-east-1',
    )
    existing_names = pc.list_indexes().names()
    if index_name not in existing_names:
        # Only created on first use; dimension 768 with the cosine metric.
        pc.create_index(index_name, dimension=768, metric="cosine", spec=spec)
    return pc, pc.Index(index_name)
|
| 40 |
+
|
| 41 |
+
# Load retriever model
|
| 42 |
+
def load_retriever():
    """Load the sentence-embedding model used to vectorise passages and queries.

    Returns:
        A ``SentenceTransformer`` for
        ``flax-sentence-embeddings/all_datasets_v3_mpnet-base``, placed on
        ``'cuda'`` when ``torch.cuda.is_available()`` and on ``'cpu'`` otherwise.
    """
    if torch.cuda.is_available():
        target_device = 'cuda'
    else:
        target_device = 'cpu'
    return SentenceTransformer(
        'flax-sentence-embeddings/all_datasets_v3_mpnet-base',
        device=target_device,
    )
|
| 48 |
+
|
| 49 |
+
# Create embeddings and upsert them into the index
|
| 50 |
+
def create_embeddings(dicc, index, retriever):
    """Embed every transcription passage and upsert it into the index.

    Each transcription is cut into consecutive 1000-character passages. Every
    passage is encoded with ``retriever`` and upserted on its own, with a
    running integer (as a string) as vector id and the video id and passage
    text as metadata. Index statistics are printed at the end.

    Unlike the previous version, nothing is upserted after the loops: that
    extra call re-sent the last vector and raised ``UnboundLocalError`` when
    ``dicc`` was empty or contained only empty transcriptions.

    Args:
        dicc: Mapping of video id to transcription text.
        index: Object exposing ``upsert(vectors=...)`` and
            ``describe_index_stats()``.
        retriever: Object whose ``encode(text, convert_to_tensor=True)``
            result exposes ``tolist()``.

    Returns:
        None.
    """
    # Passage id; ids restart at 0 on every call, so a second run writes to
    # the same vector ids as the first.
    p_id = 0
    # Iterate over transcriptions
    for yt_id, transcription in dicc.items():
        # Split the transcription into fixed-size passages
        passages = [transcription[i:i+1000] for i in range(0, len(transcription), 1000)]
        # For each passage, create an embedding and upsert it into the index
        for passage in tqdm(passages):
            emb = retriever.encode(passage, convert_to_tensor=True)
            meta = {'yt_id': yt_id, 'passage_text': passage}
            index.upsert(vectors=[(str(p_id), emb.tolist(), meta)])
            p_id += 1
    # check that we have all vectors in index
    print(index.describe_index_stats())
|
| 68 |
+
|
| 69 |
+
"""
|
| 70 |
+
# Obtenemos las ids de los vídeos
|
| 71 |
+
ls_ids = get_youtube_ids('./urls.txt')
|
| 72 |
+
|
| 73 |
+
# Obtenemos las transcripciones de los vídeos
|
| 74 |
+
d_trans = get_clean_transcriptions(ls_ids)
|
| 75 |
+
|
| 76 |
+
# Creo el index
|
| 77 |
+
pc, index = create_index()
|
| 78 |
+
|
| 79 |
+
# Load retriever model
|
| 80 |
+
retriever = load_retriever()
|
| 81 |
+
|
| 82 |
+
# Poblate the database
|
| 83 |
+
create_embeddings(d_trans, index, retriever)
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
+
|
requirements.txt
ADDED
|
File without changes
|
urls.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
https://www.youtube.com/watch?v=7nDyUry3esM
|
| 2 |
+
https://www.youtube.com/watch?v=sH9iFSeef-g
|
| 3 |
+
https://www.youtube.com/watch?v=bCy5zSWSKL8
|
| 4 |
+
https://www.youtube.com/watch?v=3CPzO9bHEOM
|
| 5 |
+
https://www.youtube.com/watch?v=spAraLH3N-4
|
| 6 |
+
https://www.youtube.com/watch?v=20UPUvLHKUY
|
| 7 |
+
https://www.youtube.com/watch?v=nDC2PqM4YpY
|
| 8 |
+
https://www.youtube.com/watch?v=QaiOb9I-ogA
|
| 9 |
+
https://www.youtube.com/watch?v=HJd0LnkR63o
|
utils.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from knowledgebase import create_index, load_retriever
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import requests
|
| 4 |
+
import serpapi
|
| 5 |
+
import os
|
| 6 |
+
import re
|
| 7 |
+
from transformers import BartTokenizer
|
| 8 |
+
from dotenv import load_dotenv, find_dotenv
|
| 9 |
+
load_dotenv(find_dotenv())
|
| 10 |
+
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
|
| 11 |
+
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
|
| 12 |
+
|
| 13 |
+
def query_pinecone(query, top_k, index, retriever):
    """Embed ``query`` and return the index's ``top_k`` matches with metadata.

    Args:
        query: Text to search for.
        top_k: Number of matches to request.
        index: Object exposing ``query(vector=..., top_k=..., include_metadata=...)``.
        retriever: Object whose ``encode(list, convert_to_tensor=True)`` result
            exposes ``tolist()``.

    Returns:
        Whatever ``index.query`` returns.
    """
    # The query is encoded as a one-element batch; keep its single row.
    encoded_batch = retriever.encode([query], convert_to_tensor=True)
    query_vector = encoded_batch.tolist()[0]
    return index.query(vector=query_vector, top_k=top_k, include_metadata=True)
|
| 19 |
+
|
| 20 |
+
def format_query(query, context):
    """Combine the user question with the retrieved passages into one prompt.

    Args:
        query: The user's question.
        context: Search result with a ``'matches'`` sequence whose items carry
            ``['metadata']['passage_text']``.

    Returns:
        The question followed by every passage text, each prefixed with
        ``<P>`` and separated by single spaces.
    """
    tagged_passages = (
        f"<P> {match['metadata']['passage_text']}" for match in context['matches']
    )
    joined_passages = " ".join(tagged_passages)
    # The labels are runtime prompt text and are kept in Spanish as-is.
    return f"Pregunta del usuario: {query} \n Contexto para responder a la pregunta del usuario: {joined_passages}"
|
| 26 |
+
|
| 27 |
+
# Index handle and embedding model, built on first use and reused afterwards
# so that each question does not reconnect to Pinecone and reload the model.
_RETRIEVAL_RESOURCES = {}


def _get_retrieval_resources():
    """Return the cached ``(index, retriever)`` pair, building it on first use."""
    if not _RETRIEVAL_RESOURCES:
        _, index = create_index()
        retriever = load_retriever()
        # Store both together so a failure above never leaves a half-filled cache.
        _RETRIEVAL_RESOURCES.update(index=index, retriever=retriever)
    return _RETRIEVAL_RESOURCES['index'], _RETRIEVAL_RESOURCES['retriever']


def get_question_context(query, top_k):
    """Return ``query`` enriched with the most relevant knowledge-base passages.

    Args:
        query: The user's question.
        top_k: Number of passages to retrieve from the index.

    Returns:
        The string built by ``format_query`` from the question and the
        matches returned by ``query_pinecone``.
    """
    index, retriever = _get_retrieval_resources()
    # search pinecone index for context passages with the answer
    context = query_pinecone(query, top_k, index, retriever)
    # format query with context passages
    return format_query(query, context)
|
| 37 |
+
|
| 38 |
+
# Función que realiza la búsqueda en Google y extrae el contenido relevante de la primera URL no patrocinada
|
| 39 |
+
def google_search_result(query):
    """Search Google for ``query`` and return a summary of the first organic hit.

    Steps: run the search through SerpApi, download the first organic
    result, collapse its text to single-spaced plain text, truncate it to at
    most 1000 BART tokens and ask the hosted ``facebook/bart-large-cnn``
    model for a summary.

    Changes from the previous version: both HTTP calls have a timeout, and
    the fallback message is returned (instead of raising) when the search has
    no organic results, the page cannot be downloaded, or the summarisation
    API answers with anything other than a list holding ``summary_text``
    (for example an ``{"error": ...}`` object).

    Args:
        query: The user's question, used verbatim as the search query.

    Returns:
        The summary text, or the fallback message when none could be obtained.
    """
    fallback = "No se ha podido obtener un resumen de la página"
    request_timeout = 30  # seconds, applied to each HTTP request

    # Make a Google search
    s = serpapi.search(q=query, engine="google", location="Madrid, Spain", hl="es", gl="es", api_key=SERPAPI_API_KEY)
    # Get the first non-ad URL
    try:
        url = s["organic_results"][0]["link"]
    except (KeyError, IndexError):
        return fallback

    # Download the page and extract its text
    try:
        page = requests.get(url, timeout=request_timeout)
    except requests.RequestException:
        return fallback
    soup = BeautifulSoup(page.text, 'html.parser')
    # \s+ already covers newlines, so one substitution normalises all whitespace.
    page_content = re.sub(r'\s+', ' ', soup.get_text())

    # Truncate to the token budget by encoding and decoding with the BART tokenizer
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    tokens = tokenizer.encode(page_content, truncation=True, max_length=1000)
    truncated_content = tokenizer.decode(tokens, skip_special_tokens=True)

    # Summarise the page content through the hosted inference API
    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
    headers = {"Authorization": "Bearer " + HUGGINGFACEHUB_API_TOKEN}
    try:
        response = requests.post(API_URL, headers=headers, json={"inputs": truncated_content}, timeout=request_timeout)
        payload = response.json()  # parsed once and reused below
    except (requests.RequestException, ValueError):
        return fallback

    # A successful answer is a non-empty list of dicts carrying 'summary_text'.
    if isinstance(payload, list) and payload and isinstance(payload[0], dict) and 'summary_text' in payload[0]:
        return payload[0]['summary_text']
    return fallback
|