longobardomartin committed on
Commit
d669120
·
1 Parent(s): 11e2a72
__pycache__/agent.cpython-312.pyc ADDED
Binary file (3.57 kB). View file
 
__pycache__/knowledabase.cpython-312.pyc ADDED
Binary file (3.94 kB). View file
 
__pycache__/knowledgebase.cpython-312.pyc ADDED
Binary file (3.91 kB). View file
 
__pycache__/utils.cpython-312.pyc ADDED
Binary file (3.44 kB). View file
 
agent.py ADDED
@@ -0,0 +1,76 @@
+from langchain.agents import Tool
+from langchain.agents import initialize_agent
+from langchain_openai import ChatOpenAI
+from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+from utils import get_question_context, google_search_result
+
+# Template for the tourism query
+turism_template = """You are a very experienced tourist guide specialised in recommending activities \
+and things to do in Marbella, a city located in Andalusia, Spain. \
+You have an excellent knowledge and understanding of restaurants, sports, activities, experiences and places to visit in the city, \
+specifically targeted at families, couples, friends and solo travellers. \
+You have the ability to think, reflect, debate, discuss and evaluate the data stored in a knowledge base built from YouTube videos about \
+tourism in Marbella, and the ability to use it to support your explanations to the future tourists who will visit the city and ask for your advice. \
+Remember: your answer must be accurate and based on your knowledge base. \
+Here is a question from a user: \
+{input}"""
+
+default_template = """You are a bot specialised in answering questions about a wide range of topics. \
+You are given the user's question and context from the first non-sponsored URL of a Google search. \
+If you don't know the answer, simply say "I don't know"; if you do, please answer the question precisely. \
+Here is a question from a user and a bit of context from Google Search: \
+{input}"""
+
+def get_turism_answer(input):
+    input = get_question_context(query=input, top_k=3)
+    llm_prompt = PromptTemplate.from_template(turism_template)
+    chain = LLMChain(llm=llm, prompt=llm_prompt)
+    answer = chain.run(input)
+    return answer
+
+def get_internet_answer(input):
+    context = google_search_result(input)
+    input = f"Pregunta del usuario: {input} \n Contexto para responder a la pregunta del usuario: {context}"
+    llm_prompt = PromptTemplate.from_template(default_template)
+    chain = LLMChain(llm=llm, prompt=llm_prompt)
+    answer = chain.run(input)
+    return answer
+
+tools = [
+    Tool(
+        name='Tourism knowledgebase tool',
+        func=get_turism_answer,
+        description='Use this tool when answering questions about tourism in Marbella.'
+    ),
+    Tool(
+        name='Default knowledgebase tool',
+        func=get_internet_answer,
+        description=(
+            'Use this tool when the input question is not related to tourism in Marbella.'
+        )
+    )
+]
+
+llm = ChatOpenAI(model='gpt-4', temperature=0)  # module-level, so the functions above resolve it at call time
+
+# Conversational memory: keep the last 5 exchanges
+conversational_memory = ConversationBufferWindowMemory(
+    memory_key='chat_history',
+    k=5,
+    return_messages=True
+)
+
+agent = initialize_agent(
+    agent='chat-conversational-react-description',
+    tools=tools,
+    llm=llm,
+    verbose=True,
+    max_iterations=3,
+    early_stopping_method='generate',
+    memory=conversational_memory
+)
+
+def call_agent(input):
+    return agent(input)['output']
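
For context, a minimal sketch of how call_agent is meant to be driven (assuming OPENAI_API_KEY, PINECONE_API_KEY and SERPAPI_API_KEY are set in a .env file, as app.py below does; the question string is illustrative):

    import os
    from dotenv import load_dotenv, find_dotenv
    load_dotenv(find_dotenv())  # load the API keys from .env

    from agent import call_agent

    # The agent routes this to the Marbella knowledge-base tool
    print(call_agent("¿Qué puedo hacer en Marbella con niños?"))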
app.py CHANGED
@@ -1,11 +1,19 @@
 import gradio as gr
+from agent import call_agent
+import os
+from dotenv import load_dotenv, find_dotenv
+_ = load_dotenv(find_dotenv())
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+
 
 # Bot function that processes the user's message
 def chatbot(message, history=[]):
     # Append the user's message to the history
     history.append(("Usuario:", message))
+    # Query the OpenAI agent
+    response = call_agent(message)
     # Generate a simple bot response
-    response = f"Bot: Entendido, has dicho '{message}'"
+    response = f"Bot:'{response}'"
     history.append((response,))
     # Format the history as a single block of text
     chat_history = "\n".join([f"{msg[0]} {msg[1]}" if len(msg) > 1 else msg[0] for msg in history])
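
The hunk stops at the chat_history formatting, so the UI wiring is not shown in this diff. A minimal, illustrative sketch of how the chatbot function could be hooked into Gradio — the component choices and the assumption that chatbot returns chat_history are hypothetical, not part of this commit:

    import gradio as gr

    # Hypothetical wiring; the actual layout in app.py is outside this hunk.
    # chatbot(message) works because history defaults to [].
    demo = gr.Interface(
        fn=chatbot,
        inputs=gr.Textbox(label="Mensaje"),
        outputs=gr.Textbox(label="Conversación"),
    )
    demo.launch()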
estructura.txt ADDED
@@ -0,0 +1,39 @@
+Part one (build the knowledge base):
+-Start from a list of YouTube video URLs.
+-Download the videos (tmp).
+-Extract the text from each video (txt).
+-Split the texts, vectorise them and store them in the vector database (metadata: text, URL, topic...).
+
+Part two (build the router):
+-A router that:
+ -If the question is on-topic:
+  -Our agent* takes over.
+ -If it has nothing to do with the topic:
+  -Initially it closes the conversation, but ideally it should run a Google search (see the sketch after this file).
+
+Part three (build the agent)*:
+-Locate the most relevant transcript fragments.
+-(Extra) Summarise all the relevant text fragments.
+-Run the query with the context.
+-Return the answer.
+
+Part four (keep a persistent conversation thread with the user):
+-? User IDs.
+
+Part five (deploy the interface):
+-Define what we want the user to see (conversation style).
+-Implement it in Gradio.
+-Host the service.
+
+Part six (README and presentation).
+
+
+TODO:
+webapp.py
+-Make the input box text clear itself after pressing Enter.
+
+rotuer_chain.py
+-Add at least one more chain to the router.
+
+
+*PDFs, books...
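
Part two above describes the routing behaviour that agent.py ultimately delegates to the LangChain agent. As a minimal sketch of the routing idea only — the is_about_marbella check below is a naive stand-in, not part of this commit:

    from agent import get_turism_answer, get_internet_answer

    def is_about_marbella(question: str) -> bool:
        # Naive stand-in for the topic check; agent.py lets the LLM agent decide instead
        return 'marbella' in question.lower()

    def route(question: str) -> str:
        if is_about_marbella(question):
            return get_turism_answer(question)   # knowledge-base path
        return get_internet_answer(question)     # Google-search fallback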
knowledgebase.py ADDED
@@ -0,0 +1,86 @@
+#yt-dlp --write-subs --skip-download [youtube_url]
+from pinecone import Pinecone
+from pinecone import ServerlessSpec
+from youtube_transcript_api import YouTubeTranscriptApi
+import os
+from dotenv import load_dotenv, find_dotenv
+import torch
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+
+_ = load_dotenv(find_dotenv())
+PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
+
+# Read the YouTube video ids from a file of URLs
+def get_youtube_ids(route):
+    yt_ids = []
+    with open(route, 'r') as file:
+        for line in file:
+            yt_ids.append(line.split('=')[1].strip())
+    return yt_ids
+
+# Fetch the transcriptions and join each one into a single clean string
+def get_clean_transcriptions(yt_ids):
+    trans_bruto = YouTubeTranscriptApi.get_transcripts(yt_ids, languages=['es','en'])
+    return {k: " ".join(d['text'] for d in v) for k, v in trans_bruto[0].items()}
+
+# Create (or connect to) the Pinecone index
+def create_index():
+    pc = Pinecone(api_key=PINECONE_API_KEY)
+    cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
+    region = os.environ.get('PINECONE_REGION') or 'us-east-1'
+    spec = ServerlessSpec(cloud=cloud, region=region)
+    index_name = "youtube-videos"
+    if index_name not in pc.list_indexes().names():
+        # create the index if it does not exist
+        pc.create_index(index_name, dimension=768, metric="cosine", spec=spec)
+    # connect to the index we created
+    index = pc.Index(index_name)
+    return pc, index
+
+# Load the retriever model
+def load_retriever():
+    # set device to GPU if available
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # load the retriever model from the HuggingFace model hub
+    retriever = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base', device=device)
+    return retriever
+
+# Create embeddings and upsert them into the index
+def create_embeddings(dicc, index, retriever):
+    # Passage id
+    p_id = 0
+    # Iterate over the transcriptions
+    for yt_id, transcription in dicc.items():
+        # Split the transcription into 1000-character passages
+        passages = [transcription[i:i+1000] for i in range(0, len(transcription), 1000)]
+        # For each passage, create an embedding and upsert it into the index
+        for passage in tqdm(passages):
+            emb = retriever.encode(passage, convert_to_tensor=True)
+            meta = {'yt_id': yt_id, 'passage_text': passage}
+            to_upsert = [(str(p_id), emb.tolist(), meta)]
+            _ = index.upsert(vectors=to_upsert)
+            p_id += 1
+    # check that we have all the vectors in the index
+    print(index.describe_index_stats())
+
+"""
+# Get the video ids
+ls_ids = get_youtube_ids('./urls.txt')
+
+# Get the video transcriptions
+d_trans = get_clean_transcriptions(ls_ids)
+
+# Create the index
+pc, index = create_index()
+
+# Load the retriever model
+retriever = load_retriever()
+
+# Populate the database
+create_embeddings(d_trans, index, retriever)
+"""
+
+
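
After ingestion, a quick round-trip query is a handy sanity check that the 768-dimensional mpnet embeddings land in, and come back from, the youtube-videos index. A minimal sketch using the commit's own helpers (the question string is illustrative):

    from knowledgebase import create_index, load_retriever

    _, index = create_index()
    retriever = load_retriever()

    # Embed a test question with the same model used for ingestion
    xq = retriever.encode(["¿Qué restaurantes recomiendas en Marbella?"]).tolist()[0]
    res = index.query(vector=xq, top_k=3, include_metadata=True)
    for match in res['matches']:
        print(match['score'], match['metadata']['yt_id'])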
requirements.txt ADDED
File without changes
urls.txt ADDED
@@ -0,0 +1,9 @@
+https://www.youtube.com/watch?v=7nDyUry3esM
+https://www.youtube.com/watch?v=sH9iFSeef-g
+https://www.youtube.com/watch?v=bCy5zSWSKL8
+https://www.youtube.com/watch?v=3CPzO9bHEOM
+https://www.youtube.com/watch?v=spAraLH3N-4
+https://www.youtube.com/watch?v=20UPUvLHKUY
+https://www.youtube.com/watch?v=nDC2PqM4YpY
+https://www.youtube.com/watch?v=QaiOb9I-ogA
+https://www.youtube.com/watch?v=HJd0LnkR63o
utils.py ADDED
@@ -0,0 +1,71 @@
+from knowledgebase import create_index, load_retriever
+from bs4 import BeautifulSoup
+import requests
+import serpapi
+import os
+import re
+from transformers import BartTokenizer
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+def query_pinecone(query, top_k, index, retriever):
+    # generate embeddings for the query
+    xq = retriever.encode([query], convert_to_tensor=True).tolist()[0]
+    # search the pinecone index for context passages with the answer
+    xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
+    return xc
+
+def format_query(query, context):
+    # extract passage_text from the Pinecone search result and add the <P> tag
+    context = " ".join([f"<P> {m['metadata']['passage_text']}" for m in context['matches']])
+    # concatenate the query and the context passages
+    query = f"Pregunta del usuario: {query} \n Contexto para responder a la pregunta del usuario: {context}"
+    return query
+
+def get_question_context(query, top_k):
+    # create (or connect to) the index
+    _, index = create_index()
+    # load the retriever model
+    retriever = load_retriever()
+    # search the pinecone index for context passages with the answer
+    context = query_pinecone(query, top_k, index, retriever)
+    # format the query with the context passages
+    query = format_query(query, context)
+    return query
+
+# Run a Google search and extract the relevant content from the first non-sponsored URL
+def google_search_result(query):
+    # Make a Google search
+    s = serpapi.search(q=query, engine="google", location="Madrid, Spain", hl="es", gl="es", api_key=SERPAPI_API_KEY)
+    # Get the first non-ad URL
+    url = s["organic_results"][0]["link"]
+
+    # Fetch the page
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Extract the raw text of the page
+    page_content = soup.get_text()
+
+    page_content = re.sub(r'\n+', ' ', page_content)
+    page_content = re.sub(r'\s+', ' ', page_content)
+
+    # Load the BART tokenizer
+    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+
+    # Tokenize the content, truncating it to the model's budget
+    tokens = tokenizer.encode(page_content, truncation=True, max_length=1000)
+
+    # Decode the tokens back into (possibly truncated) text
+    truncated_content = tokenizer.decode(tokens, skip_special_tokens=True)
+
+    # Summarise the page content with the HuggingFace Inference API
+    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
+    # Set the API headers
+    headers = {"Authorization": "Bearer " + HUGGINGFACEHUB_API_TOKEN}
+    # Make a request to the API
+    response = requests.post(API_URL, headers=headers, json={"inputs": truncated_content})
+    # Get the summary text from the response
+    return response.json()[0]['summary_text'] if len(response.json()) > 0 else "No se ha podido obtener un resumen de la página"
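
As a usage note, google_search_result chains four steps: SerpAPI search, page scrape, BART-tokenizer truncation to 1000 tokens, and remote summarisation, while get_question_context covers the knowledge-base path. A quick, illustrative smoke test of both (requires SERPAPI_API_KEY and HUGGINGFACEHUB_API_TOKEN in .env; the queries are arbitrary):

    from utils import google_search_result, get_question_context

    # Fallback path: search Google and summarise the top organic hit
    print(google_search_result("mejores playas de Andalucía"))

    # Knowledge-base path: question plus top-3 transcript passages, ready for the LLM
    print(get_question_context("¿Dónde comer en Marbella?", top_k=3))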