longobardomartin committed on
Commit
d669120
·
1 Parent(s): 11e2a72
__pycache__/agent.cpython-312.pyc ADDED
Binary file (3.57 kB). View file
 
__pycache__/knowledabase.cpython-312.pyc ADDED
Binary file (3.94 kB). View file
 
__pycache__/knowledgebase.cpython-312.pyc ADDED
Binary file (3.91 kB). View file
 
__pycache__/utils.cpython-312.pyc ADDED
Binary file (3.44 kB). View file
 
agent.py ADDED
@@ -0,0 +1,76 @@
+from langchain.agents import Tool
+from langchain.agents import initialize_agent
+from langchain_openai import ChatOpenAI
+from langchain.chains.conversation.memory import ConversationBufferWindowMemory
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+from utils import get_question_context, google_search_result
+
+# Template for the tourism query
+turism_template = """You are a very experienced tourist guide specialised in recommending activities \
+and things to do in Marbella, a city located in Andalusia, Spain. \
+You have an excellent knowledge and understanding of restaurants, sports, activities, experiences and places to visit in the city, \
+specifically targeted at families, couples, friends and solo travellers. \
+You have the ability to think, reflect, debate, discuss and evaluate the data stored in a knowledge base built from YouTube videos about \
+tourism in Marbella, and the ability to use it to support your explanations to the future tourists who will visit the city and ask for your advice. \
+Remember: your answer must be accurate and based on your knowledge base. \
+Here is a question from a user: \
+{input}"""
+
+default_template = """You are a bot specialised in answering questions about a wide range of topics. \
+You are given the user's question and context from the first non-sponsored URL of a Google search. \
+If you don't know the answer, simply say "I don't know"; if you do, please answer the question precisely. \
+Here is a question from a user and a bit of context from Google Search: \
+{input}"""
+
+def get_turism_answer(input):
+    input = get_question_context(query=input, top_k=3)
+    llm_prompt = PromptTemplate.from_template(turism_template)
+    chain = LLMChain(llm=llm, prompt=llm_prompt)
+    answer = chain.run(input)
+    return answer
+
+def get_internet_answer(input):
+    context = google_search_result(input)
+    input = f"Pregunta del usuario: {input} \n Contexto para responder a la pregunta del usuario: {context}"
+    llm_prompt = PromptTemplate.from_template(default_template)
+    chain = LLMChain(llm=llm, prompt=llm_prompt)
+    answer = chain.run(input)
+    return answer
+
+tools = [
+    Tool(
+        name='Tourism knowledgebase tool',
+        func=get_turism_answer,
+        description='Use this tool when answering questions about tourism in Marbella.'
+    ),
+    Tool(
+        name='Default knowledgebase tool',
+        func=get_internet_answer,
+        description=(
+            'Use this tool when the input question is not related to tourism in Marbella.'
+        )
+    )
+]
+
+llm = ChatOpenAI(model='gpt-4', temperature=0)  # module-level, so the functions above resolve it at call time
+
+# Conversational memory: keep the last 5 exchanges
+conversational_memory = ConversationBufferWindowMemory(
+    memory_key='chat_history',
+    k=5,
+    return_messages=True
+)
+
+agent = initialize_agent(
+    agent='chat-conversational-react-description',
+    tools=tools,
+    llm=llm,
+    verbose=True,
+    max_iterations=3,
+    early_stopping_method='generate',
+    memory=conversational_memory
+)
+
+def call_agent(input):
+    return agent(input)['output']
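
For context, a minimal sketch of how call_agent is meant to be driven (assuming OPENAI_API_KEY, PINECONE_API_KEY and SERPAPI_API_KEY are set in a .env file, as app.py below does; the question string is illustrative):

    import os
    from dotenv import load_dotenv, find_dotenv
    load_dotenv(find_dotenv())  # load the API keys from .env

    from agent import call_agent

    # The agent routes this to the Marbella knowledge-base tool
    print(call_agent("¿Qué puedo hacer en Marbella con niños?"))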
app.py CHANGED
@@ -1,11 +1,19 @@
 import gradio as gr
+from agent import call_agent
+import os
+from dotenv import load_dotenv, find_dotenv
+_ = load_dotenv(find_dotenv())
+OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
+
 
 # Bot function that processes the user's message
 def chatbot(message, history=[]):
     # Append the user's message to the history
     history.append(("Usuario:", message))
+    # Query the OpenAI agent
+    response = call_agent(message)
     # Generate a simple bot response
-    response = f"Bot: Entendido, has dicho '{message}'"
+    response = f"Bot:'{response}'"
     history.append((response,))
     # Format the history as a single block of text
     chat_history = "\n".join([f"{msg[0]} {msg[1]}" if len(msg) > 1 else msg[0] for msg in history])
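
The hunk stops at the chat_history formatting, so the UI wiring is not shown in this diff. A minimal, illustrative sketch of how the chatbot function could be hooked into Gradio — the component choices and the assumption that chatbot returns chat_history are hypothetical, not part of this commit:

    import gradio as gr

    # Hypothetical wiring; the actual layout in app.py is outside this hunk.
    # chatbot(message) works because history defaults to [].
    demo = gr.Interface(
        fn=chatbot,
        inputs=gr.Textbox(label="Mensaje"),
        outputs=gr.Textbox(label="Conversación"),
    )
    demo.launch()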
estructura.txt ADDED
@@ -0,0 +1,39 @@
+Part one (build the knowledge base):
+-Start from a list of YouTube video URLs.
+-Download the videos (tmp).
+-Extract the text from each video (txt).
+-Split the texts, vectorise them and store them in the vector database (metadata: text, URL, topic...).
+
+Part two (build the router):
+-A router that:
+ -If the question is on-topic:
+  -Our agent* takes over.
+ -If it has nothing to do with the topic:
+  -Initially it closes the conversation, but ideally it should run a Google search (see the sketch after this file).
+
+Part three (build the agent)*:
+-Locate the most relevant transcript fragments.
+-(Extra) Summarise all the relevant text fragments.
+-Run the query with the context.
+-Return the answer.
+
+Part four (keep a persistent conversation thread with the user):
+-? User IDs.
+
+Part five (deploy the interface):
+-Define what we want the user to see (conversation style).
+-Implement it in Gradio.
+-Host the service.
+
+Part six (README and presentation).
+
+
+TODO:
+webapp.py
+-Make the input box text clear itself after pressing Enter.
+
+rotuer_chain.py
+-Add at least one more chain to the router.
+
+
+*PDFs, books...
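
Part two above describes the routing behaviour that agent.py ultimately delegates to the LangChain agent. As a minimal sketch of the routing idea only — the is_about_marbella check below is a naive stand-in, not part of this commit:

    from agent import get_turism_answer, get_internet_answer

    def is_about_marbella(question: str) -> bool:
        # Naive stand-in for the topic check; agent.py lets the LLM agent decide instead
        return 'marbella' in question.lower()

    def route(question: str) -> str:
        if is_about_marbella(question):
            return get_turism_answer(question)   # knowledge-base path
        return get_internet_answer(question)     # Google-search fallback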
knowledgebase.py ADDED
@@ -0,0 +1,86 @@
+#yt-dlp --write-subs --skip-download [youtube_url]
+from pinecone import Pinecone
+from pinecone import ServerlessSpec
+from youtube_transcript_api import YouTubeTranscriptApi
+import os
+from dotenv import load_dotenv, find_dotenv
+import torch
+from sentence_transformers import SentenceTransformer
+from tqdm import tqdm
+
+_ = load_dotenv(find_dotenv())
+PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
+
+# Read the YouTube video ids from a file of URLs
+def get_youtube_ids(route):
+    yt_ids = []
+    with open(route, 'r') as file:
+        for line in file:
+            yt_ids.append(line.split('=')[1].strip())
+    return yt_ids
+
+# Fetch the transcriptions and join each one into a single clean string
+def get_clean_transcriptions(yt_ids):
+    trans_bruto = YouTubeTranscriptApi.get_transcripts(yt_ids, languages=['es','en'])
+    return {k: " ".join(d['text'] for d in v) for k, v in trans_bruto[0].items()}
+
+# Create (or connect to) the Pinecone index
+def create_index():
+    pc = Pinecone(api_key=PINECONE_API_KEY)
+    cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
+    region = os.environ.get('PINECONE_REGION') or 'us-east-1'
+    spec = ServerlessSpec(cloud=cloud, region=region)
+    index_name = "youtube-videos"
+    if index_name not in pc.list_indexes().names():
+        # create the index if it does not exist
+        pc.create_index(index_name, dimension=768, metric="cosine", spec=spec)
+    # connect to the index we created
+    index = pc.Index(index_name)
+    return pc, index
+
+# Load the retriever model
+def load_retriever():
+    # set device to GPU if available
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    # load the retriever model from the HuggingFace model hub
+    retriever = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base', device=device)
+    return retriever
+
+# Create embeddings and upsert them into the index
+def create_embeddings(dicc, index, retriever):
+    # Passage id
+    p_id = 0
+    # Iterate over the transcriptions
+    for yt_id, transcription in dicc.items():
+        # Split the transcription into 1000-character passages
+        passages = [transcription[i:i+1000] for i in range(0, len(transcription), 1000)]
+        # For each passage, create an embedding and upsert it into the index
+        for passage in tqdm(passages):
+            emb = retriever.encode(passage, convert_to_tensor=True)
+            meta = {'yt_id': yt_id, 'passage_text': passage}
+            to_upsert = [(str(p_id), emb.tolist(), meta)]
+            _ = index.upsert(vectors=to_upsert)
+            p_id += 1
+    # check that we have all the vectors in the index
+    print(index.describe_index_stats())
+
+"""
+# Get the video ids
+ls_ids = get_youtube_ids('./urls.txt')
+
+# Get the video transcriptions
+d_trans = get_clean_transcriptions(ls_ids)
+
+# Create the index
+pc, index = create_index()
+
+# Load the retriever model
+retriever = load_retriever()
+
+# Populate the database
+create_embeddings(d_trans, index, retriever)
+"""
+
+
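
After ingestion, a quick round-trip query is a handy sanity check that the 768-dimensional mpnet embeddings land in, and come back from, the youtube-videos index. A minimal sketch using the commit's own helpers (the question string is illustrative):

    from knowledgebase import create_index, load_retriever

    _, index = create_index()
    retriever = load_retriever()

    # Embed a test question with the same model used for ingestion
    xq = retriever.encode(["¿Qué restaurantes recomiendas en Marbella?"]).tolist()[0]
    res = index.query(vector=xq, top_k=3, include_metadata=True)
    for match in res['matches']:
        print(match['score'], match['metadata']['yt_id'])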
requirements.txt ADDED
File without changes
urls.txt ADDED
@@ -0,0 +1,9 @@
+https://www.youtube.com/watch?v=7nDyUry3esM
+https://www.youtube.com/watch?v=sH9iFSeef-g
+https://www.youtube.com/watch?v=bCy5zSWSKL8
+https://www.youtube.com/watch?v=3CPzO9bHEOM
+https://www.youtube.com/watch?v=spAraLH3N-4
+https://www.youtube.com/watch?v=20UPUvLHKUY
+https://www.youtube.com/watch?v=nDC2PqM4YpY
+https://www.youtube.com/watch?v=QaiOb9I-ogA
+https://www.youtube.com/watch?v=HJd0LnkR63o
utils.py ADDED
@@ -0,0 +1,71 @@
+from knowledgebase import create_index, load_retriever
+from bs4 import BeautifulSoup
+import requests
+import serpapi
+import os
+import re
+from transformers import BartTokenizer
+from dotenv import load_dotenv, find_dotenv
+load_dotenv(find_dotenv())
+SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+
+def query_pinecone(query, top_k, index, retriever):
+    # generate embeddings for the query
+    xq = retriever.encode([query], convert_to_tensor=True).tolist()[0]
+    # search the pinecone index for context passages with the answer
+    xc = index.query(vector=xq, top_k=top_k, include_metadata=True)
+    return xc
+
+def format_query(query, context):
+    # extract passage_text from the Pinecone search result and add the <P> tag
+    context = " ".join([f"<P> {m['metadata']['passage_text']}" for m in context['matches']])
+    # concatenate the query and the context passages
+    query = f"Pregunta del usuario: {query} \n Contexto para responder a la pregunta del usuario: {context}"
+    return query
+
+def get_question_context(query, top_k):
+    # create (or connect to) the index
+    _, index = create_index()
+    # load the retriever model
+    retriever = load_retriever()
+    # search the pinecone index for context passages with the answer
+    context = query_pinecone(query, top_k, index, retriever)
+    # format the query with the context passages
+    query = format_query(query, context)
+    return query
+
+# Run a Google search and extract the relevant content from the first non-sponsored URL
+def google_search_result(query):
+    # Make a Google search
+    s = serpapi.search(q=query, engine="google", location="Madrid, Spain", hl="es", gl="es", api_key=SERPAPI_API_KEY)
+    # Get the first non-ad URL
+    url = s["organic_results"][0]["link"]
+
+    # Fetch the page
+    response = requests.get(url)
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Extract the raw text of the page
+    page_content = soup.get_text()
+
+    page_content = re.sub(r'\n+', ' ', page_content)
+    page_content = re.sub(r'\s+', ' ', page_content)
+
+    # Load the BART tokenizer
+    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+
+    # Tokenize the content, truncating it to the model's budget
+    tokens = tokenizer.encode(page_content, truncation=True, max_length=1000)
+
+    # Decode the tokens back into (possibly truncated) text
+    truncated_content = tokenizer.decode(tokens, skip_special_tokens=True)
+
+    # Summarise the page content with the HuggingFace Inference API
+    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"
+    # Set the API headers
+    headers = {"Authorization": "Bearer " + HUGGINGFACEHUB_API_TOKEN}
+    # Make a request to the API
+    response = requests.post(API_URL, headers=headers, json={"inputs": truncated_content})
+    # Get the summary text from the response
+    return response.json()[0]['summary_text'] if len(response.json()) > 0 else "No se ha podido obtener un resumen de la página"
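
As a usage note, google_search_result chains four steps: SerpAPI search, page scrape, BART-tokenizer truncation to 1000 tokens, and remote summarisation, while get_question_context covers the knowledge-base path. A quick, illustrative smoke test of both (requires SERPAPI_API_KEY and HUGGINGFACEHUB_API_TOKEN in .env; the queries are arbitrary):

    from utils import google_search_result, get_question_context

    # Fallback path: search Google and summarise the top organic hit
    print(google_search_result("mejores playas de Andalucía"))

    # Knowledge-base path: question plus top-3 transcript passages, ready for the LLM
    print(get_question_context("¿Dónde comer en Marbella?", top_k=3))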