import os
import pickle
import re
from typing import Any, Callable, List, Optional, Tuple, Union

import faiss
import langchain
from pydantic import BaseModel, Field

from langchain import HuggingFaceHub, PromptTemplate
from langchain.agents import (
    AgentExecutor,
    AgentOutputParser,
    AgentType,
    BaseMultiActionAgent,
    LLMSingleActionAgent,
    Tool,
    initialize_agent,
)
from langchain.cache import InMemoryCache
from langchain.chains import ConversationalRetrievalChain, LLMChain, create_tagging_chain
from langchain.chat_models import ChatGooglePalm, ChatOpenAI
from langchain.document_loaders import DirectoryLoader, JSONLoader, TextLoader, UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferWindowMemory
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import StringPromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.schema import AgentAction, AgentFinish
from langchain.text_splitter import CharacterTextSplitter
from langchain.tools import StructuredTool
from langchain.tools.json.tool import JsonSpec

from custom_faiss import MyFAISS

class ToolArgsSchema(BaseModel):
    student_name: Optional[str] = Field(description="The name of the student")
    question: str = Field(description="The question being asked")
    question_type: str = Field(description="The type of question being asked")
    interest: Optional[str] = Field(description="The interest of the student")

    class Config:
        schema_extra = {
            "required": ["question", "question_type"]
        }
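# Illustrative (hypothetical) JSON object that the PydanticOutputParser used in
# get_question_type() below instructs the LLM to emit for this schema:
# {"student_name": "Alice", "question": "What did Alice score?",
#  "question_type": "grade-based", "interest": null}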





langchain.llm_cache = InMemoryCache()

model_name = "GPT-4"

pickle_file = "_vs.pkl"
index_file = "_vs.index"
models_folder = "models/"
os.environ["LANGCHAIN_TRACING"] = "true"
discussions_file_path = "discussion_entries.json"
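# Assumes OPENAI_API_KEY is set in the environment (and HUGGINGFACEHUB_API_TOKEN
# when the Hugging Face embeddings path is selected).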

llm = OpenAI(model_name="gpt-3.5-turbo-16k", temperature=0, verbose=True)

embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')

chat_history = []

memory = ConversationBufferWindowMemory(memory_key="chat_history", k=10)

vectorstore_index = None

agent_prompt = """
I am the LLM AI Canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students, based on the rubric below, for the Canvas discussion on the topic "8 nouns". ALWAYS return the total score when it is a grade-based question.
Interest-based questions are about the interests or skills of a certain student or a group of students, based on their discussion posts.
You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what type of question this is
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
{agent_scratchpad}
"""

# Set up a prompt template
class CustomPromptTemplate(StringPromptTemplate):
    # The template to use
    template: str
    # Callable that returns the list of tools available for a given input
    tools_getter: Callable

    def format(self, **kwargs) -> str:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Look up which tools should be exposed for this input
        tools = self.tools_getter(kwargs["input"])
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            [f"{tool.name}: {tool.description}" for tool in tools]
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in tools])
        return self.template.format(**kwargs)
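
# A minimal usage sketch (hypothetical; this template is only exercised by the
# commented-out agent code at the bottom of this file). Note the field is
# tools_getter, a callable, not a plain tools list:
#
#     prompt = CustomPromptTemplate(
#         template=agent_prompt,
#         tools_getter=lambda query: tools,  # e.g. always expose every tool
#         input_variables=["input", "intermediate_steps"],
#     )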

class CustomOutputParser(AgentOutputParser):

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        print("llm_output")
        print(llm_output)
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output)
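
# Illustrative (hypothetical) LLM outputs this parser handles, following the
# ReAct format in agent_prompt above:
#   "Thought: ...\nAction: Grade\nAction Input: What did Alice score?"
#       -> AgentAction(tool="Grade", tool_input="What did Alice score?")
#   "Thought: ...\nFinal Answer: Alice scored 6 points."
#       -> AgentFinish(return_values={"output": "Alice scored 6 points."})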

system_template = """
I am the LLM AI Canvas discussion grading assistant.
I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students, based on the rubric below, for the Canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students, based on their discussion posts.
To grade student discussions, I will follow the rubric below.

Student Post

3 points: Post includes 8 nouns and text describing how these nouns relate to the student.
2 points: Student's post includes 8 nouns but does not offer how those nouns relate to the student.
1 point: Student's post has significant missing details.
0 points: The student does not provide an initial post, or otherwise does not follow assignment instructions.


Response to Others

3 points: Student responds to at least 3 other student discussion threads AND responds to questions asked of them. Student posts insightful comments that prompt on-target discussion. These posts also avoid throwaway comments such as "I agree", "Me too", or "Good idea".
2 points: Student was notably lacking in one criterion.
1 point: Student was notably lacking in two criteria.
0 points: The student does not interact in the threads of other students.
I will be able to identify each student by name, and I will be able to share their likes, interests, and other characteristics. I will also be able to filter students based on their interests.

I will not deviate from the grading scheme. I will grade each discussion entry and reply carefully, and I will share the grades of all individuals by name on the basis of the rubric. I will ALWAYS return the total score when it is a grade-based question.

The discussions and their replies are in the following format:
Student Post: Student Name
Reply to: Another Student Discussion ID

Your answer to grade-based questions should be in the following format:
Student Post: X points
Response to Others: X points
Total: X points 

The following are the relevant discussions to grade or to answer the interest-based questions
----------------
Discussions: 
{context}"""

messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
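# ConversationalRetrievalChain fills {context} with the retrieved discussion
# documents and {question} with the (possibly rephrased) user question.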


def set_model_and_embeddings():
    global chat_history
    # set_model(model)
    # set_embeddings(model)
    chat_history = []

def set_embeddings(model):
    global embeddings
    if model == "GPT-3.5" or model == "GPT-4":
        print("Loading OpenAI embeddings")
        embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
    elif model == "Flan UL2" or model == "Flan T5":
        print("Loading Hugging Face embeddings")
        embeddings = HuggingFaceHubEmbeddings(repo_id="sentence-transformers/all-MiniLM-L6-v2")


def get_search_index():
    global vectorstore_index, model_name
    if os.path.isfile(get_file_path(model_name, pickle_file)) and os.path.isfile(
            get_file_path(model_name, index_file)) and os.path.getsize(get_file_path(model_name, pickle_file)) > 0:
        # Load index from pickle file
        with open(get_file_path(model_name, pickle_file), "rb") as f:
            # search_index = Chroma(persist_directory=models_folder, embedding_function=embeddings)
            search_index = pickle.load(f)
            print("Loaded index")
    else:
        search_index = create_index(model_name)
        print("Created index")

    vectorstore_index = search_index
    return search_index


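# Note: the vectorstore is persisted twice. faiss.write_index saves only the
# raw FAISS index, while the pickle captures the whole MyFAISS object (docstore
# and id mappings included); get_search_index() reads back only the pickle.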
def create_index(model):
    source_chunks = create_chunk_documents()
    search_index = search_index_from_docs(source_chunks)
    # search_index.persist()
    faiss.write_index(search_index.index, get_file_path(model, index_file))
    # Save index to pickle file
    with open(get_file_path(model, pickle_file), "wb") as f:
        pickle.dump(search_index, f)
    return search_index


def get_file_path(model, file):
    # If model is GPT3.5 or GPT4 return models_folder + openai + file else return models_folder + hf + file
    if model == "GPT-3.5" or model == "GPT-4":
        return models_folder + "openai" + file
    else:
        return models_folder + "hf" + file
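# e.g. get_file_path("GPT-4", "_vs.pkl") -> "models/openai_vs.pkl"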


def search_index_from_docs(source_chunks):
    # print("source chunks: " + str(len(source_chunks)))
    # print("embeddings: " + str(embeddings))

    search_index = MyFAISS.from_documents(source_chunks, embeddings)
    return search_index


def get_html_files():
    loader = DirectoryLoader('docs', glob="**/*.html", loader_cls=UnstructuredHTMLLoader, recursive=True)
    document_list = loader.load()
    for document in document_list:
        document.metadata["name"] = document.metadata["source"].split("/")[-1].split(".")[0]
    return document_list

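# JSONLoader calls metadata_func once per record, letting us copy the student's
# "name" from the source JSON into each Document's metadata; that field later
# drives the retriever's metadata filter for grade-based questions.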
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["name"] = record.get("name")
    return metadata


def get_json_file():
    global discussions_file_path
    loader = JSONLoader(
        file_path=discussions_file_path,
        jq_schema='.[]', metadata_func=metadata_func, content_key="message")
    return loader.load()


def fetch_data_for_embeddings():
    # document_list = get_text_files()
    document_list = get_html_files()
    # document_list = get_json_file()
    print("document list: " + str(len(document_list)))
    return document_list


def get_text_files():
    loader = DirectoryLoader('docs', glob="**/*.txt", loader_cls=TextLoader, recursive=True)
    document_list = loader.load()
    return document_list


def create_chunk_documents():
    sources = fetch_data_for_embeddings()

    splitter = CharacterTextSplitter(separator=" ", chunk_size=800, chunk_overlap=0)

    source_chunks = splitter.split_documents(sources)

    print("chunks: " + str(len(source_chunks)))

    return source_chunks


def get_qa_chain(vectorstore_index, question, metadata):
    global llm, model_name
    print(llm)
    filter_dict = {"name": metadata.student_name}
    # embeddings_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.76)
    # compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=gpt_3_5_index.as_retriever())
    retriever = get_retriever(filter_dict, vectorstore_index, metadata)

    print(retriever.get_relevant_documents(question))

    chain = ConversationalRetrievalChain.from_llm(llm, retriever, return_source_documents=True,
                                                  verbose=True, get_chat_history=get_chat_history,
                                                  combine_docs_chain_kwargs={"prompt": CHAT_PROMPT})
    return chain


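# Note: in LangChain's MMR search, lambda_mult=1 corresponds to minimum
# diversity, so both branches effectively rank the fetch_k=20 candidates by
# plain similarity and return the top k=10; the grade-based branch additionally
# restricts candidates to the named student's documents via the metadata filter.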
def get_retriever(filter_dict, vectorstore_index, metadata):
    if metadata.question_type == "grade-based":
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10,
                                                                  'filter': filter_dict})

    else:
        retriever = vectorstore_index.as_retriever(search_type='mmr',
                                                   search_kwargs={'lambda_mult': 1, 'fetch_k': 20, 'k': 10})

    return retriever


def get_chat_history(inputs) -> str:
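    # e.g. [("hi", "hello")] -> "Human:hi\nAI:hello"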
    res = []
    for human, ai in inputs:
        res.append(f"Human:{human}\nAI:{ai}")
    return "\n".join(res)


def generate_answer(question, metadata: ToolArgsSchema) -> str:
    # print("filter: " + filter)
    global chat_history, vectorstore_index
    chain = get_qa_chain(vectorstore_index, question, metadata)

    result = chain(
        {"question": question, "chat_history": chat_history})
    chat_history.extend([(question, result["answer"])])
    sources = []
    print(result)

    for document in result['source_documents']:
        source = document.metadata['source']
        sources.append(source.split('/')[-1].split('.')[0])
        print(sources)

    source = ',\n'.join(set(sources))
    # return result['answer'] + '\nSOURCES: ' + source
    return result['answer']


def get_question_type(question):
    parser = PydanticOutputParser(pydantic_object=ToolArgsSchema)
    prompt_template = """I can answer two types of questions: grade-based questions and interest-based questions.
Grade-based questions are about the grades of a certain student or a group of students, based on the rubric for the Canvas discussion on the topic "8 nouns".
Interest-based questions are about the interests or skills of a certain student or a group of students, based on their discussion posts.
Question: {question}
Find the following information about the question asked. Leave optional fields empty if the information is not available.
Format instructions: {format_instructions}"""

    llm = OpenAI(temperature=0)
    prompt = PromptTemplate(template=prompt_template, input_variables=["question"], output_parser=parser, partial_variables={"format_instructions": parser.get_format_instructions()})
    llm_chain = LLMChain(llm=llm, prompt=prompt)
    output = llm_chain.run(question)
    metadata = parser.parse(output)
    # The question is now classified; answer it directly using the parsed metadata.
    return generate_answer(question, metadata)











# class FakeAgent(BaseMultiActionAgent):
#     """Fake Custom Agent."""
#
#     @property
#     def input_keys(self):
#         return ["input"]
#
#     def plan(
#             self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         print("input keys")
#         print(self.input_keys)
#         print("intermediate steps")
#         print(intermediate_steps)
#         print("kwargs")
#         print(kwargs)
#
#         """Given input, decided what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Action specifying what tool to use.
#         """
#         if len(intermediate_steps) == 0:
#             first_action = AgentAction(tool="question type", tool_input=kwargs["input"], log="")
#             print("first action")
#             print(first_action)
#             second_action = AgentAction(tool="Grade",tool_input=kwargs["input"], log="")
#             print("second action")
#             print(second_action)
#             return [
#                 first_action,
#                 second_action,
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#     async def aplan(
#             self, intermediate_steps: List[Tuple[AgentAction, str]], **kwargs: Any
#     ) -> Union[List[AgentAction], AgentFinish]:
#         """Given input, decided what to do.
#
#         Args:
#             intermediate_steps: Steps the LLM has taken to date,
#                 along with observations
#             **kwargs: User inputs.
#
#         Returns:
#             Action specifying what tool to use.
#         """
#         if len(intermediate_steps) == 0:
#             return [
#                 AgentAction(tool="question type", tool_input=kwargs["input"], log=""),
#                 AgentAction(tool="Grade",
#                             tool_input={
#                                 "student_name": kwargs["student_name"],
#                                 "question": kwargs["question"],
#                                 "question_type": kwargs["question_type"],
#                                 "interest": kwargs["interest"]
#                             }, log=""),
#             ]
#         else:
#             return AgentFinish(return_values={"output": "bar"}, log="")
#
#
# schema = {
#     "properties": {
#         "student_name" : {"type": "string", "description": "The name of the student"},
#         "question": {"type": "string", "description": "The question being asked"},
#         "question type" : {"type": "string",
#                            "enum": ["student grades", "student specific", "interest specific"],
#                            "description": "The type of question being asked"},
#         "interest" : {"type": "string", "description": "The interest of the student"},
#     },
#     "required": ["question", "question type"]
# }





# def get_tagging_chain(question)-> str:
#     global schema
#     chain = create_tagging_chain(schema, llm)
#     first_answer = chain.run(question)
#     print("first answer:")
#     print(first_answer)
#     return first_answer
#
#
# def get_grading_agent():
#
#     tools = [
#         Tool(
#             name="question type",
#             func=get_tagging_chain,
#             description="Useful when you need to understand the type of the input."
#         ),
#         StructuredTool(
#             name="Grade",
#             func=generate_answer,
#             description="Useful when you need to answer questions about students, grades, interests, etc from the context of canvas discussion posts. If the question is student specific, student name is required.",
#             args_schema=ToolArgsSchema
#         )
#     ]
#     # agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
#
#     agent = FakeAgent(output_parser=CustomOutputParser())
#     # prompt = CustomPromptTemplate(template=agent_prompt, tools=tools, input_variables=["input", "intermediate_steps"])
#     # output_parser = CustomOutputParser()
#     # tool_names = [tool.name for tool in tools]
#     # llm_chain = LLMChain(llm=llm, prompt=prompt)
#     # agent = LLMSingleActionAgent(
#     #     llm_chain=llm_chain,
#     #     output_parser=output_parser,
#     #     stop=["\nObservation:"],
#     #     allowed_tools=tool_names,
#     # )
#     agent_executor = AgentExecutor.from_agent_and_tools(
#         agent=agent, tools=tools, verbose=True
#     )
#
#     # return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=True)
#     return agent_executor
#
#
#
# def grade_answer(question) -> str:
#     global chat_history, vectorstore_index
#     agent = get_grading_agent()
#     return agent.run(question)
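

# A minimal usage sketch (assumptions: OPENAI_API_KEY is set and docs/ contains
# the exported discussion HTML; "Alice" is a hypothetical student name).
if __name__ == "__main__":
    get_search_index()  # builds or loads the FAISS index into vectorstore_index
    print(get_question_type("What did Alice score on the 8 nouns discussion?"))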