hf-qa-demo

Runtime error

App Files Files Community

KonradSzafer committed on Feb 15, 2024

Commit

d22d549

1 Parent(s): 7a13d67

refactor update

Browse files

Files changed (9) hide show

data/datasets/hf_repositories_urls_scraped.json +92 -70
data/hugging_face_docs_dataset.py +2 -1
data/{indexer.ipynb → index.ipynb} +60 -6
data/{indexing_benchmark.ipynb → index_benchmark.ipynb} +0 -0
data/{scrapers → stackoverflow_scrapers}/stack_overflow_scraper.py +0 -0
data/{stackoverflow_python_dataset.py → stackoverflow_scrapers/stackoverflow_python_dataset.py} +0 -0
data/{upload_csv_dataset.py → stackoverflow_scrapers/upload_csv_dataset.py} +0 -0
qa_engine/config.py +1 -1
qa_engine/qa_engine.py +24 -20

data/datasets/hf_repositories_urls_scraped.json CHANGED Viewed

@@ -1,91 +1,113 @@
 {
     "urls": [
-        "https://github.com/huggingface/tokenizers",
-        "https://github.com/huggingface/datablations",
-        "https://github.com/huggingface/peft",
-        "https://github.com/huggingface/tflite-android-transformers",
-        "https://github.com/huggingface/simulate",
-        "https://github.com/huggingface/transformers",
-        "https://github.com/huggingface/deep-rl-class",
-        "https://github.com/huggingface/awesome-huggingface",
-        "https://github.com/huggingface/datasets-server",
-        "https://github.com/huggingface/setfit",
-        "https://github.com/huggingface/olm-training",
-        "https://github.com/huggingface/huggingface_sb3",
-        "https://github.com/huggingface/optimum-neuron",
-        "https://github.com/huggingface/blog",
-        "https://github.com/huggingface/100-times-faster-nlp",
-        "https://github.com/huggingface/bloom-jax-inference",
-        "https://github.com/huggingface/speechbox",
-        "https://github.com/huggingface/olm-datasets",
-        "https://github.com/huggingface/hub-docs",
-        "https://github.com/huggingface/torchMoji",
-        "https://github.com/huggingface/hffs",
         "https://github.com/huggingface/trl",
-        "https://github.com/huggingface/text-generation-inference",
-        "https://github.com/huggingface/Mongoku",
         "https://github.com/huggingface/education-toolkit",
-        "https://github.com/huggingface/datasets",
-        "https://github.com/huggingface/optimum-benchmark",
-        "https://github.com/huggingface/course",
-        "https://github.com/huggingface/accelerate",
-        "https://github.com/huggingface/pytorch-image-models",
-        "https://github.com/huggingface/fuego",
-        "https://github.com/huggingface/diffusion-models-class",
-        "https://github.com/huggingface/disaggregators",
-        "https://github.com/huggingface/unity-api",
-        "https://github.com/huggingface/workshops",
-        "https://github.com/huggingface/llm-ls",
         "https://github.com/huggingface/llm-vscode",
-        "https://github.com/huggingface/community-events",
-        "https://github.com/huggingface/tune",
-        "https://github.com/huggingface/candle",
         "https://github.com/huggingface/paper-style-guide",
-        "https://github.com/huggingface/huggingface.js",
-        "https://github.com/huggingface/neuralcoref",
-        "https://github.com/huggingface/hfapi",
-        "https://github.com/huggingface/data-measurements-tool",
-        "https://github.com/huggingface/personas",
-        "https://github.com/huggingface/instruction-tuned-sd",
-        "https://github.com/huggingface/swift-transformers",
-        "https://github.com/huggingface/api-inference-community",
-        "https://github.com/huggingface/diffusers",
         "https://github.com/huggingface/safetensors",
-        "https://github.com/huggingface/optimum-graphcore",
-        "https://github.com/huggingface/OBELICS",
-        "https://github.com/huggingface/swift-coreml-diffusers",
         "https://github.com/huggingface/naacl_transfer_learning_tutorial",
         "https://github.com/huggingface/nn_pruning",
         "https://github.com/huggingface/awesome-papers",
-        "https://github.com/huggingface/optimum-intel",
-        "https://github.com/huggingface/autotrain-advanced",
-        "https://github.com/huggingface/pytorch-openai-transformer-lm",
-        "https://github.com/huggingface/node-question-answering",
         "https://github.com/huggingface/optimum",
-        "https://github.com/huggingface/knockknock",
-        "https://github.com/huggingface/optimum-habana",
-        "https://github.com/huggingface/transfer-learning-conv-ai",
-        "https://github.com/huggingface/notebooks",
-        "https://github.com/huggingface/hmtl",
-        "https://github.com/huggingface/block_movement_pruning",
-        "https://github.com/huggingface/huggingface_hub",
         "https://github.com/huggingface/transformers-bloom-inference",
-        "https://github.com/huggingface/hf_transfer",
-        "https://github.com/huggingface/doc-builder",
         "https://github.com/huggingface/large_language_model_training_playbook",
         "https://github.com/huggingface/that_is_good_data",
-        "https://github.com/huggingface/swift-coreml-transformers",
         "https://github.com/huggingface/datasets-viewer",
-        "https://github.com/huggingface/open-muse",
         "https://github.com/huggingface/evaluate",
-        "https://github.com/huggingface/llm_training_handbook",
-        "https://github.com/huggingface/pytorch_block_sparse",
         "https://github.com/huggingface/chat-ui",
-        "https://github.com/huggingface/llm.nvim",
-        "https://github.com/huggingface/swift-chat",
-        "https://github.com/huggingface/pytorch-pretrained-BigGAN",
         "https://github.com/huggingface/exporters",
-        "https://github.com/huggingface/audio-transformers-course",
         "https://github.com/huggingface/hf-endpoints-documentation",
         "https://github.com/gradio-app/gradio"
     ]

 {
     "urls": [
         "https://github.com/huggingface/trl",
+        "https://github.com/huggingface/bert-syntax",
+        "https://github.com/huggingface/pytorch_block_sparse",
         "https://github.com/huggingface/education-toolkit",
+        "https://github.com/huggingface/diffusion-fast",
+        "https://github.com/huggingface/swift-transformers",
+        "https://github.com/huggingface/llm_training_handbook",
+        "https://github.com/huggingface/awesome-huggingface",
+        "https://github.com/huggingface/m4-logs",
         "https://github.com/huggingface/llm-vscode",
+        "https://github.com/huggingface/huggingface_sb3",
+        "https://github.com/huggingface/audio-transformers-course",
+        "https://github.com/huggingface/huggingface_hub",
+        "https://github.com/huggingface/swift-chat",
+        "https://github.com/huggingface/swift-coreml-transformers",
+        "https://github.com/huggingface/notebooks",
+        "https://github.com/huggingface/datasets-server",
+        "https://github.com/huggingface/adversarialnlp",
+        "https://github.com/huggingface/alignment-handbook",
+        "https://github.com/huggingface/workshops",
+        "https://github.com/huggingface/torchMoji",
         "https://github.com/huggingface/paper-style-guide",
+        "https://github.com/huggingface/optimum-intel",
         "https://github.com/huggingface/safetensors",
+        "https://github.com/huggingface/accelerate",
         "https://github.com/huggingface/naacl_transfer_learning_tutorial",
+        "https://github.com/huggingface/hfapi",
+        "https://github.com/huggingface/optimum-neuron",
+        "https://github.com/huggingface/simulate",
+        "https://github.com/huggingface/unity-api",
+        "https://github.com/huggingface/instruction-tuned-sd",
+        "https://github.com/huggingface/disaggregators",
+        "https://github.com/huggingface/personas",
+        "https://github.com/huggingface/pytorch-openai-transformer-lm",
+        "https://github.com/huggingface/llm-ls",
         "https://github.com/huggingface/nn_pruning",
+        "https://github.com/huggingface/speechbox",
+        "https://github.com/huggingface/community-events",
+        "https://github.com/huggingface/tflite-android-transformers",
+        "https://github.com/huggingface/neuralcoref-viz",
+        "https://github.com/huggingface/amused",
         "https://github.com/huggingface/awesome-papers",
         "https://github.com/huggingface/optimum",
         "https://github.com/huggingface/transformers-bloom-inference",
+        "https://github.com/huggingface/open-muse",
+        "https://github.com/huggingface/pytorch-image-models",
+        "https://github.com/huggingface/olm-datasets",
+        "https://github.com/huggingface/datablations",
         "https://github.com/huggingface/large_language_model_training_playbook",
+        "https://github.com/huggingface/candle",
+        "https://github.com/huggingface/hf-hub",
+        "https://github.com/huggingface/transformers_bloom_parallel",
+        "https://github.com/huggingface/optimum-benchmark",
+        "https://github.com/huggingface/Mongoku",
+        "https://github.com/huggingface/hf_transfer",
         "https://github.com/huggingface/that_is_good_data",
+        "https://github.com/huggingface/100-times-faster-nlp",
+        "https://github.com/huggingface/fuego",
+        "https://github.com/huggingface/optimum-graphcore",
+        "https://github.com/huggingface/peft",
+        "https://github.com/huggingface/tokenizers",
+        "https://github.com/huggingface/llm.nvim",
+        "https://github.com/huggingface/autotrain-advanced",
+        "https://github.com/huggingface/blog",
         "https://github.com/huggingface/datasets-viewer",
+        "https://github.com/huggingface/huggingface.js",
+        "https://github.com/huggingface/diffusion-models-class",
+        "https://github.com/huggingface/rlhf-interface",
+        "https://github.com/huggingface/neuralcoref",
+        "https://github.com/huggingface/pytorch-pretrained-BigGAN",
+        "https://github.com/huggingface/distil-whisper",
+        "https://github.com/huggingface/quanto",
+        "https://github.com/huggingface/text-embeddings-inference",
+        "https://github.com/huggingface/course",
         "https://github.com/huggingface/evaluate",
+        "https://github.com/huggingface/datasets",
+        "https://github.com/huggingface/optimum-habana",
+        "https://github.com/huggingface/hub-docs",
+        "https://github.com/huggingface/node-question-answering",
+        "https://github.com/huggingface/tune",
+        "https://github.com/huggingface/discord-bots",
         "https://github.com/huggingface/chat-ui",
+        "https://github.com/huggingface/setfit",
+        "https://github.com/huggingface/transformers",
+        "https://github.com/huggingface/swift-coreml-diffusers",
+        "https://github.com/huggingface/OBELICS",
+        "https://github.com/huggingface/text-generation-inference",
+        "https://github.com/huggingface/transfer-learning-conv-ai",
+        "https://github.com/huggingface/llm-intellij",
+        "https://github.com/huggingface/api-inference-community",
+        "https://github.com/huggingface/optimum-nvidia",
+        "https://github.com/huggingface/sharp-transformers",
         "https://github.com/huggingface/exporters",
+        "https://github.com/huggingface/doc-builder",
+        "https://github.com/huggingface/olm-training",
+        "https://github.com/huggingface/deep-rl-class",
+        "https://github.com/huggingface/zapier",
+        "https://github.com/huggingface/hffs",
+        "https://github.com/huggingface/hmtl",
+        "https://github.com/huggingface/block_movement_pruning",
+        "https://github.com/huggingface/data-measurements-tool",
+        "https://github.com/huggingface/knockknock",
+        "https://github.com/huggingface/bloom-jax-inference",
+        "https://github.com/huggingface/frp",
+        "https://github.com/huggingface/gsplat.js",
+        "https://github.com/huggingface/ml-agents",
+        "https://github.com/huggingface/competitions",
+        "https://github.com/huggingface/diffusers",
         "https://github.com/huggingface/hf-endpoints-documentation",
         "https://github.com/gradio-app/gradio"
     ]

data/hugging_face_docs_dataset.py CHANGED Viewed

@@ -180,7 +180,8 @@ def markdown_cleaner(data: str):
 if __name__ == '__main__':
-    repo_urls_file = "./datasets/hf_repositories_urls.json"
     repo_dir = "./datasets/huggingface_repositories/"
     docs_dir = "./datasets/huggingface_docs/"
     download_repositories(repo_urls_file, repo_dir)

 if __name__ == '__main__':
+    # repo_urls_file = "./datasets/hf_repositories_urls.json"
+    repo_urls_file = "./datasets/hf_repositories_urls_scraped.json"
     repo_dir = "./datasets/huggingface_repositories/"
     docs_dir = "./datasets/huggingface_docs/"
     download_repositories(repo_urls_file, repo_dir)

data/{indexer.ipynb → index.ipynb} RENAMED Viewed

@@ -8,6 +8,7 @@
    "source": [
     "import math\n",
     "from pathlib import Path\n",
     "from typing import Any\n",
     "\n",
     "import numpy as np\n",
@@ -18,7 +19,14 @@
     "from langchain.indexes import VectorstoreIndexCreator\n",
     "from langchain.text_splitter import CharacterTextSplitter\n",
     "from langchain.vectorstores import FAISS\n",
-    "from huggingface_hub import HfApi"
    ]
   },
   {
@@ -63,11 +71,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "chunk_size = 512\n",
     "text_splitter = CharacterTextSplitter(\n",
     "    separator=\"\",\n",
-    "    chunk_size=chunk_size,\n",
-    "    chunk_overlap=100,\n",
     "    length_function=len,\n",
     ")\n",
     "docs = text_splitter.create_documents(docs, metadata)\n",
@@ -127,7 +137,7 @@
     "        return all_embeddings\n",
     "\n",
     "\n",
-    "# max length fed to the mode\n",
     "# if longer than CHUNK_SIZE in previous steps: then N chunks + averaging of embeddings\n",
     "max_length = 512\n",
     "embedding_model = AverageInstructEmbeddings(  \n",
@@ -156,13 +166,21 @@
     "index = FAISS.from_documents(docs, embedding_model)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "index_name = f'index-{model_name}-{chunk_size}-m{max_length}-11_Jan_2024'\n",
     "index_name = index_name.replace('/', '_')"
    ]
   },
@@ -198,6 +216,7 @@
     "    print(f\"Document {i} of {len(docs)}\")\n",
     "    print(\"Page Content:\")\n",
     "    print(f\"\\n{'-'*100}\\n\")\n",
     "    print(doc.page_content, '\\n')\n",
     "    print(doc.metadata)"
    ]
@@ -221,6 +240,41 @@
     "    repo_type='dataset',\n",
     ")"
    ]
   }
  ],
  "metadata": {

    "source": [
     "import math\n",
     "from pathlib import Path\n",
+    "from datetime import datetime\n",
     "from typing import Any\n",
     "\n",
     "import numpy as np\n",
     "from langchain.indexes import VectorstoreIndexCreator\n",
     "from langchain.text_splitter import CharacterTextSplitter\n",
     "from langchain.vectorstores import FAISS\n",
+    "from huggingface_hub import HfApi, snapshot_download"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Index building"
    ]
   },
   {
    "metadata": {},
    "outputs": [],
    "source": [
+    "# if split_chunk_size > 512 model is processing first 512 characters of the chunk\n",
+    "split_chunk_size = 800\n",
+    "chunk_overlap = 200\n",
     "text_splitter = CharacterTextSplitter(\n",
     "    separator=\"\",\n",
+    "    chunk_size=split_chunk_size,\n",
+    "    chunk_overlap=chunk_overlap,\n",
     "    length_function=len,\n",
     ")\n",
     "docs = text_splitter.create_documents(docs, metadata)\n",
     "        return all_embeddings\n",
     "\n",
     "\n",
+    "# max length fed to the model\n",
     "# if longer than CHUNK_SIZE in previous steps: then N chunks + averaging of embeddings\n",
     "max_length = 512\n",
     "embedding_model = AverageInstructEmbeddings(  \n",
     "index = FAISS.from_documents(docs, embedding_model)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Index uploading"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
+    "todays_date = datetime.now().strftime('%d_%b_%Y')\n",
+    "index_name = f'index-{model_name}-{split_chunk_size}-{chunk_overlap}-m{max_length}-{todays_date}'\n",
     "index_name = index_name.replace('/', '_')"
    ]
   },
     "    print(f\"Document {i} of {len(docs)}\")\n",
     "    print(\"Page Content:\")\n",
     "    print(f\"\\n{'-'*100}\\n\")\n",
+    "    print(f'length of a chunk: {len(doc.page_content)}')\n",
     "    print(doc.page_content, '\\n')\n",
     "    print(doc.metadata)"
    ]
     "    repo_type='dataset',\n",
     ")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Index inference"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_repo_id = f'KonradSzafer/index-hkunlp_instructor-large-512-m512-11_Jan_2024'\n",
+    "\n",
+    "snapshot_download(\n",
+    "    repo_id=index_repo_id,\n",
+    "    allow_patterns=['*.faiss', '*.pkl'], \n",
+    "    repo_type='dataset',\n",
+    "    local_dir='../indexes/run/'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index = FAISS.load_local('../indexes/run/', embedding_model)\n",
+    "docs = index.similarity_search(query='how to create a pipeline object?', k=5)\n",
+    "docs[0].metadata\n",
+    "docs[0].page_content"
+   ]
   }
  ],
  "metadata": {

data/{indexing_benchmark.ipynb → index_benchmark.ipynb} RENAMED Viewed

File without changes

data/{scrapers → stackoverflow_scrapers}/stack_overflow_scraper.py RENAMED Viewed

File without changes

data/{stackoverflow_python_dataset.py → stackoverflow_scrapers/stackoverflow_python_dataset.py} RENAMED Viewed

File without changes

data/{upload_csv_dataset.py → stackoverflow_scrapers/upload_csv_dataset.py} RENAMED Viewed

File without changes

qa_engine/config.py CHANGED Viewed

@@ -36,7 +36,7 @@ class Config:
     # Discord bot config - optional
     discord_token: str = get_env('DISCORD_TOKEN', '-', warn=False)
-    discord_channel_ids: list[int] = get_env('DISCORD_CHANNEL_IDS', field(default_factory=list), warn=True)
     num_last_messages: int = int(get_env('NUM_LAST_MESSAGES', 2, warn=False))
     use_names_in_context: bool = eval(get_env('USE_NAMES_IN_CONTEXT', 'False', warn=False))
     enable_commands: bool = eval(get_env('ENABLE_COMMANDS', 'True', warn=False))

     # Discord bot config - optional
     discord_token: str = get_env('DISCORD_TOKEN', '-', warn=False)
+    discord_channel_ids: list[int] = get_env('DISCORD_CHANNEL_IDS', field(default_factory=list), warn=False)
     num_last_messages: int = int(get_env('NUM_LAST_MESSAGES', 2, warn=False))
     use_names_in_context: bool = eval(get_env('USE_NAMES_IN_CONTEXT', 'False', warn=False))
     enable_commands: bool = eval(get_env('ENABLE_COMMANDS', 'True', warn=False))

qa_engine/qa_engine.py CHANGED Viewed

@@ -181,30 +181,11 @@ class QAEngine():
         self.first_stage_docs = first_stage_docs
         self.debug = debug
-        if 'local_models/' in llm_model_id:
-            logger.info('using local binary model')
-            self.llm_model = LocalBinaryModel(
-                model_id=llm_model_id
-            )
-        elif 'api_models/' in llm_model_id:
-            logger.info('using api served model')
-            self.llm_model = APIServedModel(
-                model_url=llm_model_id.replace('api_models/', ''),
-                debug=self.debug
-            )
-        elif llm_model_id == 'mock':
-            logger.info('using mock model')
-            self.llm_model = MockLocalBinaryModel()
-        else:
-            logger.info('using transformers pipeline model')
-            self.llm_model = TransformersPipelineModel(
-                model_id=llm_model_id
-            )
         prompt = PromptTemplate(
             template=prompt_template,
             input_variables=['question', 'context']
         )
         self.llm_chain = LLMChain(prompt=prompt, llm=self.llm_model)
         if self.use_docs_for_context:
@@ -228,6 +209,29 @@ class QAEngine():
             self.reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
     @staticmethod
     def _preprocess_question(question: str) -> str:
         if question[-1] != '?':

         self.first_stage_docs = first_stage_docs
         self.debug = debug
         prompt = PromptTemplate(
             template=prompt_template,
             input_variables=['question', 'context']
         )
+        self.llm_model = QAEngine._get_model(llm_model_id)
         self.llm_chain = LLMChain(prompt=prompt, llm=self.llm_model)
         if self.use_docs_for_context:
             self.reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+    @staticmethod
+    def _get_model(llm_model_id: str):
+        if 'local_models/' in llm_model_id:
+            logger.info('using local binary model')
+            return LocalBinaryModel(
+                model_id=llm_model_id
+            )
+        elif 'api_models/' in llm_model_id:
+            logger.info('using api served model')
+            return APIServedModel(
+                model_url=llm_model_id.replace('api_models/', ''),
+                debug=self.debug
+            )
+        elif llm_model_id == 'mock':
+            logger.info('using mock model')
+            return MockLocalBinaryModel()
+        else:
+            logger.info('using transformers pipeline model')
+            return TransformersPipelineModel(
+                model_id=llm_model_id
+            )
     @staticmethod
     def _preprocess_question(question: str) -> str:
         if question[-1] != '?':