added pipeline

Browse files

Files changed (4) hide show

README.md +59 -0
create_handler.ipynb +251 -0
pipeline.py +31 -0
requirements.txt +4 -0

README.md ADDED Viewed

	@@ -0,0 +1,59 @@

+---
+license: bsd-3-clause
+tags:
+- endpoints-template
+pipeline_tag: text-generation
+---
+# Sharded fork of [Salesforce/codegen-6B-mono](https://huggingface.co/Salesforce/codegen-6B-mono) with a custom pipeline.py
+This repository implements a custom `text-generation` pipeline for 🤗 Inference Endpoints that serves the model with bitsandbytes 8-bit quantization. The code for the customized pipeline is in the [pipeline.py](https://huggingface.co/philschmid/codegen-6B-mono-sharded-bnb/blob/main/pipeline.py).
+There is also a [notebook](https://huggingface.co/philschmid/codegen-6B-mono-sharded-bnb/blob/main/create_handler.ipynb) included.
+### Expected request payload
+```json
+{
+    "inputs": "# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distil",
+    "parameters": {
+        "top_k": 100,
+        "max_length": 64,
+        "early_stopping": true,
+        "do_sample": true,
+        "eos_token_id": 50256
+    }
+}
+```
+Below is an example of how to run a request using Python and `requests`.
+## Run Request
+```python
+import json
+from typing import List
+import requests as r
+import base64
+ENDPOINT_URL = ""
+HF_TOKEN = ""
+parameters={
+        "top_k": 100,
+        "max_length": 64,
+        "early_stopping": True,
+        "do_sample": True,
+        "eos_token_id": 50256,
+    }
def predict(code_snippet:str=None):
    """Send ``code_snippet`` to the endpoint and return the decoded JSON reply.

    The request body pairs the snippet (``inputs``) with the ``parameters``
    dict defined above, and the call is authorised by sending ``HF_TOKEN``
    as a bearer token.
    """
    auth_headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    request_body = {"inputs": code_snippet, "parameters": parameters}
    return r.post(ENDPOINT_URL, headers=auth_headers, json=request_body).json()
+prediction = predict(
+    code_snippet="# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distil"
+)
+```
+Expected output:
+```python
+{'generated_text': "# load distilbert model and initialize text-classification pipeline\nmodel_id = 'distilbert-base-uncased'\nmodel_url = 'https://tfhub.dev/tensorflow/small_bert/1'\n\nmodel_dir = './distilBERT'"}
+```

create_handler.ipynb ADDED Viewed

	@@ -0,0 +1,251 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup & Installation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing requirements.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile requirements.txt\n",
+    "bitsandbytes\n",
+    "git+https://github.com/huggingface/transformers.git\n",
+    "accelerate\n",
+    "sentencepiece"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Create Custom Handler for Inference Endpoints\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting pipeline.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile pipeline.py\n",
+    "from typing import  Dict, List, Any\n",
+    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
+    "import torch\n",
+    "\n",
+    "class PreTrainedPipeline():\n",
+    "    def __init__(self, path=\"\"):\n",
+    "        # load the optimized model\n",
+    "        self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16, device_map=\"auto\", load_in_8bit=True)\n",
+    "        self.tokenizer = AutoTokenizer.from_pretrained(path)\n",
+    "\n",
+    "    def __call__(self, data: Any) -> List[List[Dict[str, float]]]:\n",
+    "        \"\"\"\n",
+    "        Args:\n",
+    "            data (:obj:):\n",
+    "                includes the input data and the parameters for the inference.\n",
+    "        Return:\n",
+    "            A :obj:`list`:. The list contains the embeddings of the inference inputs\n",
+    "        \"\"\"\n",
+    "        inputs = data.get(\"inputs\", data)\n",
+    "        parameters = data.get(\"parameters\", {})\n",
+    "\n",
+    "        # tokenize the input\n",
+    "        input_ids = self.tokenizer(inputs,return_tensors=\"pt\").input_ids.to(self.model.device)\n",
+    "        # run the model\n",
+    "        logits = self.model.generate(input_ids, **parameters)\n",
+    "        # Perform pooling\n",
+    "        # postprocess the prediction\n",
+    "        return {\"generated_text\": self.tokenizer.decode(logits[0].tolist())}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "test custom pipeline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "===================================BUG REPORT===================================\n",
+      "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+      "For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link\n",
+      "================================================================================\n",
+      "CUDA SETUP: CUDA runtime path found: /home/ubuntu/miniconda/envs/dev/lib/libcudart.so\n",
+      "CUDA SETUP: Highest compute capability among GPUs detected: 7.5\n",
+      "CUDA SETUP: Detected CUDA version 113\n",
+      "CUDA SETUP: Loading binary /home/ubuntu/miniconda/envs/dev/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pipeline import PreTrainedPipeline\n",
+    "\n",
+    "# init handler\n",
+    "my_handler = PreTrainedPipeline(path=\".\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
+      "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n",
+      "/home/ubuntu/miniconda/envs/dev/lib/python3.9/site-packages/transformers/generation_utils.py:1228: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 20 (`self.config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.\n",
+      "  warnings.warn(\n",
+      "/home/ubuntu/miniconda/envs/dev/lib/python3.9/site-packages/transformers/models/codegen/modeling_codegen.py:167: UserWarning: where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead. (Triggered internally at  ../aten/src/ATen/native/TensorCompare.cpp:333.)\n",
+      "  attn_weights = torch.where(causal_mask, attn_weights, mask_value)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "{'generated_text': 'def hello_world():\\n    return \"Hello World\"\\n\\[email protected](\\'/'}"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "# prepare sample payload\n",
+    "request = {\"inputs\": \"def hello_world():\"}\n",
+    "\n",
+    "# test the handler\n",
+    "my_handler(request)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
+      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'generated_text': \"# load distilbert model and initialize text-classification pipeline\\nmodel_id = 'distilbert-base-uncased'\\nmodel_url = 'https://tfhub.dev/tensorflow/small_bert/1'\\n\\nmodel_dir = './distilBERT'\"}\n"
+     ]
+    }
+   ],
+   "source": [
+    "# prepare sample payload\n",
+    "request = {\n",
+    "    \"inputs\": \"# load distilbert model and initialize text-classification pipeline\\nmodel_id = 'distil\",\n",
+    "    \"parameters\": {\n",
+    "        \"top_k\": 100,\n",
+    "        \"max_length\": 64,\n",
+    "        \"early_stopping\": True,\n",
+    "        \"do_sample\": True,\n",
+    "        \"eos_token_id\": 50256,\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "# test the handler\n",
+    "print(my_handler(request))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "50256"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "my_handler.tokenizer.convert_tokens_to_ids(my_handler.tokenizer.eos_token)\n",
+    "git remote set-url origin https://git-repo/new-repository.git"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.13 ('dev': conda)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "f6dd96c16031089903d5a31ec148b80aeb0d39c32affb1a1080393235fbfa2fc"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

pipeline.py ADDED Viewed

	@@ -0,0 +1,31 @@

+from typing import Dict, List, Any
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
class PreTrainedPipeline:
    """Custom text-generation pipeline for Inference Endpoints.

    Loads a causal language model with 8-bit (bitsandbytes) weights together
    with its tokenizer, and exposes them as a callable that turns a request
    payload into generated text.
    """

    def __init__(self, path=""):
        """Load the model and tokenizer found at ``path``.

        Args:
            path: Location handed to ``from_pretrained`` for both the model
                and the tokenizer.
        """
        # device_map="auto" delegates weight placement to accelerate;
        # load_in_8bit=True requests bitsandbytes int8 quantization.
        self.model = AutoModelForCausalLM.from_pretrained(
            path, torch_dtype=torch.float16, device_map="auto", load_in_8bit=True
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: Any) -> Dict[str, str]:
        """Generate a completion for the prompt contained in ``data``.

        Args:
            data: Request payload. ``data["inputs"]`` is the prompt (when the
                key is absent, ``data`` itself is used) and the optional
                ``data["parameters"]`` mapping is forwarded as keyword
                arguments to ``model.generate``.

        Returns:
            A dict with the single key ``"generated_text"`` holding the
            decoded first sequence returned by ``model.generate``.
        """
        inputs = data.get("inputs", data)
        # Copy so the caller's dict is never mutated; `or {}` also tolerates
        # an explicit `"parameters": null` in the payload.
        generate_kwargs = dict(data.get("parameters") or {})

        # tokenize the input
        encoded = self.tokenizer(inputs, return_tensors="pt")
        device = self.model.device
        input_ids = encoded.input_ids.to(device)

        # Forward the tokenizer's attention mask (unless the caller supplied
        # one) so generate() does not have to guess it from the pad token.
        attention_mask = getattr(encoded, "attention_mask", None)
        if attention_mask is not None and "attention_mask" not in generate_kwargs:
            generate_kwargs["attention_mask"] = attention_mask.to(device)

        # run the model; generate() returns token ids, not logits
        output_ids = self.model.generate(input_ids, **generate_kwargs)

        # postprocess the prediction: decode the first sequence only
        return {"generated_text": self.tokenizer.decode(output_ids[0].tolist())}

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+bitsandbytes
+git+https://github.com/huggingface/transformers.git
+accelerate
+sentencepiece