	Merge pull request #20 from argilla-io/feat/improve-support-local-deployment
Files changed:

- README.md +11 -7
- examples/{argilla_deployment.py → argilla-deployment.py} +5 -2
- examples/fine-tune-modernbert-classifier.ipynb +1 -1
- examples/{openai_local.py → hf-dedicated-or-tgi-deployment.py} +7 -4
- examples/{enforce_mapgie_template copy.py → hf-serverless-deployment.py} +3 -2
- examples/ollama-deployment.py +22 -0
- examples/{ollama_local.py → openai-deployment.py} +6 -3
- examples/vllm-deployment.py +21 -0
- pdm.lock +51 -16
- pyproject.toml +1 -1
- src/synthetic_dataset_generator/_distiset.py +10 -0
- src/synthetic_dataset_generator/apps/base.py +7 -2
- src/synthetic_dataset_generator/apps/chat.py +8 -11
- src/synthetic_dataset_generator/apps/eval.py +3 -1
- src/synthetic_dataset_generator/apps/textcat.py +41 -20
- src/synthetic_dataset_generator/constants.py +53 -25
- src/synthetic_dataset_generator/pipelines/base.py +132 -1
- src/synthetic_dataset_generator/pipelines/chat.py +36 -72
- src/synthetic_dataset_generator/pipelines/textcat.py +12 -52
- src/synthetic_dataset_generator/utils.py +5 -0
    	
README.md CHANGED
@@ -28,7 +28,7 @@ hf_oauth_scopes:
 
 ## Introduction
 
-Synthetic Data Generator is a tool that allows you to create high-quality datasets for training and fine-tuning language models. It leverages the power of distilabel and LLMs to generate synthetic data tailored to your specific needs. [The announcement blog](https://huggingface.co/blog/synthetic-data-generator) goes over a practical example of how to use it but you can also …
+Synthetic Data Generator is a tool that allows you to create high-quality datasets for training and fine-tuning language models. It leverages the power of distilabel and LLMs to generate synthetic data tailored to your specific needs. [The announcement blog](https://huggingface.co/blog/synthetic-data-generator) goes over a practical example of how to use it but you can also watch the [video](https://www.youtube.com/watch?v=nXjVtnGeEss) to see it in action.
 
 Supported Tasks:
 
@@ -76,21 +76,25 @@ launch()
 
 - `HF_TOKEN`: Your [Hugging Face token](https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&tokenType=fineGrained) to push your datasets to the Hugging Face Hub and generate free completions from Hugging Face Inference Endpoints. You can find some configuration examples in the [examples](examples/) folder.
 
-…
+You can set the following environment variables to customize the generation process.
 
 - `MAX_NUM_TOKENS`: The maximum number of tokens to generate, defaults to `2048`.
 - `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`.
 - `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`.
 
-Optionally, you can use different …
+Optionally, you can use different API providers and models.
 
-- `…
-- `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `openai/gpt-4o`, `ollama/llama3.1`.
+- `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`, `llama3.1`.
 - `API_KEY`: The API key to use for the generation API, e.g. `hf_...`, `sk-...`. If not provided, it will default to the provided `HF_TOKEN` environment variable.
+- `OPENAI_BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api.openai.com/v1/`.
+- `OLLAMA_BASE_URL`: The base URL for any Ollama compatible API, e.g. `http://127.0.0.1:11434/`.
+- `HUGGINGFACE_BASE_URL`: The base URL for any Hugging Face compatible API, e.g. TGI server or Dedicated Inference Endpoints. If you want to use serverless inference, only set the `MODEL`.
+- `VLLM_BASE_URL`: The base URL for any VLLM compatible API, e.g. `http://localhost:8000/`.
 
-SFT and Chat Data generation is …
+SFT and Chat Data generation is not supported with OpenAI Endpoints. Additionally, you need to configure it per model family based on their prompt templates using the right `TOKENIZER_ID` and `MAGPIE_PRE_QUERY_TEMPLATE` environment variables.
 
-- `…
+- `TOKENIZER_ID`: The tokenizer ID to use for the magpie pipeline, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`.
+- `MAGPIE_PRE_QUERY_TEMPLATE`: Enforce setting the pre-query template for Magpie, which is only supported with Hugging Face Inference Endpoints. `llama3` and `qwen2` are supported out of the box and will use `"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"` and `"<|im_start|>user\n"`, respectively. For other models, you can pass a custom pre-query template string.
 
 Optionally, you can also push your datasets to Argilla for further curation by setting the following environment variables:
 
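For model families other than the built-in `llama3` and `qwen2`, the last variable accepts a raw template string. A minimal configuration sketch, where the model id and the ChatML-style template are illustrative assumptions rather than values from this PR:

import os

from synthetic_dataset_generator import launch

os.environ["HF_TOKEN"] = "hf_..."  # Hub token, as in the examples below
os.environ["MODEL"] = "my-org/my-chat-model"  # hypothetical model id
os.environ["TOKENIZER_ID"] = "my-org/my-chat-model"  # hypothetical tokenizer id
# Custom pre-query template: the text that precedes the user turn in the model's
# chat format (ChatML-style here, mirroring the qwen2 default quoted above).
os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "<|im_start|>user\n"

launch()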
    	
examples/{argilla_deployment.py → argilla-deployment.py} RENAMED
@@ -9,7 +9,10 @@ import os
 from synthetic_dataset_generator import launch
 
 # Follow https://docs.argilla.io/latest/getting_started/quickstart/ to get your Argilla API key and URL
-os.environ["…
-os.environ["…
+os.environ["HF_TOKEN"] = "hf_..."
+os.environ["ARGILLA_API_URL"] = (
+    "https://[your-owner-name]-[your_space_name].hf.space"  # argilla base url
+)
+os.environ["ARGILLA_API_KEY"] = "my_api_key"  # argilla api key
 
 launch()
        examples/fine-tune-modernbert-classifier.ipynb
    CHANGED
    
    | @@ -530,7 +530,7 @@ | |
| 530 | 
             
               "name": "python",
         | 
| 531 | 
             
               "nbconvert_exporter": "python",
         | 
| 532 | 
             
               "pygments_lexer": "ipython3",
         | 
| 533 | 
            -
               "version": "3.11. | 
| 534 | 
             
              }
         | 
| 535 | 
             
             },
         | 
| 536 | 
             
             "nbformat": 4,
         | 
|  | |
| 530 | 
             
               "name": "python",
         | 
| 531 | 
             
               "nbconvert_exporter": "python",
         | 
| 532 | 
             
               "pygments_lexer": "ipython3",
         | 
| 533 | 
            +
               "version": "3.11.11"
         | 
| 534 | 
             
              }
         | 
| 535 | 
             
             },
         | 
| 536 | 
             
             "nbformat": 4,
         | 
    	
examples/{openai_local.py → hf-dedicated-or-tgi-deployment.py} RENAMED
@@ -8,9 +8,12 @@ import os
 
 from synthetic_dataset_generator import launch
 
-…
-os.environ["…
-os.environ["…
-os.environ["…
+os.environ["HF_TOKEN"] = "hf_..."  # push the data to huggingface
+os.environ["HUGGINGFACE_BASE_URL"] = "http://127.0.0.1:3000/"  # dedicated endpoint/TGI
+os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3"  # magpie template
+os.environ["TOKENIZER_ID"] = (
+    "meta-llama/Llama-3.1-8B-Instruct"  # tokenizer for model hosted on endpoint
+)
+os.environ["MODEL"] = None  # model is linked to endpoint
 
 launch()
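One caveat with the rewritten example: `os.environ` values must be strings, so `os.environ["MODEL"] = None` raises `TypeError` if executed as-is (the same holds for the `MAGPIE_PRE_QUERY_TEMPLATE = None` assignment in `openai-deployment.py` below). An assumed equivalent of "the model is linked to the endpoint" is simply to make sure the variable is unset:

import os

os.environ.pop("MODEL", None)  # leave MODEL undefined; the endpoint determines the model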
    	
examples/{enforce_mapgie_template copy.py → hf-serverless-deployment.py} RENAMED
@@ -8,7 +8,8 @@ import os
 
 from synthetic_dataset_generator import launch
 
-os.environ["…
-os.environ["MODEL"] = "…
+os.environ["HF_TOKEN"] = "hf_..."  # push the data to huggingface
+os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct"  # use instruct model
+os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3"  # use the template for the model
 
 launch()
    	
examples/ollama-deployment.py ADDED
@@ -0,0 +1,22 @@
+# /// script
+# requires-python = ">=3.11,<3.12"
+# dependencies = [
+#     "synthetic-dataset-generator",
+# ]
+# ///
+# ollama serve
+# ollama run qwen2.5:32b-instruct-q5_K_S
+import os
+
+from synthetic_dataset_generator import launch
+
+os.environ["HF_TOKEN"] = "hf_..."  # push the data to huggingface
+os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/"  # ollama base url
+os.environ["MODEL"] = "qwen2.5:32b-instruct-q5_K_S"  # model id
+os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-32B-Instruct"  # tokenizer id
+os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2"
+os.environ["MAX_NUM_ROWS"] = "10000"
+os.environ["DEFAULT_BATCH_SIZE"] = "2"
+os.environ["MAX_NUM_TOKENS"] = "1024"
+
+launch()
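The `# /// script` block at the top is PEP 723 inline metadata, so this example should be runnable directly with any PEP 723-aware runner, e.g. `uv run examples/ollama-deployment.py` (an assumption about your tooling, not something this PR sets up); the runner resolves the `synthetic-dataset-generator` dependency on the fly once the two `ollama` commands from the comments are running.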
    	
examples/{ollama_local.py → openai-deployment.py} RENAMED
@@ -4,12 +4,15 @@
 #     "synthetic-dataset-generator",
 # ]
 # ///
+
 import os
 
 from synthetic_dataset_generator import launch
 
-…
-os.environ["…
-os.environ["…
+os.environ["HF_TOKEN"] = "hf_..."  # push the data to huggingface
+os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1/"  # openai base url
+os.environ["API_KEY"] = os.getenv("OPENAI_API_KEY")  # openai api key
+os.environ["MODEL"] = "gpt-4o"  # model id
+os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = None  # chat data not supported with OpenAI
 
 launch()
    	
examples/vllm-deployment.py ADDED
@@ -0,0 +1,21 @@
+# /// script
+# requires-python = ">=3.11,<3.12"
+# dependencies = [
+#     "synthetic-dataset-generator",
+# ]
+# ///
+# vllm serve Qwen/Qwen2.5-1.5B-Instruct
+import os
+
+from synthetic_dataset_generator import launch
+
+os.environ["HF_TOKEN"] = "hf_..."  # push the data to huggingface
+os.environ["VLLM_BASE_URL"] = "http://127.0.0.1:8000/"  # vllm base url
+os.environ["MODEL"] = "Qwen/Qwen2.5-1.5B-Instruct"  # model id
+os.environ["TOKENIZER_ID"] = "Qwen/Qwen2.5-1.5B-Instruct"  # tokenizer id
+os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "qwen2"
+os.environ["MAX_NUM_ROWS"] = "10000"
+os.environ["DEFAULT_BATCH_SIZE"] = "2"
+os.environ["MAX_NUM_TOKENS"] = "1024"
+
+launch()
    	
pdm.lock CHANGED
@@ -5,7 +5,7 @@
 groups = ["default"]
 strategy = ["inherit_metadata"]
 lock_version = "4.5.0"
-content_hash = "sha256:…
+content_hash = "sha256:e95140895657d62ad438ff1815ddf1798abbb342ddd2649ae462620b8b3f5350"
 
 [[metadata.targets]]
 requires_python = ">=3.10,<3.13"
@@ -491,8 +491,11 @@ files = [
 
 [[package]]
 name = "distilabel"
-version = "1.4.1"
+version = "1.5.0"
 requires_python = ">=3.9"
+git = "https://github.com/argilla-io/distilabel.git"
+ref = "feat/add-magpie-support-llama-cpp-ollama"
+revision = "4e291e7bf1c27b734a683a3af1fefe58965d77d6"
 summary = "Distilabel is an AI Feedback (AIF) framework for building datasets with and for LLMs."
 groups = ["default"]
 dependencies = [
@@ -512,30 +515,30 @@ dependencies = [
     "typer>=0.9.0",
     "universal-pathlib>=0.2.2",
 ]
-files = [
-    {file = "distilabel-1.4.1-py3-none-any.whl", hash = "sha256:4643da7f3abae86a330d86d1498443ea56978e462e21ae3d106a4c6013386965"},
-    {file = "distilabel-1.4.1.tar.gz", hash = "sha256:0c373be234e8f2982ec7f940d9a95585b15306b6ab5315f5a6a45214d8f34006"},
-]
 
 [[package]]
 name = "distilabel"
-version = "1.4.1"
-extras = ["argilla", "hf-inference-endpoints", "instructor", "outlines"]
+version = "1.5.0"
+extras = ["argilla", "hf-inference-endpoints", "hf-transformers", "instructor", "llama-cpp", "ollama", "openai", "outlines"]
 requires_python = ">=3.9"
+git = "https://github.com/argilla-io/distilabel.git"
+ref = "feat/add-magpie-support-llama-cpp-ollama"
+revision = "4e291e7bf1c27b734a683a3af1fefe58965d77d6"
 summary = "Distilabel is an AI Feedback (AIF) framework for building datasets with and for LLMs."
 groups = ["default"]
 dependencies = [
     "argilla>=2.0.0",
-    "distilabel…
+    "distilabel @ git+https://github.com/argilla-io/distilabel.git@feat/add-magpie-support-llama-cpp-ollama",
     "huggingface-hub>=0.22.0",
     "instructor>=1.2.3",
     "ipython",
+    "llama-cpp-python>=0.2.0",
     "numba>=0.54.0",
+    "ollama>=0.1.7",
+    "openai>=1.0.0",
     "outlines>=0.0.40",
-]
-files = [
-    {file = "distilabel-1.4.1-py3-none-any.whl", hash = "sha256:4643da7f3abae86a330d86d1498443ea56978e462e21ae3d106a4c6013386965"},
-    {file = "distilabel-1.4.1.tar.gz", hash = "sha256:0c373be234e8f2982ec7f940d9a95585b15306b6ab5315f5a6a45214d8f34006"},
+    "torch>=2.0.0",
+    "transformers>=4.34.1",
 ]
 
 [[package]]
@@ -824,7 +827,7 @@ files = [
 
 [[package]]
 name = "httpx"
-version = "0.…
+version = "0.27.2"
 requires_python = ">=3.8"
 summary = "The next generation HTTP client."
 groups = ["default"]
@@ -833,10 +836,11 @@ dependencies = [
     "certifi",
     "httpcore==1.*",
     "idna",
+    "sniffio",
 ]
 files = [
-    {file = "httpx-0.…
-    {file = "httpx-0.…
+    {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
+    {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
 ]
 
 [[package]]
@@ -1068,6 +1072,22 @@ files = [
     {file = "lark-1.2.2.tar.gz", hash = "sha256:ca807d0162cd16cef15a8feecb862d7319e7a09bdb13aef927968e45040fed80"},
 ]
 
+[[package]]
+name = "llama-cpp-python"
+version = "0.3.5"
+requires_python = ">=3.8"
+summary = "Python bindings for the llama.cpp library"
+groups = ["default"]
+dependencies = [
+    "diskcache>=5.6.1",
+    "jinja2>=2.11.3",
+    "numpy>=1.20.0",
+    "typing-extensions>=4.5.0",
+]
+files = [
+    {file = "llama_cpp_python-0.3.5.tar.gz", hash = "sha256:f5ce47499d53d3973e28ca5bdaf2dfe820163fa3fb67e3050f98e2e9b58d2cf6"},
+]
+
 [[package]]
 name = "llvmlite"
 version = "0.43.0"
@@ -1538,6 +1558,21 @@ files = [
     {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"},
 ]
 
+[[package]]
+name = "ollama"
+version = "0.4.4"
+requires_python = "<4.0,>=3.8"
+summary = "The official Python client for Ollama."
+groups = ["default"]
+dependencies = [
+    "httpx<0.28.0,>=0.27.0",
+    "pydantic<3.0.0,>=2.9.0",
+]
+files = [
+    {file = "ollama-0.4.4-py3-none-any.whl", hash = "sha256:0f466e845e2205a1cbf5a2fef4640027b90beaa3b06c574426d8b6b17fd6e139"},
+    {file = "ollama-0.4.4.tar.gz", hash = "sha256:e1db064273c739babc2dde9ea84029c4a43415354741b6c50939ddd3dd0f7ffb"},
+]
+
 [[package]]
 name = "openai"
 version = "1.57.4"
    	
pyproject.toml CHANGED
    | @@ -18,7 +18,7 @@ readme = "README.md" | |
| 18 | 
             
            license = {text = "Apache 2"}
         | 
| 19 |  | 
| 20 | 
             
            dependencies = [
         | 
| 21 | 
            -
                "distilabel[hf-inference-endpoints, | 
| 22 | 
             
                "gradio[oauth]>=5.4.0,<6.0.0",
         | 
| 23 | 
             
                "transformers>=4.44.2,<5.0.0",
         | 
| 24 | 
             
                "sentence-transformers>=3.2.0,<4.0.0",
         | 
|  | |
| 18 | 
             
            license = {text = "Apache 2"}
         | 
| 19 |  | 
| 20 | 
             
            dependencies = [
         | 
| 21 | 
            +
                "distilabel[argilla,hf-inference-endpoints,hf-transformers,instructor,llama-cpp,ollama,openai,outlines,vllm] @ git+https://github.com/argilla-io/distilabel.git@develop",
         | 
| 22 | 
             
                "gradio[oauth]>=5.4.0,<6.0.0",
         | 
| 23 | 
             
                "transformers>=4.44.2,<5.0.0",
         | 
| 24 | 
             
                "sentence-transformers>=3.2.0,<4.0.0",
         | 
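Note that pyproject.toml now tracks distilabel's `develop` branch while pdm.lock above pins the `feat/add-magpie-support-llama-cpp-ollama` branch at revision `4e291e7b`; installs that go through the lock file therefore resolve to the pinned revision rather than the tip of `develop`.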
    	
src/synthetic_dataset_generator/_distiset.py CHANGED
@@ -81,6 +81,15 @@ class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
                 dataset[0] if not isinstance(dataset, dict) else dataset["train"][0]
             )
 
+        keys = list(sample_records.keys())
+        if len(keys) != 2 or not (
+            ("label" in keys and "text" in keys)
+            or ("labels" in keys and "text" in keys)
+        ):
+            task_categories = ["text-classification"]
+        elif "prompt" in keys or "messages" in keys:
+            task_categories = ["text-generation", "text2text-generation"]
+
         readme_metadata = {}
         if repo_id and token:
             readme_metadata = self._extract_readme_metadata(repo_id, token)
@@ -90,6 +99,7 @@ class CustomDistisetWithAdditionalTag(distilabel.distiset.Distiset):
             "size_categories": size_categories_parser(
                 max(len(dataset) for dataset in self.values())
             ),
+            "task_categories": task_categories,
             "tags": [
                 "synthetic",
                 "distilabel",
        src/synthetic_dataset_generator/apps/base.py
    CHANGED
    
    | @@ -77,10 +77,15 @@ def validate_push_to_hub(org_name, repo_name): | |
| 77 | 
             
                return repo_id
         | 
| 78 |  | 
| 79 |  | 
| 80 | 
            -
            def combine_datasets( | 
|  | |
|  | |
| 81 | 
             
                try:
         | 
| 82 | 
             
                    new_dataset = load_dataset(
         | 
| 83 | 
            -
                        repo_id, | 
|  | |
|  | |
|  | |
| 84 | 
             
                    )
         | 
| 85 | 
             
                    return concatenate_datasets([dataset, new_dataset])
         | 
| 86 | 
             
                except Exception:
         | 
|  | |
| 77 | 
             
                return repo_id
         | 
| 78 |  | 
| 79 |  | 
| 80 | 
            +
            def combine_datasets(
         | 
| 81 | 
            +
                repo_id: str, dataset: Dataset, oauth_token: Union[OAuthToken, None]
         | 
| 82 | 
            +
            ) -> Dataset:
         | 
| 83 | 
             
                try:
         | 
| 84 | 
             
                    new_dataset = load_dataset(
         | 
| 85 | 
            +
                        repo_id,
         | 
| 86 | 
            +
                        split="train",
         | 
| 87 | 
            +
                        download_mode="force_redownload",
         | 
| 88 | 
            +
                        token=oauth_token.token,
         | 
| 89 | 
             
                    )
         | 
| 90 | 
             
                    return concatenate_datasets([dataset, new_dataset])
         | 
| 91 | 
             
                except Exception:
         | 
    	
        src/synthetic_dataset_generator/apps/chat.py
    CHANGED
    
    | @@ -25,12 +25,12 @@ from synthetic_dataset_generator.constants import ( | |
| 25 | 
             
                MODEL,
         | 
| 26 | 
             
                SFT_AVAILABLE,
         | 
| 27 | 
             
            )
         | 
|  | |
| 28 | 
             
            from synthetic_dataset_generator.pipelines.chat import (
         | 
| 29 | 
             
                DEFAULT_DATASET_DESCRIPTIONS,
         | 
| 30 | 
             
                generate_pipeline_code,
         | 
| 31 | 
             
                get_magpie_generator,
         | 
| 32 | 
             
                get_prompt_generator,
         | 
| 33 | 
            -
                get_prompt_rewriter,
         | 
| 34 | 
             
                get_response_generator,
         | 
| 35 | 
             
            )
         | 
| 36 | 
             
            from synthetic_dataset_generator.pipelines.embeddings import (
         | 
| @@ -40,6 +40,7 @@ from synthetic_dataset_generator.pipelines.embeddings import ( | |
| 40 | 
             
            from synthetic_dataset_generator.utils import (
         | 
| 41 | 
             
                get_argilla_client,
         | 
| 42 | 
             
                get_org_dropdown,
         | 
|  | |
| 43 | 
             
                swap_visibility,
         | 
| 44 | 
             
            )
         | 
| 45 |  | 
| @@ -106,7 +107,6 @@ def generate_dataset( | |
| 106 | 
             
            ) -> pd.DataFrame:
         | 
| 107 | 
             
                num_rows = test_max_num_rows(num_rows)
         | 
| 108 | 
             
                progress(0.0, desc="(1/2) Generating instructions")
         | 
| 109 | 
            -
                prompt_rewriter = get_prompt_rewriter()
         | 
| 110 | 
             
                magpie_generator = get_magpie_generator(
         | 
| 111 | 
             
                    system_prompt, num_turns, temperature, is_sample
         | 
| 112 | 
             
                )
         | 
| @@ -117,14 +117,7 @@ def generate_dataset( | |
| 117 | 
             
                batch_size = DEFAULT_BATCH_SIZE
         | 
| 118 |  | 
| 119 | 
             
                # create prompt rewrites
         | 
| 120 | 
            -
                 | 
| 121 | 
            -
                    {
         | 
| 122 | 
            -
                        "instruction": f"Rewrite this prompt keeping the same structure but highlighting different aspects of the original without adding anything new. Original prompt: {system_prompt} Rewritten prompt: "
         | 
| 123 | 
            -
                    }
         | 
| 124 | 
            -
                    for i in range(int(num_rows / 50))
         | 
| 125 | 
            -
                ]
         | 
| 126 | 
            -
                batch = list(prompt_rewriter.process(inputs=inputs))
         | 
| 127 | 
            -
                prompt_rewrites = [entry["generation"] for entry in batch[0]] + [system_prompt]
         | 
| 128 |  | 
| 129 | 
             
                # create instructions
         | 
| 130 | 
             
                n_processed = 0
         | 
| @@ -142,6 +135,7 @@ def generate_dataset( | |
| 142 | 
             
                    batch = list(magpie_generator.process(inputs=inputs))
         | 
| 143 | 
             
                    magpie_results.extend(batch[0])
         | 
| 144 | 
             
                    n_processed += batch_size
         | 
|  | |
| 145 | 
             
                progress(0.5, desc="(1/2) Generating instructions")
         | 
| 146 |  | 
| 147 | 
             
                # generate responses
         | 
| @@ -158,6 +152,7 @@ def generate_dataset( | |
| 158 | 
             
                        responses = list(response_generator.process(inputs=batch))
         | 
| 159 | 
             
                        response_results.extend(responses[0])
         | 
| 160 | 
             
                        n_processed += batch_size
         | 
|  | |
| 161 | 
             
                    for result in response_results:
         | 
| 162 | 
             
                        result["prompt"] = result["instruction"]
         | 
| 163 | 
             
                        result["completion"] = result["generation"]
         | 
| @@ -178,6 +173,7 @@ def generate_dataset( | |
| 178 | 
             
                        responses = list(response_generator.process(inputs=batch))
         | 
| 179 | 
             
                        response_results.extend(responses[0])
         | 
| 180 | 
             
                        n_processed += batch_size
         | 
|  | |
| 181 | 
             
                    for result in response_results:
         | 
| 182 | 
             
                        result["messages"].append(
         | 
| 183 | 
             
                            {"role": "assistant", "content": result["generation"]}
         | 
| @@ -236,7 +232,7 @@ def push_dataset_to_hub( | |
| 236 | 
             
                dataframe = convert_dataframe_messages(dataframe)
         | 
| 237 | 
             
                progress(0.7, desc="Creating dataset")
         | 
| 238 | 
             
                dataset = Dataset.from_pandas(dataframe)
         | 
| 239 | 
            -
                dataset = combine_datasets(repo_id, dataset)
         | 
| 240 | 
             
                progress(0.9, desc="Pushing dataset")
         | 
| 241 | 
             
                distiset = Distiset({"default": dataset})
         | 
| 242 | 
             
                distiset.push_to_hub(
         | 
| @@ -600,4 +596,5 @@ with gr.Blocks() as app: | |
| 600 | 
             
                            outputs=[dataset_description, system_prompt, num_turns, dataframe],
         | 
| 601 | 
             
                        )
         | 
| 602 | 
             
                        app.load(fn=get_org_dropdown, outputs=[org_name])
         | 
|  | |
| 603 | 
             
                    app.load(fn=swap_visibility, outputs=main_ui)
         | 
|  | |
| 25 | 
             
                MODEL,
         | 
| 26 | 
             
                SFT_AVAILABLE,
         | 
| 27 | 
             
            )
         | 
| 28 | 
            +
            from synthetic_dataset_generator.pipelines.base import get_rewriten_prompts
         | 
| 29 | 
             
            from synthetic_dataset_generator.pipelines.chat import (
         | 
| 30 | 
             
                DEFAULT_DATASET_DESCRIPTIONS,
         | 
| 31 | 
             
                generate_pipeline_code,
         | 
| 32 | 
             
                get_magpie_generator,
         | 
| 33 | 
             
                get_prompt_generator,
         | 
|  | |
| 34 | 
             
                get_response_generator,
         | 
| 35 | 
             
            )
         | 
| 36 | 
             
            from synthetic_dataset_generator.pipelines.embeddings import (
         | 
|  | |
| 40 | 
             
            from synthetic_dataset_generator.utils import (
         | 
| 41 | 
             
                get_argilla_client,
         | 
| 42 | 
             
                get_org_dropdown,
         | 
| 43 | 
            +
                get_random_repo_name,
         | 
| 44 | 
             
                swap_visibility,
         | 
| 45 | 
             
            )
         | 
| 46 |  | 
|  | |
| 107 | 
             
            ) -> pd.DataFrame:
         | 
| 108 | 
             
                num_rows = test_max_num_rows(num_rows)
         | 
| 109 | 
             
                progress(0.0, desc="(1/2) Generating instructions")
         | 
|  | |
| 110 | 
             
                magpie_generator = get_magpie_generator(
         | 
| 111 | 
             
                    system_prompt, num_turns, temperature, is_sample
         | 
| 112 | 
             
                )
         | 
|  | |
| 117 | 
             
                batch_size = DEFAULT_BATCH_SIZE
         | 
| 118 |  | 
| 119 | 
             
                # create prompt rewrites
         | 
| 120 | 
            +
                prompt_rewrites = get_rewriten_prompts(system_prompt, num_rows)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 121 |  | 
| 122 | 
             
                # create instructions
         | 
| 123 | 
             
                n_processed = 0
         | 
|  | |
| 135 | 
             
                    batch = list(magpie_generator.process(inputs=inputs))
         | 
| 136 | 
             
                    magpie_results.extend(batch[0])
         | 
| 137 | 
             
                    n_processed += batch_size
         | 
| 138 | 
            +
                    random.seed(a=random.randint(0, 2**32 - 1))
         | 
| 139 | 
             
                progress(0.5, desc="(1/2) Generating instructions")
         | 
| 140 |  | 
| 141 | 
             
                # generate responses
         | 
|  | |
| 152 | 
             
                        responses = list(response_generator.process(inputs=batch))
         | 
| 153 | 
             
                        response_results.extend(responses[0])
         | 
| 154 | 
             
                        n_processed += batch_size
         | 
| 155 | 
            +
                        random.seed(a=random.randint(0, 2**32 - 1))
         | 
| 156 | 
             
                    for result in response_results:
         | 
| 157 | 
             
                        result["prompt"] = result["instruction"]
         | 
| 158 | 
             
                        result["completion"] = result["generation"]
         | 
|  | |
| 173 | 
             
                        responses = list(response_generator.process(inputs=batch))
         | 
| 174 | 
             
                        response_results.extend(responses[0])
         | 
| 175 | 
             
                        n_processed += batch_size
         | 
| 176 | 
            +
                        random.seed(a=random.randint(0, 2**32 - 1))
         | 
| 177 | 
             
                    for result in response_results:
         | 
| 178 | 
             
                        result["messages"].append(
         | 
| 179 | 
             
                            {"role": "assistant", "content": result["generation"]}
         | 
|  | |
| 232 | 
             
                dataframe = convert_dataframe_messages(dataframe)
         | 
| 233 | 
             
                progress(0.7, desc="Creating dataset")
         | 
| 234 | 
             
                dataset = Dataset.from_pandas(dataframe)
         | 
| 235 | 
            +
                dataset = combine_datasets(repo_id, dataset, oauth_token)
         | 
| 236 | 
             
                progress(0.9, desc="Pushing dataset")
         | 
| 237 | 
             
                distiset = Distiset({"default": dataset})
         | 
| 238 | 
             
                distiset.push_to_hub(
         | 
|  | |
| 596 | 
             
                            outputs=[dataset_description, system_prompt, num_turns, dataframe],
         | 
| 597 | 
             
                        )
         | 
| 598 | 
             
                        app.load(fn=get_org_dropdown, outputs=[org_name])
         | 
| 599 | 
            +
                    app.load(fn=get_random_repo_name, outputs=[repo_name])
         | 
| 600 | 
             
                    app.load(fn=swap_visibility, outputs=main_ui)
         | 
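The `random.seed(a=random.randint(0, 2**32 - 1))` calls added after each batch re-randomize Python's global RNG, presumably so that batch-level sampling does not replay an identical stream when an earlier component has pinned the seed. A minimal self-contained illustration; the fixed seed of 42 stands in for whatever upstream code pins it:

import random

random.seed(42)  # stand-in for an upstream component fixing the global seed
batch_a = [random.randint(0, 9) for _ in range(5)]

random.seed(a=random.randint(0, 2**32 - 1))  # the line this PR adds after each batch
batch_b = [random.randint(0, 9) for _ in range(5)]

print(batch_a, batch_b)  # the second batch no longer replays the same stream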
    	
src/synthetic_dataset_generator/apps/eval.py CHANGED
@@ -41,6 +41,7 @@ from synthetic_dataset_generator.utils import (
     extract_column_names,
     get_argilla_client,
     get_org_dropdown,
+    get_random_repo_name,
     pad_or_truncate_list,
     process_columns,
     swap_visibility,
@@ -359,7 +360,7 @@ def push_dataset_to_hub(
 ):
     repo_id = validate_push_to_hub(org_name, repo_name)
     dataset = Dataset.from_pandas(dataframe)
-    dataset = combine_datasets(repo_id, dataset)
+    dataset = combine_datasets(repo_id, dataset, oauth_token)
     distiset = Distiset({"default": dataset})
     distiset.push_to_hub(
         repo_id=repo_id,
@@ -907,3 +908,4 @@ with gr.Blocks() as app:
 
     app.load(fn=swap_visibility, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])
+    app.load(fn=get_random_repo_name, outputs=[repo_name])
    	
src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -20,6 +20,7 @@ from synthetic_dataset_generator.apps.base import (
     validate_push_to_hub,
 )
 from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
+from synthetic_dataset_generator.pipelines.base import get_rewriten_prompts
 from synthetic_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
@@ -35,6 +36,7 @@ from synthetic_dataset_generator.utils import (
     get_argilla_client,
     get_org_dropdown,
     get_preprocess_labels,
+    get_random_repo_name,
     swap_visibility,
 )
 
@@ -106,7 +108,7 @@ def generate_dataset(
     )
     updated_system_prompt = f"{system_prompt}. Optional labels: {', '.join(labels)}."
     if multi_label:
-        updated_system_prompt = f"{updated_system_prompt}. Only apply relevant labels. Applying less labels is better than applying too many labels."
+…
     labeller_generator = get_labeller_generator(
         system_prompt=updated_system_prompt,
         labels=labels,
@@ -118,6 +120,7 @@ def generate_dataset(
     # create text classification data
     n_processed = 0
     textcat_results = []
+…
     while n_processed < num_rows:
         progress(
             2 * 0.5 * n_processed / num_rows,
@@ -128,25 +131,24 @@ def generate_dataset(
         batch_size = min(batch_size, remaining_rows)
         inputs = []
         for _ in range(batch_size):
+…
             if multi_label:
                 num_labels = len(labels)
                 k = int(
                     random.betavariate(alpha=(num_labels - 1), beta=num_labels)
                     * num_labels
                 )
-            else:
-                k = 1
-
             sampled_labels = random.sample(labels, min(k, len(labels)))
             random.shuffle(sampled_labels)
             inputs.append(
                 {
-                    "task": f"{…
+…
                 }
             )
         batch = list(textcat_generator.process(inputs=inputs))
         textcat_results.extend(batch[0])
         n_processed += batch_size
+        random.seed(a=random.randint(0, 2**32 - 1))
     for result in textcat_results:
         result["text"] = result["input_text"]
 
@@ -164,6 +166,7 @@ def generate_dataset(
         labels_batch = list(labeller_generator.process(inputs=batch))
         labeller_results.extend(labels_batch[0])
         n_processed += batch_size
+        random.seed(a=random.randint(0, 2**32 - 1))
     progress(
         1,
         total=total_steps,
@@ -178,26 +181,43 @@ def generate_dataset(
 
     dataframe = pd.DataFrame(distiset_results)
     if multi_label:
-…
+…
                     label.lower().strip()
-                    if (label is not None and label.lower().strip() in labels)
-                    else random.choice(labels)
                     for label in x
-…
             )
-…
+…
         dataframe = dataframe[dataframe["labels"].notna()]
     else:
+…
         dataframe = dataframe.rename(columns={"labels": "label"})
-        dataframe["label"] = dataframe["label"].apply(
-            lambda x: x.lower().strip()
-            if x and x.lower().strip() in labels
-            else random.choice(labels)
-        )
     dataframe = dataframe[dataframe["text"].notna()]
 
     progress(1.0, desc="Dataset created")
@@ -235,7 +255,7 @@ def push_dataset_to_hub(
         dataframe.reset_index(drop=True),
         features=features,
     )
-    dataset = combine_datasets(repo_id, dataset)
+    dataset = combine_datasets(repo_id, dataset, oauth_token)
     distiset = Distiset({"default": dataset})
     progress(0.9, desc="Pushing dataset")
     distiset.push_to_hub(
@@ -647,3 +667,4 @@ with gr.Blocks() as app:
 
     app.load(fn=swap_visibility, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])
+    app.load(fn=get_random_repo_name, outputs=[repo_name])
                app.load(fn=get_org_dropdown, outputs=[org_name])
         | 
|  | 
|  | |
| 20 | 
             
                validate_push_to_hub,
         | 
| 21 | 
             
            )
         | 
| 22 | 
             
            from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE
         | 
| 23 | 
            +
            from synthetic_dataset_generator.pipelines.base import get_rewriten_prompts
         | 
| 24 | 
             
            from synthetic_dataset_generator.pipelines.embeddings import (
         | 
| 25 | 
             
                get_embeddings,
         | 
| 26 | 
             
                get_sentence_embedding_dimensions,
         | 
|  | |
| 36 | 
             
                get_argilla_client,
         | 
| 37 | 
             
                get_org_dropdown,
         | 
| 38 | 
             
                get_preprocess_labels,
         | 
| 39 | 
            +
                get_random_repo_name,
         | 
| 40 | 
             
                swap_visibility,
         | 
| 41 | 
             
            )
         | 
| 42 |  | 
|  | |
| 108 | 
             
                )
         | 
| 109 | 
             
                updated_system_prompt = f"{system_prompt}. Optional labels: {', '.join(labels)}."
         | 
| 110 | 
             
                if multi_label:
         | 
| 111 | 
            +
                    updated_system_prompt = f"{updated_system_prompt}. Only apply relevant labels. Applying less labels is always better than applying too many labels."
         | 
| 112 | 
             
                labeller_generator = get_labeller_generator(
         | 
| 113 | 
             
                    system_prompt=updated_system_prompt,
         | 
| 114 | 
             
                    labels=labels,
         | 
|  | |
| 120 | 
             
                # create text classification data
         | 
| 121 | 
             
                n_processed = 0
         | 
| 122 | 
             
                textcat_results = []
         | 
| 123 | 
            +
                rewritten_system_prompts = get_rewriten_prompts(system_prompt, num_rows)
         | 
| 124 | 
             
                while n_processed < num_rows:
         | 
| 125 | 
             
                    progress(
         | 
| 126 | 
             
                        2 * 0.5 * n_processed / num_rows,
         | 
|  | |
| 131 | 
             
                    batch_size = min(batch_size, remaining_rows)
         | 
| 132 | 
             
                    inputs = []
         | 
| 133 | 
             
                    for _ in range(batch_size):
         | 
| 134 | 
            +
                        k = 1
         | 
| 135 | 
             
                        if multi_label:
         | 
| 136 | 
             
                            num_labels = len(labels)
         | 
| 137 | 
             
                            k = int(
         | 
| 138 | 
             
                                random.betavariate(alpha=(num_labels - 1), beta=num_labels)
         | 
| 139 | 
             
                                * num_labels
         | 
| 140 | 
             
                            )
         | 
|  | |
|  | |
|  | |
| 141 | 
             
                        sampled_labels = random.sample(labels, min(k, len(labels)))
         | 
| 142 | 
             
                        random.shuffle(sampled_labels)
         | 
| 143 | 
             
                        inputs.append(
         | 
| 144 | 
             
                            {
         | 
| 145 | 
            +
                                "task": f"{random.choice(rewritten_system_prompts)}. The text represents the following categories: {', '.join(sampled_labels)}"
         | 
| 146 | 
             
                            }
         | 
| 147 | 
             
                        )
         | 
| 148 | 
             
                    batch = list(textcat_generator.process(inputs=inputs))
         | 
| 149 | 
             
                    textcat_results.extend(batch[0])
         | 
| 150 | 
             
                    n_processed += batch_size
         | 
| 151 | 
            +
                    random.seed(a=random.randint(0, 2**32 - 1))
         | 
| 152 | 
             
                for result in textcat_results:
         | 
| 153 | 
             
                    result["text"] = result["input_text"]
         | 
| 154 |  | 
|  | |
| 166 | 
             
                    labels_batch = list(labeller_generator.process(inputs=batch))
         | 
| 167 | 
             
                    labeller_results.extend(labels_batch[0])
         | 
| 168 | 
             
                    n_processed += batch_size
         | 
| 169 | 
            +
                    random.seed(a=random.randint(0, 2**32 - 1))
         | 
| 170 | 
             
                progress(
         | 
| 171 | 
             
                    1,
         | 
| 172 | 
             
                    total=total_steps,
         | 
|  | |
| 181 |  | 
| 182 | 
             
                dataframe = pd.DataFrame(distiset_results)
         | 
| 183 | 
             
                if multi_label:
         | 
| 184 | 
            +
             | 
| 185 | 
            +
                    def _validate_labels(x):
         | 
| 186 | 
            +
                        if isinstance(x, str):  # single label
         | 
| 187 | 
            +
                            return [x.lower().strip()]
         | 
| 188 | 
            +
                        elif isinstance(x, list):  # multiple labels
         | 
| 189 | 
            +
                            return list(
         | 
| 190 | 
            +
                                set(
         | 
| 191 | 
             
                                    label.lower().strip()
         | 
|  | |
|  | |
| 192 | 
             
                                    for label in x
         | 
| 193 | 
            +
                                    if label.lower().strip() in labels
         | 
| 194 | 
            +
                                )
         | 
| 195 | 
             
                            )
         | 
| 196 | 
            +
                        else:
         | 
| 197 | 
            +
                            return list(set([random.choice(labels)]))
         | 
| 198 | 
            +
             | 
| 199 | 
            +
                    dataframe["labels"] = dataframe["labels"].apply(_validate_labels)
         | 
| 200 | 
             
                    dataframe = dataframe[dataframe["labels"].notna()]
         | 
| 201 | 
             
                else:
         | 
| 202 | 
            +
             | 
| 203 | 
            +
                    def _validate_labels(x):
         | 
| 204 | 
            +
                        if isinstance(x, str) and x.lower().strip() in labels:
         | 
| 205 | 
            +
                            return x.lower().strip()
         | 
| 206 | 
            +
                        elif isinstance(x, list):
         | 
| 207 | 
            +
                            options = [
         | 
| 208 | 
            +
                                label.lower().strip()
         | 
| 209 | 
            +
                                for label in x
         | 
| 210 | 
            +
                                if isinstance(label, str) and label.lower().strip() in labels
         | 
| 211 | 
            +
                            ]
         | 
| 212 | 
            +
                            if options:
         | 
| 213 | 
            +
                                return random.choice(options)
         | 
| 214 | 
            +
                            else:
         | 
| 215 | 
            +
                                return random.choice(labels)
         | 
| 216 | 
            +
                        else:
         | 
| 217 | 
            +
                            return random.choice(labels)
         | 
| 218 | 
            +
             | 
| 219 | 
             
                    dataframe = dataframe.rename(columns={"labels": "label"})
         | 
| 220 | 
            +
                    dataframe["label"] = dataframe["label"].apply(_validate_labels)
         | 
|  | |
|  | |
|  | |
|  | |
| 221 | 
             
                dataframe = dataframe[dataframe["text"].notna()]
         | 
| 222 |  | 
| 223 | 
             
                progress(1.0, desc="Dataset created")
         | 
|  | |
| 255 | 
             
                    dataframe.reset_index(drop=True),
         | 
| 256 | 
             
                    features=features,
         | 
| 257 | 
             
                )
         | 
| 258 | 
            +
                dataset = combine_datasets(repo_id, dataset, oauth_token)
         | 
| 259 | 
             
                distiset = Distiset({"default": dataset})
         | 
| 260 | 
             
                progress(0.9, desc="Pushing dataset")
         | 
| 261 | 
             
                distiset.push_to_hub(
         | 
|  | |
| 667 |  | 
| 668 | 
             
                app.load(fn=swap_visibility, outputs=main_ui)
         | 
| 669 | 
             
                app.load(fn=get_org_dropdown, outputs=[org_name])
         | 
| 670 | 
            +
                app.load(fn=get_random_repo_name, outputs=[repo_name])
         | 
    	
src/synthetic_dataset_generator/constants.py CHANGED

@@ -7,39 +7,66 @@ import argilla as rg
 TEXTCAT_TASK = "text_classification"
 SFT_TASK = "supervised_fine_tuning"

-# …
+# Inference
+MAX_NUM_TOKENS = int(os.getenv("MAX_NUM_TOKENS", 2048))
+MAX_NUM_ROWS = int(os.getenv("MAX_NUM_ROWS", 1000))
+DEFAULT_BATCH_SIZE = int(os.getenv("DEFAULT_BATCH_SIZE", 5))
+
+# Models
+MODEL = os.getenv("MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
+TOKENIZER_ID = os.getenv(key="TOKENIZER_ID", default=None)
+OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL")
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL")
+HUGGINGFACE_BASE_URL = os.getenv("HUGGINGFACE_BASE_URL")
+VLLM_BASE_URL = os.getenv("VLLM_BASE_URL")
+
+# check if model is set correctly
+if HUGGINGFACE_BASE_URL and MODEL:
+    raise ValueError(
+        "`HUGGINGFACE_BASE_URL` and `MODEL` cannot be set at the same time. Use a model id for serverless inference and a base URL dedicated to Hugging Face Inference Endpoints."
+    )
+if not MODEL:
+    if OPENAI_BASE_URL or OLLAMA_BASE_URL or VLLM_BASE_URL:
+        raise ValueError("`MODEL` is not set. Please provide a model id for inference.")
+
+# Check if multiple base URLs are provided
+base_urls = [
+    url
+    for url in [OPENAI_BASE_URL, OLLAMA_BASE_URL, HUGGINGFACE_BASE_URL, VLLM_BASE_URL]
+    if url
+]
+if len(base_urls) > 1:
+    raise ValueError(
+        f"Multiple base URLs provided: {', '.join(base_urls)}. Only one base URL can be set at a time."
+    )
+BASE_URL = OPENAI_BASE_URL or OLLAMA_BASE_URL or HUGGINGFACE_BASE_URL or VLLM_BASE_URL
+
+
+# API Keys
 HF_TOKEN = os.getenv("HF_TOKEN")
 if not HF_TOKEN:
     raise ValueError(
         "HF_TOKEN is not set. Ensure you have set the HF_TOKEN environment variable that has access to the Hugging Face Hub repositories and Inference Endpoints."
     )

-# Inference
-MAX_NUM_TOKENS = int(os.getenv("MAX_NUM_TOKENS", 2048))
-MAX_NUM_ROWS: str | int = int(os.getenv("MAX_NUM_ROWS", 1000))
-DEFAULT_BATCH_SIZE = int(os.getenv("DEFAULT_BATCH_SIZE", 5))
-MODEL = os.getenv("MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
-BASE_URL = os.getenv("BASE_URL", default=None)
-…
 _API_KEY = os.getenv("API_KEY")
-…
-…
-…
-…
-…
-    ]
+API_KEYS = (
+    [_API_KEY]
+    if _API_KEY
+    else [HF_TOKEN] + [os.getenv(f"HF_TOKEN_{i}") for i in range(1, 10)]
+)
 API_KEYS = [token for token in API_KEYS if token]

 # Determine if SFT is available
 SFT_AVAILABLE = False
 llama_options = ["llama3", "llama-3", "llama 3"]
 qwen_options = ["qwen2", "qwen-2", "qwen 2"]
-…
+
+if passed_pre_query_template := os.getenv("MAGPIE_PRE_QUERY_TEMPLATE", "").lower():
     SFT_AVAILABLE = True
-    passed_pre_query_template …
-    if passed_pre_query_template.lower() in llama_options:
+    if passed_pre_query_template in llama_options:
         MAGPIE_PRE_QUERY_TEMPLATE = "llama3"
-    elif passed_pre_query_template …
+    elif passed_pre_query_template in qwen_options:
         MAGPIE_PRE_QUERY_TEMPLATE = "qwen2"
     else:
         MAGPIE_PRE_QUERY_TEMPLATE = passed_pre_query_template
@@ -54,12 +81,12 @@ elif MODEL.lower() in qwen_options or any(
     SFT_AVAILABLE = True
     MAGPIE_PRE_QUERY_TEMPLATE = "qwen2"

-if …
+if OPENAI_BASE_URL:
     SFT_AVAILABLE = False

 if not SFT_AVAILABLE:
     warnings.warn(
-        …
+        "`SFT_AVAILABLE` is set to `False`. Use Hugging Face Inference Endpoints or Ollama to generate chat data, provide a `TOKENIZER_ID` and `MAGPIE_PRE_QUERY_TEMPLATE`. You can also use `HUGGINGFACE_BASE_URL` to with vllm."
     )
     MAGPIE_PRE_QUERY_TEMPLATE = None

@@ -67,11 +94,12 @@ if not SFT_AVAILABLE:
 STATIC_EMBEDDING_MODEL = "minishlab/potion-base-8M"

 # Argilla
-ARGILLA_API_URL = os.getenv("ARGILLA_API_URL")
-…
-…
-…
-…
+ARGILLA_API_URL = os.getenv("ARGILLA_API_URL") or os.getenv(
+    "ARGILLA_API_URL_SDG_REVIEWER"
+)
+ARGILLA_API_KEY = os.getenv("ARGILLA_API_KEY") or os.getenv(
+    "ARGILLA_API_KEY_SDG_REVIEWER"
+)

 if not ARGILLA_API_URL or not ARGILLA_API_KEY:
     warnings.warn("ARGILLA_API_URL or ARGILLA_API_KEY is not set or is empty")
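With these constants, the deployment target is selected entirely through environment variables: at most one `*_BASE_URL` may be set, `MODEL` names the model to use, and `TOKENIZER_ID` can override the tokenizer for Magpie-style chat generation. A minimal sketch of a local Ollama configuration follows; the URL, model name, and tokenizer id are assumptions in the spirit of the scripts in `examples/`, and `launch()` is the entry point shown in the README:

```python
import os

# Assumed values for a local Ollama deployment; adjust to your setup.
os.environ["OLLAMA_BASE_URL"] = "http://127.0.0.1:11434/"
os.environ["MODEL"] = "llama3.1"  # model name as served by Ollama
os.environ["TOKENIZER_ID"] = "meta-llama/Llama-3.1-8B-Instruct"  # HF tokenizer id
os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3"  # keeps SFT_AVAILABLE True

# Import only after the variables are set: constants.py reads them at import time.
from synthetic_dataset_generator import launch

launch()
```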
    	
src/synthetic_dataset_generator/pipelines/base.py CHANGED

@@ -1,4 +1,21 @@
-…
+import math
+import random
+
+import gradio as gr
+from distilabel.llms import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
+from distilabel.steps.tasks import TextGeneration
+
+from synthetic_dataset_generator.constants import (
+    API_KEYS,
+    DEFAULT_BATCH_SIZE,
+    HUGGINGFACE_BASE_URL,
+    MAGPIE_PRE_QUERY_TEMPLATE,
+    MODEL,
+    OLLAMA_BASE_URL,
+    OPENAI_BASE_URL,
+    TOKENIZER_ID,
+    VLLM_BASE_URL,
+)

 TOKEN_INDEX = 0

@@ -8,3 +25,117 @@ def _get_next_api_key():
     api_key = API_KEYS[TOKEN_INDEX % len(API_KEYS)]
     TOKEN_INDEX += 1
     return api_key
+
+
+def _get_prompt_rewriter():
+    generation_kwargs = {
+        "temperature": 1,
+    }
+    system_prompt = "You are a prompt rewriter. You are given a prompt and you need to rewrite it keeping the same structure but highlighting different aspects of the original without adding anything new."
+    prompt_rewriter = TextGeneration(
+        llm=_get_llm(generation_kwargs=generation_kwargs),
+        system_prompt=system_prompt,
+        use_system_prompt=True,
+    )
+    prompt_rewriter.load()
+    return prompt_rewriter
+
+
+def get_rewriten_prompts(prompt: str, num_rows: int):
+    prompt_rewriter = _get_prompt_rewriter()
+    # create prompt rewrites
+    inputs = [
+        {"instruction": f"Original prompt: {prompt} \nRewritten prompt: "}
+        for i in range(math.floor(num_rows / 100))
+    ]
+    n_processed = 0
+    prompt_rewrites = [prompt]
+    while n_processed < num_rows:
+        batch = list(
+            prompt_rewriter.process(
+                inputs=inputs[n_processed : n_processed + DEFAULT_BATCH_SIZE]
+            )
+        )
+        prompt_rewrites += [entry["generation"] for entry in batch[0]]
+        n_processed += DEFAULT_BATCH_SIZE
+        random.seed(a=random.randint(0, 2**32 - 1))
+    return prompt_rewrites
+
+
+def _get_llm(use_magpie_template=False, **kwargs):
+    if OPENAI_BASE_URL:
+        llm = OpenAILLM(
+            model=MODEL,
+            base_url=OPENAI_BASE_URL,
+            api_key=_get_next_api_key(),
+            **kwargs,
+        )
+        if "generation_kwargs" in kwargs:
+            if "stop_sequences" in kwargs["generation_kwargs"]:
+                kwargs["generation_kwargs"]["stop"] = kwargs["generation_kwargs"][
+                    "stop_sequences"
+                ]
+                del kwargs["generation_kwargs"]["stop_sequences"]
+            if "do_sample" in kwargs["generation_kwargs"]:
+                del kwargs["generation_kwargs"]["do_sample"]
+    elif OLLAMA_BASE_URL:
+        if "generation_kwargs" in kwargs:
+            if "max_new_tokens" in kwargs["generation_kwargs"]:
+                kwargs["generation_kwargs"]["num_predict"] = kwargs[
+                    "generation_kwargs"
+                ]["max_new_tokens"]
+                del kwargs["generation_kwargs"]["max_new_tokens"]
+            if "stop_sequences" in kwargs["generation_kwargs"]:
+                kwargs["generation_kwargs"]["stop"] = kwargs["generation_kwargs"][
+                    "stop_sequences"
+                ]
+                del kwargs["generation_kwargs"]["stop_sequences"]
+            if "do_sample" in kwargs["generation_kwargs"]:
+                del kwargs["generation_kwargs"]["do_sample"]
+            options = kwargs["generation_kwargs"]
+            del kwargs["generation_kwargs"]
+            kwargs["generation_kwargs"] = {}
+            kwargs["generation_kwargs"]["options"] = options
+        llm = OllamaLLM(
+            model=MODEL,
+            host=OLLAMA_BASE_URL,
+            tokenizer_id=TOKENIZER_ID or MODEL,
+            **kwargs,
+        )
+    elif HUGGINGFACE_BASE_URL:
+        kwargs["generation_kwargs"]["do_sample"] = True
+        llm = InferenceEndpointsLLM(
+            api_key=_get_next_api_key(),
+            base_url=HUGGINGFACE_BASE_URL,
+            tokenizer_id=TOKENIZER_ID or MODEL,
+            **kwargs,
+        )
+    elif VLLM_BASE_URL:
+        if "generation_kwargs" in kwargs:
+            if "do_sample" in kwargs["generation_kwargs"]:
+                del kwargs["generation_kwargs"]["do_sample"]
+        llm = ClientvLLM(
+            base_url=VLLM_BASE_URL,
+            model=MODEL,
+            tokenizer=TOKENIZER_ID or MODEL,
+            api_key=_get_next_api_key(),
+            **kwargs,
+        )
+    else:
+        llm = InferenceEndpointsLLM(
+            api_key=_get_next_api_key(),
+            tokenizer_id=TOKENIZER_ID or MODEL,
+            model_id=MODEL,
+            magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
+            **kwargs,
+        )
+
+    return llm
+
+
+try:
+    llm = _get_llm()
+    llm.load()
+    llm.generate([[{"content": "Hello, world!", "role": "user"}]])
+except Exception as e:
+    gr.Error(f"Error loading {llm.__class__.__name__}: {e}")
    	
src/synthetic_dataset_generator/pipelines/chat.py CHANGED

@@ -1,4 +1,3 @@
-from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration

 from synthetic_dataset_generator.constants import (
@@ -7,7 +6,7 @@ from synthetic_dataset_generator.constants import (
     MAX_NUM_TOKENS,
     MODEL,
 )
-from synthetic_dataset_generator.pipelines.base import …
+from synthetic_dataset_generator.pipelines.base import _get_llm

 INFORMATION_SEEKING_PROMPT = (
     "You are an AI assistant designed to provide accurate and concise information on a wide"
@@ -149,18 +148,13 @@ def _get_output_mappings(num_turns):


 def get_prompt_generator():
+    generation_kwargs = {
+        "temperature": 0.8,
+        "max_new_tokens": MAX_NUM_TOKENS,
+        "do_sample": True,
+    }
     prompt_generator = TextGeneration(
-        llm=…
-            api_key=_get_next_api_key(),
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            base_url=BASE_URL,
-            generation_kwargs={
-                "temperature": 0.8,
-                "max_new_tokens": MAX_NUM_TOKENS,
-                "do_sample": True,
-            },
-        ),
+        llm=_get_llm(generation_kwargs=generation_kwargs),
         system_prompt=PROMPT_CREATION_PROMPT,
         use_system_prompt=True,
     )
@@ -172,38 +166,34 @@ def get_magpie_generator(system_prompt, num_turns, temperature, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
     if num_turns == 1:
+        generation_kwargs = {
+            "temperature": temperature,
+            "do_sample": True,
+            "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.25),
+            "stop_sequences": _STOP_SEQUENCES,
+        }
         magpie_generator = Magpie(
-            llm=…
-            …
-                tokenizer_id=MODEL,
-                base_url=BASE_URL,
-                api_key=_get_next_api_key(),
+            llm=_get_llm(
+                generation_kwargs=generation_kwargs,
                 magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
-                …
-                    "temperature": temperature,
-                    "do_sample": True,
-                    "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.25),
-                    "stop_sequences": _STOP_SEQUENCES,
-                },
+                use_magpie_template=True,
             ),
             n_turns=num_turns,
             output_mappings=output_mappings,
             only_instruction=True,
         )
     else:
+        generation_kwargs = {
+            "temperature": temperature,
+            "do_sample": True,
+            "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5),
+            "stop_sequences": _STOP_SEQUENCES,
+        }
         magpie_generator = Magpie(
-            llm=…
-            …
-                tokenizer_id=MODEL,
-                base_url=BASE_URL,
-                api_key=_get_next_api_key(),
+            llm=_get_llm(
+                generation_kwargs=generation_kwargs,
                 magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
-                …
-                    "temperature": temperature,
-                    "do_sample": True,
-                    "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5),
-                    "stop_sequences": _STOP_SEQUENCES,
-                },
+                use_magpie_template=True,
             ),
             end_with_user=True,
             n_turns=num_turns,
@@ -213,51 +203,25 @@ def get_magpie_generator(system_prompt, num_turns, temperature, is_sample):
     return magpie_generator


-def get_prompt_rewriter():
-    prompt_rewriter = TextGeneration(
-        llm=InferenceEndpointsLLM(
-            model_id=MODEL,
-            tokenizer_id=MODEL,
-            base_url=BASE_URL,
-            api_key=_get_next_api_key(),
-            generation_kwargs={
-                "temperature": 1,
-            },
-        ),
-    )
-    prompt_rewriter.load()
-    return prompt_rewriter
-
-
 def get_response_generator(system_prompt, num_turns, temperature, is_sample):
     if num_turns == 1:
+        generation_kwargs = {
+            "temperature": temperature,
+            "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5),
+        }
         response_generator = TextGeneration(
-            llm=…
-                model_id=MODEL,
-                tokenizer_id=MODEL,
-                base_url=BASE_URL,
-                api_key=_get_next_api_key(),
-                generation_kwargs={
-                    "temperature": temperature,
-                    "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5),
-                },
-            ),
+            llm=_get_llm(generation_kwargs=generation_kwargs),
             system_prompt=system_prompt,
             output_mappings={"generation": "completion"},
            input_mappings={"instruction": "prompt"},
         )
     else:
+        generation_kwargs = {
+            "temperature": temperature,
+            "max_new_tokens": MAX_NUM_TOKENS,
+        }
         response_generator = ChatGeneration(
-            llm=…
-                model_id=MODEL,
-                tokenizer_id=MODEL,
-                base_url=BASE_URL,
-                api_key=_get_next_api_key(),
-                generation_kwargs={
-                    "temperature": temperature,
-                    "max_new_tokens": MAX_NUM_TOKENS,
-                },
-            ),
+            llm=_get_llm(generation_kwargs=generation_kwargs),
             output_mappings={"generation": "completion"},
             input_mappings={"conversation": "messages"},
         )
@@ -293,7 +257,7 @@ with Pipeline(name="sft") as pipeline:
             "max_new_tokens": {MAX_NUM_TOKENS},
             "stop_sequences": {_STOP_SEQUENCES}
         }},
-        api_key=os.environ["…"],
+        api_key=os.environ["API_KEY"],
     ),
     n_turns={num_turns},
     num_rows={num_rows},
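After this refactor every generator in the chat pipeline builds its LLM the same way: assemble a `generation_kwargs` dict, then call `_get_llm`, which picks the backend from the environment and normalizes the kwargs. A hedged usage sketch of the refactored helper (prompt text and parameter values here are illustrative only, not from the repo):

```python
from synthetic_dataset_generator.pipelines.chat import get_response_generator

# Build a single-turn response generator; backend selection happens inside _get_llm.
response_generator = get_response_generator(
    system_prompt="You are a concise assistant.",  # illustrative prompt
    num_turns=1,
    temperature=0.7,
    is_sample=True,  # caps max_new_tokens at 256 for quick sampling
)
response_generator.load()

# With input_mappings={"instruction": "prompt"}, inputs are passed under "prompt";
# the output is remapped from "generation" to "completion".
batch = next(response_generator.process([{"prompt": "What does Magpie do?"}]))
print(batch[0]["completion"])
```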
    	
src/synthetic_dataset_generator/pipelines/textcat.py
CHANGED

@@ -1,7 +1,6 @@
 import random
 from typing import List
 
-from distilabel.llms import InferenceEndpointsLLM, OpenAILLM
 from distilabel.steps.tasks import (
     GenerateTextClassificationData,
     TextClassification,
@@ -9,8 +8,12 @@ from distilabel.steps.tasks import (
 )
 from pydantic import BaseModel, Field
 
-from synthetic_dataset_generator.constants import BASE_URL, MAX_NUM_TOKENS, MODEL
-from synthetic_dataset_generator.pipelines.base import _get_next_api_key
+from synthetic_dataset_generator.constants import (
+    BASE_URL,
+    MAX_NUM_TOKENS,
+    MODEL,
+)
+from synthetic_dataset_generator.pipelines.base import _get_llm
 from synthetic_dataset_generator.utils import get_preprocess_labels
 
 PROMPT_CREATION_PROMPT = """You are an AI assistant specialized in generating very precise text classification tasks for dataset creation.
@@ -69,23 +72,10 @@ def get_prompt_generator():
         "temperature": 0.8,
         "max_new_tokens": MAX_NUM_TOKENS,
     }
-    if BASE_URL:
-        llm = OpenAILLM(
-            model=MODEL,
-            base_url=BASE_URL,
-            api_key=_get_next_api_key(),
-            structured_output=structured_output,
-            generation_kwargs=generation_kwargs,
-        )
-    else:
-        generation_kwargs["do_sample"] = True
-        llm = InferenceEndpointsLLM(
-            api_key=_get_next_api_key(),
-            model_id=MODEL,
-            base_url=BASE_URL,
-            structured_output=structured_output,
-            generation_kwargs=generation_kwargs,
-        )
+    llm = _get_llm(
+        structured_output=structured_output,
+        generation_kwargs=generation_kwargs,
+    )
 
     prompt_generator = TextGeneration(
         llm=llm,
@@ -103,22 +93,7 @@ def get_textcat_generator(difficulty, clarity, temperature, is_sample):
         "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS,
         "top_p": 0.95,
     }
-    if BASE_URL:
-        llm = OpenAILLM(
-            model=MODEL,
-            base_url=BASE_URL,
-            api_key=_get_next_api_key(),
-            generation_kwargs=generation_kwargs,
-        )
-    else:
-        generation_kwargs["do_sample"] = True
-        llm = InferenceEndpointsLLM(
-            model_id=MODEL,
-            base_url=BASE_URL,
-            api_key=_get_next_api_key(),
-            generation_kwargs=generation_kwargs,
-        )
-
+    llm = _get_llm(generation_kwargs=generation_kwargs)
     textcat_generator = GenerateTextClassificationData(
         llm=llm,
         difficulty=None if difficulty == "mixed" else difficulty,
@@ -134,22 +109,7 @@ def get_labeller_generator(system_prompt, labels, multi_label):
         "temperature": 0.01,
         "max_new_tokens": MAX_NUM_TOKENS,
     }
-
-    if BASE_URL:
-        llm = OpenAILLM(
-            model=MODEL,
-            base_url=BASE_URL,
-            api_key=_get_next_api_key(),
-            generation_kwargs=generation_kwargs,
-        )
-    else:
-        llm = InferenceEndpointsLLM(
-            model_id=MODEL,
-            base_url=BASE_URL,
-            api_key=_get_next_api_key(),
-            generation_kwargs=generation_kwargs,
-        )
-
+    llm = _get_llm(generation_kwargs=generation_kwargs)
     labeller_generator = TextClassification(
         llm=llm,
         context=system_prompt,
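All three functions now delegate LLM construction to `_get_llm` from `src/synthetic_dataset_generator/pipelines/base.py`; that file also changed in this PR (+132 -1), but its new contents are not shown here. A minimal sketch of such a factory, assuming it simply centralizes the `BASE_URL` branching deleted above, with `_get_next_api_key` stubbed to read the documented environment variables:

import os

from distilabel.llms import InferenceEndpointsLLM, OpenAILLM

from synthetic_dataset_generator.constants import BASE_URL, MODEL


def _get_next_api_key():
    # Stub for illustration: the real helper may rotate over several keys.
    return os.environ.get("API_KEY") or os.environ["HF_TOKEN"]


def _get_llm(structured_output=None, generation_kwargs=None):
    generation_kwargs = dict(generation_kwargs or {})
    if BASE_URL:
        # An OpenAI-compatible endpoint (vLLM, Ollama, TGI, ...) is configured.
        return OpenAILLM(
            model=MODEL,
            base_url=BASE_URL,
            api_key=_get_next_api_key(),
            structured_output=structured_output,
            generation_kwargs=generation_kwargs,
        )
    # Default: Hugging Face Inference Endpoints, with sampling enabled.
    generation_kwargs["do_sample"] = True
    return InferenceEndpointsLLM(
        model_id=MODEL,
        api_key=_get_next_api_key(),
        structured_output=structured_output,
        generation_kwargs=generation_kwargs,
    )

With this in place, each pipeline only decides its generation parameters, and the choice of backend stays in one spot instead of being duplicated per task.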
    	
src/synthetic_dataset_generator/utils.py
CHANGED

@@ -1,4 +1,5 @@
 import json
+import uuid
 import warnings
 from typing import List, Optional, Union
 
@@ -55,6 +56,10 @@ def list_orgs(oauth_token: Union[OAuthToken, None] = None):
     return organizations
 
 
+def get_random_repo_name():
+    return f"my-distiset-{str(uuid.uuid4())[:8]}"
+
+
 def get_org_dropdown(oauth_token: Union[OAuthToken, None] = None):
    if oauth_token is not None:
        orgs = list_orgs(oauth_token)
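`get_random_repo_name` gives each push-to-Hub run a collision-resistant default dataset name: the first eight hex characters of a UUID4 keep the suggestion short while making accidental repo-name clashes unlikely. For illustration:

import uuid

def get_random_repo_name():
    return f"my-distiset-{str(uuid.uuid4())[:8]}"

print(get_random_repo_name())  # e.g. "my-distiset-3f2a9c1b"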

