badalsahani committed

Commit 287a0bc · Parent: fbbc97b

feat: chroma initial deploy

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. .dockerignore +10 -0
  2. .gitattributes +1 -35
  3. .github/ISSUE_TEMPLATE/bug_report.yaml +43 -0
  4. .github/ISSUE_TEMPLATE/config.yml +5 -0
  5. .github/ISSUE_TEMPLATE/feature_request.yaml +46 -0
  6. .github/ISSUE_TEMPLATE/installation_trouble.yaml +41 -0
  7. .github/actions/bandit-scan/Dockerfile +7 -0
  8. .github/actions/bandit-scan/action.yaml +26 -0
  9. .github/actions/bandit-scan/entrypoint.sh +13 -0
  10. .github/workflows/chroma-client-integration-test.yml +31 -0
  11. .github/workflows/chroma-cluster-test.yml +42 -0
  12. .github/workflows/chroma-coordinator-test.yaml +23 -0
  13. .github/workflows/chroma-integration-test.yml +40 -0
  14. .github/workflows/chroma-js-release.yml +42 -0
  15. .github/workflows/chroma-release-python-client.yml +58 -0
  16. .github/workflows/chroma-release.yml +179 -0
  17. .github/workflows/chroma-test.yml +65 -0
  18. .github/workflows/chroma-worker-test.yml +36 -0
  19. .github/workflows/pr-review-checklist.yml +37 -0
  20. .github/workflows/python-vuln.yaml +28 -0
  21. .gitignore +34 -0
  22. .pre-commit-config.yaml +36 -0
  23. .vscode/settings.json +131 -0
  24. Cargo.lock +0 -0
  25. Cargo.toml +5 -0
  26. DEVELOP.md +111 -0
  27. Dockerfile +39 -0
  28. LICENSE +201 -0
  29. README.md +106 -11
  30. RELEASE_PROCESS.md +22 -0
  31. Tiltfile +30 -0
  32. bandit.yaml +4 -0
  33. bin/cluster-test.sh +62 -0
  34. bin/docker_entrypoint.sh +15 -0
  35. bin/generate_cloudformation.py +198 -0
  36. bin/integration-test +75 -0
  37. bin/reset.sh +13 -0
  38. bin/templates/docker-compose.yml +21 -0
  39. bin/test-package.sh +24 -0
  40. bin/test-remote +16 -0
  41. bin/test.py +7 -0
  42. bin/version +8 -0
  43. bin/windows_upgrade_sqlite.py +20 -0
  44. chromadb/__init__.py +257 -0
  45. chromadb/api/__init__.py +596 -0
  46. chromadb/api/client.py +496 -0
  47. chromadb/api/fastapi.py +654 -0
  48. chromadb/api/models/Collection.py +633 -0
  49. chromadb/api/segment.py +914 -0
  50. chromadb/api/types.py +509 -0
.dockerignore ADDED
@@ -0,0 +1,10 @@
+ venv
+ .conda
+ .git
+ examples
+ clients
+ .hypothesis
+ __pycache__
+ .vscode
+ *.egg-info
+ .pytest_cache
.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *_pb2.py* linguist-generated
.github/ISSUE_TEMPLATE/bug_report.yaml ADDED
@@ -0,0 +1,43 @@
+ name: 🐛 Bug Report
+ description: File a bug report to help us improve Chroma
+ title: "[Bug]: "
+ labels: ["bug", "triage"]
+ # assignees:
+ #   - octocat
+ body:
+   - type: markdown
+     attributes:
+       value: |
+         Thanks for taking the time to fill out this bug report!
+   - type: textarea
+     id: what-happened
+     attributes:
+       label: What happened?
+       description: Also tell us, what did you expect to happen?
+       placeholder: Tell us what you see!
+       # value: "A bug happened!"
+     validations:
+       required: true
+   - type: textarea
+     id: versions
+     attributes:
+       label: Versions
+       description: Your Chroma, Python, and OS versions, as well as whatever else you think relevant. Check that you have [the latest Chroma](https://github.com/chroma-core/chroma/pkgs/container/chroma), as we are a fast-moving pre-v1.0 project.
+       placeholder: Chroma v0.3.22, Python 3.9.6, MacOS 12.5
+       # value: "A bug happened!"
+     validations:
+       required: true
+   - type: textarea
+     id: logs
+     attributes:
+       label: Relevant log output
+       description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+       render: shell
+   # - type: checkboxes
+   #   id: terms
+   #   attributes:
+   #     label: Code of Conduct
+   #     description: By submitting this issue, you agree to follow our [Code of Conduct](https://example.com)
+   #     options:
+   #       - label: I agree to follow this project's Code of Conduct
+   #         required: true
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,5 @@
+ blank_issues_enabled: true
+ contact_links:
+   - name: 🤷🏻‍♀️ Questions
+     url: https://discord.com/invite/MMeYNTmh3x
+     about: Interact with the Chroma community here by asking for help, discussing and more!
.github/ISSUE_TEMPLATE/feature_request.yaml ADDED
@@ -0,0 +1,46 @@
+ name: 🚀 Feature request
+ description: Suggest an idea for Chroma
+ title: "[Feature Request]: "
+ labels: ["enhancement"]
+ body:
+   - type: markdown
+     attributes:
+       value: |
+         Thanks for taking the time to request this feature!
+   - type: textarea
+     id: problem
+     attributes:
+       label: Describe the problem
+       description: Please provide a clear and concise description of the problem this feature would solve. The more information you can provide here, the better.
+       placeholder: I prefer if...
+     validations:
+       required: true
+   - type: textarea
+     id: solution
+     attributes:
+       label: Describe the proposed solution
+       description: Please provide a clear and concise description of what you would like to happen.
+       placeholder: I would like to see...
+     validations:
+       required: true
+   - type: textarea
+     id: alternatives
+     attributes:
+       label: Alternatives considered
+       description: "Please provide a clear and concise description of any alternative solutions or features you've considered."
+   - type: dropdown
+     id: importance
+     attributes:
+       label: Importance
+       description: How important is this feature to you?
+       options:
+         - nice to have
+         - would make my life easier
+         - i cannot use Chroma without it
+     validations:
+       required: true
+   - type: textarea
+     id: additional-context
+     attributes:
+       label: Additional Information
+       description: Add any other context or screenshots about the feature request here.
.github/ISSUE_TEMPLATE/installation_trouble.yaml ADDED
@@ -0,0 +1,41 @@
+ name: Installation Issue
+ description: Request for install help with Chroma
+ title: "[Install issue]: "
+ labels: ["installation trouble"]
+ body:
+   - type: markdown
+     attributes:
+       value: |
+         Thanks for taking the time to fill out this issue report!
+   - type: textarea
+     id: what-happened
+     attributes:
+       label: What happened?
+       description: Also tell us, what did you expect to happen?
+       placeholder: Tell us what you see!
+       # value: "A bug happened!"
+     validations:
+       required: true
+   - type: textarea
+     id: versions
+     attributes:
+       label: Versions
+       description: We need your Chroma, Python, and OS versions, as well as whatever else you think relevant.
+       placeholder: Chroma v0.3.14, Python 3.9.6, MacOS 12.5
+       # value: "A bug happened!"
+     validations:
+       required: true
+   - type: textarea
+     id: logs
+     attributes:
+       label: Relevant log output
+       description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+       render: shell
+   # - type: checkboxes
+   #   id: terms
+   #   attributes:
+   #     label: Code of Conduct
+   #     description: By submitting this issue, you agree to follow our [Code of Conduct](https://example.com)
+   #     options:
+   #       - label: I agree to follow this project's Code of Conduct
+   #         required: true
.github/actions/bandit-scan/Dockerfile ADDED
@@ -0,0 +1,7 @@
+ FROM python:3.10-alpine AS base-action
+
+ RUN pip3 install -U setuptools pip bandit
+
+ COPY entrypoint.sh /entrypoint.sh
+ RUN chmod +x /entrypoint.sh
+ ENTRYPOINT ["sh","/entrypoint.sh"]
.github/actions/bandit-scan/action.yaml ADDED
@@ -0,0 +1,26 @@
+ name: 'Bandit Scan'
+ description: 'This action performs a security vulnerability scan of Python code using the bandit library.'
+ inputs:
+   bandit-config:
+     description: 'Bandit configuration file'
+     required: false
+   input-dir:
+     description: 'Directory to scan'
+     required: false
+     default: '.'
+   format:
+     description: 'Output format (txt, csv, json, xml, yaml). Default: json'
+     required: false
+     default: 'json'
+   output-file:
+     description: "The report file to produce. Make sure to align your format with the file extension to avoid confusion."
+     required: false
+     default: "bandit-scan.json"
+ runs:
+   using: 'docker'
+   image: 'Dockerfile'
+   args:
+     - ${{ inputs.format }}
+     - ${{ inputs.bandit-config }}
+     - ${{ inputs.input-dir }}
+     - ${{ inputs.output-file }}
.github/actions/bandit-scan/entrypoint.sh ADDED
@@ -0,0 +1,13 @@
+ #!/bin/bash
+ CFG="-c $2"
+ if [ -z "$1" ]; then
+   echo "No path to scan provided"
+   exit 1
+ fi
+
+ if [ -z "$2" ]; then
+   CFG=""
+ fi
+
+ bandit -f "$1" ${CFG} -r "$3" -o "$4"
+ exit 0 # we want to ignore the exit code of bandit (for now)
.github/workflows/chroma-client-integration-test.yml ADDED
@@ -0,0 +1,31 @@
+ name: Chroma Client Integration Tests
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+       - '**'
+   workflow_dispatch:
+
+ jobs:
+   test:
+     timeout-minutes: 90
+     strategy:
+       matrix:
+         python: ['3.8', '3.9', '3.10', '3.11']
+         platform: [ubuntu-latest, windows-latest]
+     runs-on: ${{ matrix.platform }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+       - name: Set up Python ${{ matrix.python }}
+         uses: actions/setup-python@v4
+         with:
+           python-version: ${{ matrix.python }}
+       - name: Install test dependencies
+         run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
+       - name: Test
+         run: clients/python/integration-test.sh
.github/workflows/chroma-cluster-test.yml ADDED
@@ -0,0 +1,42 @@
+ name: Chroma Cluster Tests
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+       - '**'
+   workflow_dispatch:
+
+ jobs:
+   test:
+     strategy:
+       matrix:
+         python: ['3.8']
+         platform: ['16core-64gb-ubuntu-latest']
+         testfile: ["chromadb/test/ingest/test_producer_consumer.py",
+                    "chromadb/test/db/test_system.py",
+                    "chromadb/test/segment/distributed/test_memberlist_provider.py"]
+     runs-on: ${{ matrix.platform }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+       - name: Set up Python ${{ matrix.python }}
+         uses: actions/setup-python@v4
+         with:
+           python-version: ${{ matrix.python }}
+       - name: Install test dependencies
+         run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
+       - name: Start minikube
+         id: minikube
+         uses: medyagh/setup-minikube@latest
+         with:
+           minikube-version: latest
+           kubernetes-version: latest
+           driver: docker
+           addons: ingress, ingress-dns
+           start-args: '--profile chroma-test'
+       - name: Integration Test
+         run: bin/cluster-test.sh ${{ matrix.testfile }}
.github/workflows/chroma-coordinator-test.yaml ADDED
@@ -0,0 +1,23 @@
+ name: Chroma Coordinator Tests
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+       - '**'
+   workflow_dispatch:
+
+ jobs:
+   test:
+     strategy:
+       matrix:
+         platform: [ubuntu-latest]
+     runs-on: ${{ matrix.platform }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+       - name: Build and test
+         run: cd go/coordinator && make test
.github/workflows/chroma-integration-test.yml ADDED
@@ -0,0 +1,40 @@
+ name: Chroma Integration Tests
+
+ on:
+   push:
+     branches:
+       - main
+       - team/hypothesis-tests
+   pull_request:
+     branches:
+       - main
+       - '**'
+   workflow_dispatch:
+
+ jobs:
+   test:
+     strategy:
+       matrix:
+         python: ['3.8']
+         platform: [ubuntu-latest, windows-latest]
+         testfile: ["--ignore-glob 'chromadb/test/property/*' --ignore='chromadb/test/test_cli.py' --ignore='chromadb/test/auth/test_simple_rbac_authz.py'",
+                    "chromadb/test/property/test_add.py",
+                    "chromadb/test/test_cli.py",
+                    "chromadb/test/auth/test_simple_rbac_authz.py",
+                    "chromadb/test/property/test_collections.py",
+                    "chromadb/test/property/test_cross_version_persist.py",
+                    "chromadb/test/property/test_embeddings.py",
+                    "chromadb/test/property/test_filtering.py",
+                    "chromadb/test/property/test_persist.py"]
+     runs-on: ${{ matrix.platform }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+       - name: Set up Python ${{ matrix.python }}
+         uses: actions/setup-python@v4
+         with:
+           python-version: ${{ matrix.python }}
+       - name: Install test dependencies
+         run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
+       - name: Integration Test
+         run: bin/integration-test ${{ matrix.testfile }}
.github/workflows/chroma-js-release.yml ADDED
@@ -0,0 +1,42 @@
+ name: Chroma Release JS Client
+
+ on:
+   push:
+     tags:
+       - 'js_release_*.*.*' # Match tags in the form js_release_X.Y.Z
+       - 'js_release_alpha_*.*.*' # Match tags in the form js_release_alpha_X.Y.Z
+
+ jobs:
+   build-and-release:
+     runs-on: ubuntu-latest
+     permissions: write-all
+     steps:
+       - name: Check if tag matches the pattern
+         run: |
+           if [[ "${{ github.ref }}" =~ ^refs/tags/js_release_alpha_[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+             echo "Tag matches the pattern js_release_alpha_X.Y.Z"
+             echo "NPM_SCRIPT=release_alpha" >> "$GITHUB_ENV"
+           elif [[ "${{ github.ref }}" =~ ^refs/tags/js_release_[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+             echo "Tag matches the pattern js_release_X.Y.Z"
+             echo "NPM_SCRIPT=release" >> "$GITHUB_ENV"
+           else
+             echo "Tag does not match the release tag pattern, exiting workflow"
+             exit 1
+           fi
+       - name: Checkout
+         uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+       - name: Set up JS
+         uses: actions/setup-node@v3
+         with:
+           node-version: '16.x'
+           registry-url: 'https://registry.npmjs.org'
+       - name: Install Client Dev Dependencies
+         run: npm install
+         working-directory: ./clients/js/
+       - name: npm Test & Publish
+         run: npm run db:run && PORT=8001 npm run $NPM_SCRIPT
+         working-directory: ./clients/js/
+         env:
+           NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
.github/workflows/chroma-release-python-client.yml ADDED
@@ -0,0 +1,58 @@
+ name: Chroma Release Python Client
+
+ on:
+   push:
+     tags:
+       - '[0-9]+.[0-9]+.[0-9]+' # Match tags in the form X.Y.Z
+     branches:
+       - main
+       - hammad/thin_client
+
+ jobs:
+   check_tag:
+     runs-on: ubuntu-latest
+     outputs:
+       tag_matches: ${{ steps.check-tag.outputs.tag_matches }}
+     steps:
+       - name: Check Tag
+         id: check-tag
+         run: |
+           if [[ ${{ github.event.ref }} =~ ^refs/tags/[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+             echo "tag_matches=true" >> $GITHUB_OUTPUT
+           fi
+   build-and-release:
+     runs-on: ubuntu-latest
+     needs: check_tag
+     if: needs.check_tag.outputs.tag_matches == 'true'
+     permissions: write-all
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: '3.10'
+       - name: Install Client Dev Dependencies
+         run: python -m pip install -r ./clients/python/requirements.txt && python -m pip install -r ./clients/python/requirements_dev.txt
+       - name: Build Client
+         run: ./clients/python/build_python_thin_client.sh
+       - name: Install setuptools_scm
+         run: python -m pip install setuptools_scm
+       - name: Get Release Version
+         id: version
+         run: echo "version=$(python -m setuptools_scm)" >> $GITHUB_OUTPUT
+       - name: Get current date
+         id: builddate
+         run: echo "builddate=$(date +'%Y-%m-%dT%H:%M')" >> $GITHUB_OUTPUT
+       - name: Publish to Test PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.TEST_PYPI_PYTHON_CLIENT_PUBLISH_KEY }}
+           repository-url: https://test.pypi.org/legacy/
+       - name: Publish to PyPI
+         if: startsWith(github.ref, 'refs/tags')
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_PYTHON_CLIENT_PUBLISH_KEY }}
.github/workflows/chroma-release.yml ADDED
@@ -0,0 +1,179 @@
+ name: Chroma Release
+
+ on:
+   push:
+     tags:
+       - "*"
+     branches:
+       - main
+
+ env:
+   GHCR_IMAGE_NAME: "ghcr.io/chroma-core/chroma"
+   DOCKERHUB_IMAGE_NAME: "chromadb/chroma"
+   PLATFORMS: linux/amd64,linux/arm64 # linux/riscv64, linux/arm/v7
+
+ jobs:
+   check_tag:
+     runs-on: ubuntu-latest
+     outputs:
+       tag_matches: ${{ steps.check-tag.outputs.tag_matches }}
+     steps:
+       - name: Check Tag
+         id: check-tag
+         run: |
+           if [[ ${{ github.event.ref }} =~ ^refs/tags/[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+             echo "tag_matches=true" >> $GITHUB_OUTPUT
+           fi
+   build-and-release:
+     runs-on: ubuntu-latest
+     needs: check_tag
+     permissions: write-all
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+         with:
+           fetch-depth: 0
+       # https://github.com/docker/setup-qemu-action - for multiplatform builds
+       - name: Set up QEMU
+         uses: docker/setup-qemu-action@v2
+       # https://github.com/docker/setup-buildx-action - for multiplatform builds
+       - name: Set up Docker Buildx
+         id: buildx
+         uses: docker/setup-buildx-action@v2
+       - name: Set up Python
+         uses: actions/setup-python@v4
+         with:
+           python-version: "3.10"
+       - name: Install Client Dev Dependencies
+         run: python -m pip install -r requirements_dev.txt
+       - name: Build Client
+         run: python -m build
+       - name: Test Client Package
+         run: bin/test-package.sh dist/*.tar.gz
+       - name: Log in to the Github Container registry
+         uses: docker/login-action@v2.1.0
+         with:
+           registry: ghcr.io
+           username: ${{ github.actor }}
+           password: ${{ secrets.GITHUB_TOKEN }}
+       - name: Login to DockerHub
+         uses: docker/login-action@v2.1.0
+         with:
+           username: ${{ secrets.DOCKERHUB_USERNAME }}
+           password: ${{ secrets.DOCKERHUB_TOKEN }}
+       - name: Install setuptools_scm
+         run: python -m pip install setuptools_scm
+       - name: Get Release Version
+         id: version
+         run: echo "version=$(python -m setuptools_scm)" >> $GITHUB_OUTPUT
+       - name: Build and push prerelease Docker image
+         if: "needs.check_tag.outputs.tag_matches != 'true'"
+         uses: docker/build-push-action@v3.2.0
+         with:
+           context: .
+           platforms: ${{ env.PLATFORMS }}
+           push: true
+           tags: "${{ env.GHCR_IMAGE_NAME }}:${{ steps.version.outputs.version }},${{ env.DOCKERHUB_IMAGE_NAME }}:${{ steps.version.outputs.version }}"
+       - name: Build and push release Docker image
+         if: "needs.check_tag.outputs.tag_matches == 'true'"
+         uses: docker/build-push-action@v3.2.0
+         with:
+           context: .
+           platforms: ${{ env.PLATFORMS }}
+           push: true
+           tags: "${{ env.GHCR_IMAGE_NAME }}:${{ steps.version.outputs.version }},${{ env.DOCKERHUB_IMAGE_NAME }}:${{ steps.version.outputs.version }},${{ env.GHCR_IMAGE_NAME }}:latest,${{ env.DOCKERHUB_IMAGE_NAME }}:latest"
+       - name: Get current date
+         id: builddate
+         run: echo "builddate=$(date +'%Y-%m-%dT%H:%M')" >> $GITHUB_OUTPUT
+       - name: Publish to Test PyPI
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.TEST_PYPI_API_TOKEN }}
+           repository_url: https://test.pypi.org/legacy/
+       - name: Publish to PyPI
+         if: "needs.check_tag.outputs.tag_matches == 'true'"
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           password: ${{ secrets.PYPI_API_TOKEN }}
+       - name: Login to AWS
+         uses: aws-actions/configure-aws-credentials@v1
+         with:
+           role-to-assume: arn:aws:iam::369178033109:role/github-action-generate-cf-template
+           aws-region: us-east-1
+       - name: Generate CloudFormation template
+         id: generate-cf
+         if: "needs.check_tag.outputs.tag_matches == 'true'"
+         run: "pip install boto3 && python bin/generate_cloudformation.py"
+       - name: Release Tagged Version
+         uses: ncipollo/release-action@v1.12.0
+         if: "needs.check_tag.outputs.tag_matches == 'true'"
+         with:
+           body: |
+             Version: `${{steps.version.outputs.version}}`
+             Git ref: `${{github.ref}}`
+             Build Date: `${{steps.builddate.outputs.builddate}}`
+             PIP Package: `chroma-${{steps.version.outputs.version}}.tar.gz`
+             Github Container Registry Image: `${{ env.GHCR_IMAGE_NAME }}:${{ steps.version.outputs.version }}`
+             DockerHub Image: `${{ env.DOCKERHUB_IMAGE_NAME }}:${{ steps.version.outputs.version }}`
+           artifacts: "dist/chroma-${{steps.version.outputs.version}}.tar.gz"
+           prerelease: true
+           generateReleaseNotes: true
+       - name: Update Tag
+         uses: richardsimko/update-tag@v1.0.5
+         if: "needs.check_tag.outputs.tag_matches != 'true'"
+         with:
+           tag_name: latest
+         env:
+           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+       - name: Release Latest
+         uses: ncipollo/release-action@v1.12.0
+         if: "needs.check_tag.outputs.tag_matches != 'true'"
+         with:
+           tag: "latest"
+           name: "Latest"
+           body: |
+             Version: `${{steps.version.outputs.version}}`
+             Git ref: `${{github.ref}}`
+             Build Date: `${{steps.builddate.outputs.builddate}}`
+             PIP Package: `chroma-${{steps.version.outputs.version}}.tar.gz`
+             Github Container Registry Image: `${{ env.GHCR_IMAGE_NAME }}:${{ steps.version.outputs.version }}`
+             DockerHub Image: `${{ env.DOCKERHUB_IMAGE_NAME }}:${{ steps.version.outputs.version }}`
+           artifacts: "dist/chroma-${{steps.version.outputs.version}}.tar.gz"
+           allowUpdates: true
+           prerelease: true
+       - name: Trigger Hosted Chroma FE Release
+         uses: actions/github-script@v6
+         with:
+           github-token: ${{ secrets.HOSTED_CHROMA_WORKFLOW_DISPATCH_TOKEN }}
+           script: |
+             const result = await github.rest.actions.createWorkflowDispatch({
+               owner: 'chroma-core',
+               repo: 'hosted-chroma',
+               workflow_id: 'build-and-publish-frontend.yaml',
+               ref: 'main'
+             })
+             console.log(result)
+       - name: Trigger Hosted Chroma Coordinator Release
+         uses: actions/github-script@v6
+         with:
+           github-token: ${{ secrets.HOSTED_CHROMA_WORKFLOW_DISPATCH_TOKEN }}
+           script: |
+             const result = await github.rest.actions.createWorkflowDispatch({
+               owner: 'chroma-core',
+               repo: 'hosted-chroma',
+               workflow_id: 'build-and-deploy-coordinator.yaml',
+               ref: 'main'
+             })
+             console.log(result)
+       - name: Trigger Hosted Worker Release
+         uses: actions/github-script@v6
+         with:
+           github-token: ${{ secrets.HOSTED_CHROMA_WORKFLOW_DISPATCH_TOKEN }}
+           script: |
+             const result = await github.rest.actions.createWorkflowDispatch({
+               owner: 'chroma-core',
+               repo: 'hosted-chroma',
+               workflow_id: 'build-and-deploy-worker.yaml',
+               ref: 'main'
+             })
+             console.log(result)
.github/workflows/chroma-test.yml ADDED
@@ -0,0 +1,65 @@
+ name: Chroma Tests
+
+ on:
+   push:
+     branches:
+       - main
+       - team/hypothesis-tests
+   pull_request:
+     branches:
+       - main
+       - '**'
+   workflow_dispatch:
+
+ jobs:
+   test:
+     timeout-minutes: 90
+     strategy:
+       matrix:
+         python: ['3.8', '3.9', '3.10', '3.11']
+         platform: [ubuntu-latest, windows-latest]
+         testfile: ["--ignore-glob 'chromadb/test/property/*' --ignore-glob 'chromadb/test/stress/*' --ignore='chromadb/test/auth/test_simple_rbac_authz.py'",
+                    "chromadb/test/auth/test_simple_rbac_authz.py",
+                    "chromadb/test/property/test_add.py",
+                    "chromadb/test/property/test_collections.py",
+                    "chromadb/test/property/test_cross_version_persist.py",
+                    "chromadb/test/property/test_embeddings.py",
+                    "chromadb/test/property/test_filtering.py",
+                    "chromadb/test/property/test_persist.py"]
+     runs-on: ${{ matrix.platform }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+       - name: Set up Python ${{ matrix.python }}
+         uses: actions/setup-python@v4
+         with:
+           python-version: ${{ matrix.python }}
+       - name: Install test dependencies
+         run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
+       - name: Upgrade SQLite
+         run: python bin/windows_upgrade_sqlite.py
+         if: runner.os == 'Windows'
+       - name: Test
+         run: python -m pytest ${{ matrix.testfile }}
+   stress-test:
+     timeout-minutes: 90
+     strategy:
+       matrix:
+         python: ['3.8']
+         platform: ['16core-64gb-ubuntu-latest', '16core-64gb-windows-latest']
+         testfile: ["'chromadb/test/stress/'"]
+     runs-on: ${{ matrix.platform }}
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+       - name: Set up Python ${{ matrix.python }}
+         uses: actions/setup-python@v4
+         with:
+           python-version: ${{ matrix.python }}
+       - name: Install test dependencies
+         run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
+       - name: Upgrade SQLite
+         run: python bin/windows_upgrade_sqlite.py
+         if: runner.os == 'Windows'
+       - name: Test
+         run: python -m pytest ${{ matrix.testfile }}
.github/workflows/chroma-worker-test.yml ADDED
@@ -0,0 +1,36 @@
+ name: Chroma Worker Tests
+
+ on:
+   push:
+     branches:
+       - main
+   pull_request:
+     branches:
+       - main
+       - '**'
+   workflow_dispatch:
+
+ jobs:
+   test:
+     strategy:
+       matrix:
+         platform: [ubuntu-latest]
+     runs-on: ${{ matrix.platform }}
+     steps:
+       - name: Checkout chroma-hnswlib
+         uses: actions/checkout@v3
+         with:
+           repository: chroma-core/hnswlib
+           path: hnswlib
+       - name: Checkout
+         uses: actions/checkout@v3
+         with:
+           path: chroma
+       - name: Install Protoc
+         uses: arduino/setup-protoc@v2
+       - name: Build
+         run: cargo build --verbose
+         working-directory: chroma
+       - name: Test
+         run: cargo test --verbose
+         working-directory: chroma
.github/workflows/pr-review-checklist.yml ADDED
@@ -0,0 +1,37 @@
+ name: PR Review Checklist
+
+ on:
+   pull_request_target:
+     types:
+       - opened
+
+ jobs:
+   PR-Comment:
+     runs-on: ubuntu-latest
+     steps:
+       - name: PR Comment
+         uses: actions/github-script@v2
+         with:
+           github-token: ${{secrets.GITHUB_TOKEN}}
+           script: |
+             github.issues.createComment({
+               issue_number: ${{ github.event.number }},
+               owner: context.repo.owner,
+               repo: context.repo.repo,
+               body: `# Reviewer Checklist
+             Please leverage this checklist to ensure your code review is thorough before approving.
+             ## Testing, Bugs, Errors, Logs, Documentation
+             - [ ] Can you think of any use case in which the code does not behave as intended? Have they been tested?
+             - [ ] Can you think of any inputs or external events that could break the code? Is user input validated and safe? Have they been tested?
+             - [ ] If appropriate, are there adequate property based tests?
+             - [ ] If appropriate, are there adequate unit tests?
+             - [ ] Should any logging, debugging, tracing information be added or removed?
+             - [ ] Are error messages user-friendly?
+             - [ ] Have all documentation changes needed been made?
+             - [ ] Have all non-obvious changes been commented?
+             ## System Compatibility
+             - [ ] Are there any potential impacts on other parts of the system or backward compatibility?
+             - [ ] Does this change intersect with any items on our roadmap, and if so, is there a plan for fitting them together?
+             ## Quality
+             - [ ] Is this code of an unexpectedly high quality? (Readability, Modularity, Intuitiveness)`
+             })
.github/workflows/python-vuln.yaml ADDED
@@ -0,0 +1,28 @@
+ name: Python Vulnerability Scan
+ on:
+   push:
+     branches:
+       - '*'
+       - '*/**'
+     paths:
+       - chromadb/**
+       - clients/python/**
+   workflow_dispatch:
+ jobs:
+   bandit-scan:
+     runs-on: ubuntu-latest
+     steps:
+       - name: Checkout
+         uses: actions/checkout@v3
+       - uses: ./.github/actions/bandit-scan/
+         with:
+           input-dir: '.'
+           format: 'json'
+           bandit-config: 'bandit.yaml'
+           output-file: 'bandit-report.json'
+       - name: Upload Bandit Report
+         uses: actions/upload-artifact@v3
+         with:
+           name: bandit-artifact
+           path: |
+             bandit-report.json
.gitignore ADDED
@@ -0,0 +1,34 @@
+ # ignore mac created DS_Store files
+ **/.DS_Store
+
+ **/__pycache__
+
+ go/coordinator/bin/
+ go/coordinator/**/testdata/
+
+ *.log
+
+ **/data__nogit
+
+ **/.ipynb_checkpoints
+
+ index_data
+
+ # Default configuration for persist_directory in chromadb/config.py
+ # Currently it's located in "./chroma/"
+ chroma/
+ chroma_test_data/
+ server.htpasswd
+
+ .venv
+ venv
+ .env
+ .chroma
+ *.egg-info
+ dist
+
+ .terraform/
+ .terraform.lock.hcl
+ terraform.tfstate
+ .hypothesis/
+ .idea
.pre-commit-config.yaml ADDED
@@ -0,0 +1,36 @@
+ exclude: 'chromadb/proto/(chroma_pb2|coordinator_pb2)\.(py|pyi|py_grpc\.py)' # Generated files
+ repos:
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.5.0
+     hooks:
+       - id: trailing-whitespace
+       - id: mixed-line-ending
+       - id: end-of-file-fixer
+       - id: requirements-txt-fixer
+       - id: check-yaml
+         args: ["--allow-multiple-documents"]
+       - id: check-xml
+       - id: check-merge-conflict
+       - id: check-case-conflict
+       - id: check-docstring-first
+
+   - repo: https://github.com/psf/black
+     # https://github.com/psf/black/issues/2493
+     rev: "refs/tags/23.3.0:refs/tags/23.3.0"
+     hooks:
+       - id: black
+
+   - repo: https://github.com/PyCQA/flake8
+     rev: 6.1.0
+     hooks:
+       - id: flake8
+         args:
+           - "--extend-ignore=E203,E501,E503"
+           - "--max-line-length=88"
+
+   - repo: https://github.com/pre-commit/mirrors-mypy
+     rev: "v1.2.0"
+     hooks:
+       - id: mypy
+         args: [--strict, --ignore-missing-imports, --follow-imports=silent, --disable-error-code=type-abstract, --config-file=./pyproject.toml]
+         additional_dependencies: ["types-requests", "pydantic", "overrides", "hypothesis", "pytest", "pypika", "numpy", "types-protobuf", "kubernetes"]
.vscode/settings.json ADDED
@@ -0,0 +1,131 @@
+ {
+     "git.ignoreLimitWarning": true,
+     "editor.rulers": [
+         88
+     ],
+     "editor.formatOnSave": true,
+     "python.formatting.provider": "black",
+     "files.exclude": {
+         "**/__pycache__": true,
+         "**/.ipynb_checkpoints": true,
+         "**/.pytest_cache": true,
+         "**/chroma.egg-info": true
+     },
+     "python.analysis.typeCheckingMode": "basic",
+     "python.linting.flake8Enabled": true,
+     "python.linting.enabled": true,
+     "python.linting.flake8Args": [
+         "--extend-ignore=E203",
+         "--extend-ignore=E501",
+         "--extend-ignore=E503",
+         "--max-line-length=88"
+     ],
+     "python.testing.pytestArgs": [
+         "."
+     ],
+     "python.testing.unittestEnabled": false,
+     "python.testing.pytestEnabled": true,
+     "editor.formatOnPaste": true,
+     "python.linting.mypyEnabled": true,
+     "python.linting.mypyCategorySeverity.note": "Error",
+     "python.linting.mypyArgs": [
+         "--follow-imports=silent",
+         "--ignore-missing-imports",
+         "--show-column-numbers",
+         "--no-pretty",
+         "--strict",
+         "--disable-error-code=type-abstract"
+     ],
+     "protoc": {
+         "options": [
+             "--proto_path=idl/",
+         ]
+     },
+     "rust-analyzer.cargo.buildScripts.enable": true,
+     "files.associations": {
+         "fstream": "cpp",
+         "iosfwd": "cpp",
+         "__hash_table": "cpp",
+         "__locale": "cpp",
+         "atomic": "cpp",
+         "deque": "cpp",
+         "filesystem": "cpp",
+         "future": "cpp",
+         "locale": "cpp",
+         "random": "cpp",
+         "regex": "cpp",
+         "string": "cpp",
+         "tuple": "cpp",
+         "type_traits": "cpp",
+         "unordered_map": "cpp",
+         "valarray": "cpp",
+         "variant": "cpp",
+         "vector": "cpp",
+         "__string": "cpp",
+         "istream": "cpp",
+         "memory": "cpp",
+         "optional": "cpp",
+         "string_view": "cpp",
+         "__bit_reference": "cpp",
+         "__bits": "cpp",
+         "__config": "cpp",
+         "__debug": "cpp",
+         "__errc": "cpp",
+         "__mutex_base": "cpp",
+         "__node_handle": "cpp",
+         "__nullptr": "cpp",
+         "__split_buffer": "cpp",
+         "__threading_support": "cpp",
+         "__tree": "cpp",
+         "__tuple": "cpp",
+         "array": "cpp",
+         "bit": "cpp",
+         "bitset": "cpp",
+         "cctype": "cpp",
+         "charconv": "cpp",
+         "chrono": "cpp",
+         "cinttypes": "cpp",
+         "clocale": "cpp",
+         "cmath": "cpp",
+         "compare": "cpp",
+         "complex": "cpp",
+         "concepts": "cpp",
+         "condition_variable": "cpp",
+         "csignal": "cpp",
+         "cstdarg": "cpp",
+         "cstddef": "cpp",
+         "cstdint": "cpp",
+         "cstdio": "cpp",
+         "cstdlib": "cpp",
+         "cstring": "cpp",
+         "ctime": "cpp",
+         "cwchar": "cpp",
+         "cwctype": "cpp",
+         "exception": "cpp",
+         "format": "cpp",
+         "forward_list": "cpp",
+         "initializer_list": "cpp",
+         "iomanip": "cpp",
+         "ios": "cpp",
+         "iostream": "cpp",
+         "limits": "cpp",
+         "list": "cpp",
+         "map": "cpp",
+         "mutex": "cpp",
+         "new": "cpp",
+         "numeric": "cpp",
+         "ostream": "cpp",
+         "queue": "cpp",
+         "ratio": "cpp",
+         "set": "cpp",
+         "sstream": "cpp",
+         "stack": "cpp",
+         "stdexcept": "cpp",
+         "streambuf": "cpp",
+         "system_error": "cpp",
+         "typeindex": "cpp",
+         "typeinfo": "cpp",
+         "unordered_set": "cpp",
+         "algorithm": "cpp"
+     },
+ }
Cargo.lock ADDED
The diff for this file is too large to render.
 
Cargo.toml ADDED
@@ -0,0 +1,5 @@
+ [workspace]
+
+ members = [
+     "rust/worker/"
+ ]
DEVELOP.md ADDED
@@ -0,0 +1,111 @@
+ # Development Instructions
+
+ This project uses the testing, build and release standards specified
+ by the PyPA organization and documented at
+ https://packaging.python.org.
+
+ ## Setup
+
+ Because of the dependencies it relies on (like `pytorch`), this project does not support Python version >3.10.0.
+
+ Set up a virtual environment and install the project's requirements
+ and dev requirements:
+
+ ```
+ python3 -m venv venv # Only need to do this once
+ source venv/bin/activate # Do this each time you use a new shell for the project
+ pip install -r requirements.txt
+ pip install -r requirements_dev.txt
+ pre-commit install # install the precommit hooks
+ ```
+
+ You can also install the `chromadb` `pypi` package locally in editable mode with `pip install -e .`.
+
+ ## Running Chroma
+
+ Chroma can be run in 3 modes:
+ 1. Standalone and in-memory:
+ ```python
+ import chromadb
+ api = chromadb.Client()
+ print(api.heartbeat())
+ ```
+
+ 2. Standalone and in-memory with persistence:
+
+ By default this saves your db and your indexes to a `.chroma` directory and can also load from them.
+ ```python
+ import chromadb
+ api = chromadb.PersistentClient(path="/path/to/persist/directory")
+ print(api.heartbeat())
+ ```
+
+ 3. With a persistent backend and a small frontend client:
+
+ Run `chroma run --path /chroma_db_path`
+ ```python
+ import chromadb
+ api = chromadb.HttpClient(host="localhost", port="8000")
+
+ print(api.heartbeat())
+ ```
+
+ ## Local dev setup for distributed chroma
+
+ We use [Tilt](https://docs.tilt.dev/), an open-source tool, to provide the local dev setup.
+
+ ##### Requirements
+ - Docker
+ - Local Kubernetes cluster (Recommended: [OrbStack](https://orbstack.dev/) for mac, [Kind](https://kind.sigs.k8s.io/) for linux)
+ - [Tilt](https://docs.tilt.dev/)
+
+ To start distributed Chroma in the workspace, use `tilt up`. It will create all the required resources and build the necessary Docker image in the current kubectl context.
+ Once done, it will expose Chroma on port 8000. You can also visit the Tilt dashboard UI at http://localhost:10350/. To clean up and remove all the resources created by Tilt, use `tilt down`.
+
+ ## Testing
+
+ Unit tests are in the `/chromadb/test` directory.
+
+ To run unit tests using your current environment, run `pytest`.
+
+ ## Manual Build
+
+ To manually build a distribution, run `python -m build`.
+
+ The project's source and wheel distributions will be placed in the `dist` directory.
+
+ ## Manual Release
+
+ Not yet implemented.
+
+ ## Versioning
+
+ This project uses PyPA's `setuptools_scm` module to determine the
+ version number for build artifacts, meaning the version number is
+ derived from Git rather than hardcoded in the repository. For full
+ details, see the
+ [documentation for setuptools_scm](https://github.com/pypa/setuptools_scm/).
+
+ In brief, version numbers are generated as follows:
+
+ - If the current git head is tagged, the version number is exactly the
+   tag (e.g., `0.0.1`).
+ - If the current git head is a clean checkout, but is not tagged,
+   the version number is a patch version increment of the most recent
+   tag, plus `devN` where N is the number of commits since the most
+   recent tag. For example, if there have been 5 commits since the
+   `0.0.1` tag, the generated version will be `0.0.2-dev5`.
+ - If the current head is not a clean checkout, a `+dirty` local
+   version will be appended to the version number. For example,
+   `0.0.2-dev5+dirty`.
+
+ At any point, you can manually run `python -m setuptools_scm` to see
+ what version would be assigned given your current state.
+
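The same check is also available from Python. A minimal sketch, assuming `setuptools_scm` is installed (it is in `requirements_dev.txt` context above) and that you run it from the repository root:

```python
# Minimal sketch: ask setuptools_scm for the Git-derived version from Python.
# Equivalent to running `python -m setuptools_scm` on the command line.
from setuptools_scm import get_version

version = get_version()  # inspects the current git checkout
print(version)  # e.g. "0.0.2.dev5" on a clean, untagged checkout
```

Note that PEP 440 normalization may render the dev suffix as `.dev5` rather than `-dev5`.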
+ ## Continuous Integration
+
+ This project uses Github Actions to run unit tests automatically upon
+ every commit to the main branch. See the documentation for Github
+ Actions and the flow definitions in `.github/workflows` for details.
+
+ ## Continuous Delivery
+
+ Not yet implemented.
Dockerfile ADDED
@@ -0,0 +1,39 @@
+ FROM python:3.11-slim-bookworm AS builder
+ ARG REBUILD_HNSWLIB
+ RUN apt-get update --fix-missing && apt-get install -y --fix-missing \
+     build-essential \
+     gcc \
+     g++ \
+     cmake \
+     autoconf && \
+     rm -rf /var/lib/apt/lists/* && \
+     mkdir /install
+
+ WORKDIR /install
+
+ COPY ./requirements.txt requirements.txt
+
+ RUN pip install --no-cache-dir --upgrade --prefix="/install" -r requirements.txt
+ RUN if [ "$REBUILD_HNSWLIB" = "true" ]; then pip install --no-binary :all: --force-reinstall --no-cache-dir --prefix="/install" chroma-hnswlib; fi
+
+ FROM python:3.11-slim-bookworm AS final
+
+ RUN mkdir /chroma
+ WORKDIR /chroma
+
+ COPY --from=builder /install /usr/local
+ COPY ./bin/docker_entrypoint.sh /docker_entrypoint.sh
+ COPY ./ /chroma
+
+ RUN chmod +x /docker_entrypoint.sh
+
+ ENV CHROMA_HOST_ADDR "0.0.0.0"
+ ENV CHROMA_HOST_PORT 7860
+ ENV CHROMA_WORKERS 1
+ ENV CHROMA_LOG_CONFIG "chromadb/log_config.yml"
+ ENV CHROMA_TIMEOUT_KEEP_ALIVE 30
+
+ EXPOSE 7860
+
+ ENTRYPOINT ["/docker_entrypoint.sh"]
+ CMD [ "--workers ${CHROMA_WORKERS} --host ${CHROMA_HOST_ADDR} --port ${CHROMA_HOST_PORT} --proxy-headers --log-config ${CHROMA_LOG_CONFIG} --timeout-keep-alive ${CHROMA_TIMEOUT_KEEP_ALIVE}"]
LICENSE ADDED
@@ -0,0 +1,201 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
README.md CHANGED
@@ -1,11 +1,106 @@
1
- ---
2
- title: Chroma
3
- emoji: 🏢
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <a href="https://trychroma.com"><img src="https://user-images.githubusercontent.com/891664/227103090-6624bf7d-9524-4e05-9d2c-c28d5d451481.png" alt="Chroma logo"></a>
3
+ </p>
4
+
5
+ <p align="center">
6
+ <b>Chroma - the open-source embedding database</b>. <br />
7
+ The fastest way to build Python or JavaScript LLM apps with memory!
8
+ </p>
9
+
10
+ <p align="center">
11
+ <a href="https://discord.gg/MMeYNTmh3x" target="_blank">
12
+ <img src="https://img.shields.io/discord/1073293645303795742" alt="Discord">
13
+ </a> |
14
+ <a href="https://github.com/chroma-core/chroma/blob/master/LICENSE" target="_blank">
15
+ <img src="https://img.shields.io/static/v1?label=license&message=Apache 2.0&color=white" alt="License">
16
+ </a> |
17
+ <a href="https://docs.trychroma.com/" target="_blank">
18
+ Docs
19
+ </a> |
20
+ <a href="https://www.trychroma.com/" target="_blank">
21
+ Homepage
22
+ </a>
23
+ </p>
24
+
25
+
26
+ <p align="center">
27
+ <a href="https://github.com/chroma-core/chroma/actions/workflows/chroma-integration-test.yml" target="_blank">
28
+ <img src="https://github.com/chroma-core/chroma/actions/workflows/chroma-integration-test.yml/badge.svg?branch=main" alt="Integration Tests">
29
+ </a> |
30
+ <a href="https://github.com/chroma-core/chroma/actions/workflows/chroma-test.yml" target="_blank">
31
+ <img src="https://github.com/chroma-core/chroma/actions/workflows/chroma-test.yml/badge.svg?branch=main" alt="Tests">
32
+ </a>
33
+ </p>
34
+
35
+ ```bash
36
+ pip install chromadb # python client
37
+ # for javascript, npm install chromadb!
38
+ # for client-server mode, chroma run --path /chroma_db_path
39
+ ```
40
+
41
+ The core API is only 4 functions (run our [💡 Google Colab](https://colab.research.google.com/drive/1QEzFyqnoFxq7LUGyP1vzR4iLt9PpCDXv?usp=sharing) or [Replit template](https://replit.com/@swyx/BasicChromaStarter?v=1)):
42
+
43
+ ```python
44
+ import chromadb
45
+ # setup Chroma in-memory, for easy prototyping. Can add persistence easily!
46
+ client = chromadb.Client()
47
+
48
+ # Create collection. get_collection, get_or_create_collection, delete_collection also available!
49
+ collection = client.create_collection("all-my-documents")
50
+
51
+ # Add docs to the collection. Can also update and delete. Row-based API coming soon!
52
+ collection.add(
53
+ documents=["This is document1", "This is document2"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
54
+ metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
55
+ ids=["doc1", "doc2"], # unique for each doc
56
+ )
57
+
58
+ # Query/search the 2 most similar results. You can also .get by id
59
+ results = collection.query(
60
+ query_texts=["This is a query document"],
61
+ n_results=2,
62
+ # where={"metadata_field": "is_equal_to_this"}, # optional filter
63
+ # where_document={"$contains":"search_string"} # optional filter
64
+ )
65
+ ```
66
+
67
+ ## Features
68
+ - __Simple__: Fully-typed, fully-tested, fully-documented == happiness
69
+ - __Integrations__: [`🦜️🔗 LangChain`](https://blog.langchain.dev/langchain-chroma/) (python and js), [`🦙 LlamaIndex`](https://twitter.com/atroyn/status/1628557389762007040) and more soon
70
+ - __Dev, Test, Prod__: the same API that runs in your python notebook, scales to your cluster
71
+ - __Feature-rich__: Queries, filtering, density estimation and more
72
+ - __Free & Open Source__: Apache 2.0 Licensed
73
+
74
+ ## Use case: ChatGPT for ______
75
+
76
+ For example, the `"Chat your data"` use case (a minimal sketch follows these steps):
77
+ 1. Add documents to your database. You can pass in your own embeddings, embedding function, or let Chroma embed them for you.
78
+ 2. Query relevant documents with natural language.
79
+ 3. Compose documents into the context window of an LLM like `GPT3` for additional summarization or analysis.
80
+
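+ A minimal sketch of these steps (illustrative only; the collection name, example documents, and prompt wiring below are assumptions, not part of Chroma's API):
+
+ ```python
+ import chromadb
+
+ client = chromadb.Client()
+ notes = client.create_collection("my-notes")  # hypothetical collection name
+
+ # 1. Add documents - Chroma embeds them with the default embedding function
+ notes.add(documents=["Chroma releases ship on Mondays."], ids=["note1"])
+
+ # 2. Query with natural language
+ results = notes.query(query_texts=["When do releases happen?"], n_results=1)
+
+ # 3. Compose the retrieved documents into an LLM prompt (the LLM call is omitted)
+ context = "\n".join(results["documents"][0])
+ prompt = f"Answer using this context:\n{context}\n\nQ: When do releases happen?"
+ ```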
81
+ ## Embeddings?
82
+
83
+ What are embeddings?
84
+
85
+ - [Read the guide from OpenAI](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)
86
+ - __Literal__: Embedding something turns it from image/text/audio into a list of numbers. 🖼️ or 📄 => `[1.2, 2.1, ....]`. This process makes documents "understandable" to a machine learning model.
87
+ - __By analogy__: An embedding represents the essence of a document. This enables documents and queries with the same essence to be "near" each other and therefore easy to find.
88
+ - __Technical__: An embedding is the latent-space position of a document at a layer of a deep neural network. For models trained specifically to embed data, this is the last layer.
89
+ - __A small example__: Suppose you search your photos for "famous bridge in San Francisco". By embedding this query and comparing it to the embeddings of your photos and their metadata, Chroma should return photos of the Golden Gate Bridge.
90
+
91
+ Embedding databases (also known as **vector databases**) store embeddings and allow you to search by nearest neighbors rather than by substrings like a traditional database. By default, Chroma uses [Sentence Transformers](https://docs.trychroma.com/embeddings#sentence-transformers) to embed for you, but you can also use OpenAI embeddings, Cohere (multilingual) embeddings, or your own.
92
+
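+ If you'd rather supply your own vectors, they can be passed to Chroma directly. A minimal sketch (the toy 3-dimensional vectors are purely illustrative; real embedding models produce much longer vectors):
+
+ ```python
+ import chromadb
+
+ client = chromadb.Client()
+ collection = client.create_collection("custom-embeddings")
+
+ # Pre-computed embeddings are accepted alongside documents and ids
+ collection.add(
+     ids=["a", "b"],
+     documents=["first doc", "second doc"],
+     embeddings=[[0.1, 0.2, 0.3], [0.2, 0.1, 0.0]],
+ )
+
+ # Query by embedding instead of by text
+ results = collection.query(query_embeddings=[[0.1, 0.2, 0.25]], n_results=1)
+ ```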
93
+ ## Get involved
94
+
95
+ Chroma is a rapidly developing project. We welcome PR contributors and ideas for how to improve the project.
96
+ - [Join the conversation on Discord](https://discord.gg/MMeYNTmh3x) - `#contributing` channel
97
+ - [Review the 🛣️ Roadmap and contribute your ideas](https://docs.trychroma.com/roadmap)
98
+ - [Grab an issue and open a PR](https://github.com/chroma-core/chroma/issues) - [`Good first issue tag`](https://github.com/chroma-core/chroma/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
99
+ - [Read our contributing guide](https://docs.trychroma.com/contributing)
100
+
101
+ **Release Cadence**
102
+ We currently release new tagged versions of the `pypi` and `npm` packages on Mondays. Hotfixes go out at any time during the week.
103
+
104
+ ## License
105
+
106
+ [Apache 2.0](./LICENSE)
RELEASE_PROCESS.md ADDED
@@ -0,0 +1,22 @@
1
+ ## Release Process
2
+
3
+ This guide covers how to release Chroma to PyPI.
4
+
5
+ #### Increase the version number
6
+ 1. Create a new PR for the release that upgrades the version in code. Name it `release/A.B.C`. In [this file](https://github.com/chroma-core/chroma/blob/main/chromadb/__init__.py), update `__version__`.
7
+ ```
8
+ __version__ = "A.B.C"
9
+ ```
10
+ 2. Add the "release" label to this PR
11
+ 3. Once the PR is merged, tag your commit SHA with the release version
12
+ ```
13
+ git tag A.B.C <SHA>
14
+ ```
15
+ 4. Wait for the `chroma release` and `chroma client release` GitHub Actions on `main` to go green before proceeding. Skipping this wait can cause a race condition.
16
+
17
+ #### Perform the release
18
+ 1. Push your tag to origin to create the release
19
+ ```
20
+ git push origin A.B.C
21
+ ```
22
+ 2. This will trigger a GitHub Action that performs the release.
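+
+ For example, releasing a hypothetical version `0.4.23` end to end (substitute the real version and commit SHA):
+ ```
+ git tag 0.4.23 <SHA>
+ git push origin 0.4.23
+ ```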
Tiltfile ADDED
@@ -0,0 +1,30 @@
1
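+ # Illustrative usage note: run `tilt up` from the repo root to build these
+ # images and deploy the services below to the local Kubernetes cluster.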
+ docker_build('coordinator',
2
+ context='.',
3
+ dockerfile='./go/coordinator/Dockerfile'
4
+ )
5
+
6
+ docker_build('server',
7
+ context='.',
8
+ dockerfile='./Dockerfile',
9
+ )
10
+
11
+ docker_build('worker',
12
+ context='.',
13
+ dockerfile='./rust/worker/Dockerfile'
14
+ )
15
+
16
+
17
+ k8s_yaml(['k8s/dev/setup.yaml'])
18
+ k8s_resource(
19
+ objects=['chroma:Namespace', 'memberlist-reader:ClusterRole', 'memberlist-reader:ClusterRoleBinding', 'pod-list-role:Role', 'pod-list-role-binding:RoleBinding', 'memberlists.chroma.cluster:CustomResourceDefinition','worker-memberlist:MemberList'],
20
+ new_name='k8s_setup',
21
+ labels=["infrastructure"]
22
+ )
23
+ k8s_yaml(['k8s/dev/pulsar.yaml'])
24
+ k8s_resource('pulsar', resource_deps=['k8s_setup'], labels=["infrastructure"])
25
+ k8s_yaml(['k8s/dev/server.yaml'])
26
+ k8s_resource('server', resource_deps=['k8s_setup'],labels=["chroma"], port_forwards=8000 )
27
+ k8s_yaml(['k8s/dev/coordinator.yaml'])
28
+ k8s_resource('coordinator', resource_deps=['pulsar', 'server'], labels=["chroma"])
29
+ k8s_yaml(['k8s/dev/worker.yaml'])
30
+ k8s_resource('worker', resource_deps=['coordinator'],labels=["chroma"])
bandit.yaml ADDED
@@ -0,0 +1,4 @@
1
+ # FILE: bandit.yaml
2
+ exclude_dirs: [ 'chromadb/test', 'bin', 'build', '.git', '.venv', 'venv', 'env', '.github', 'examples', 'clients/js', '.vscode' ]
3
+ tests: [ ]
4
+ skips: [ ]
bin/cluster-test.sh ADDED
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env bash
2
+
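+ # Spins up a throwaway minikube cluster, builds and deploys the Chroma services,
+ # runs the cluster tests against it, and tears everything down on exit.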
3
+ set -e
4
+
5
+ function cleanup {
6
+ # Restore the previous kube context
7
+ kubectl config use-context $PREV_CHROMA_KUBE_CONTEXT
8
+ # Kill the tunnel process
9
+ kill $TUNNEL_PID
10
+ minikube delete -p chroma-test
11
+ }
12
+
13
+ trap cleanup EXIT
14
+
15
+ # Save the current kube context into a variable
16
+ export PREV_CHROMA_KUBE_CONTEXT=$(kubectl config current-context)
17
+
18
+ # Create a new minikube cluster for the test
19
+ minikube start -p chroma-test
20
+
21
+ # Add the ingress addon to the cluster
22
+ minikube addons enable ingress -p chroma-test
23
+ minikube addons enable ingress-dns -p chroma-test
24
+
25
+ # Setup docker to build inside the minikube cluster and build the image
26
+ eval $(minikube -p chroma-test docker-env)
27
+ docker build -t server:latest -f Dockerfile .
28
+ docker build -t chroma-coordinator:latest -f go/coordinator/Dockerfile .
29
+ docker build -t worker -f rust/worker/Dockerfile . --build-arg CHROMA_KUBERNETES_INTEGRATION=1
30
+
31
+ # Apply the kubernetes manifests
32
+ kubectl apply -f k8s/deployment
33
+ kubectl apply -f k8s/crd
34
+ kubectl apply -f k8s/cr
35
+ kubectl apply -f k8s/test
36
+
37
+ # Wait for the pods in the chroma namespace to be ready
38
+ kubectl wait --namespace chroma --for=condition=Ready pods --all --timeout=400s
39
+
40
+ # Run mini kube tunnel in the background to expose the service
41
+ minikube tunnel -c true -p chroma-test &
42
+ TUNNEL_PID=$!
43
+
44
+ # Wait for the tunnel to be ready. There isn't an easy way to check readiness, so we just wait 10 seconds.
45
+ sleep 10
46
+
47
+ export CHROMA_CLUSTER_TEST_ONLY=1
48
+ export CHROMA_SERVER_HOST=$(kubectl get svc server -n chroma -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
49
+ export PULSAR_BROKER_URL=$(kubectl get svc pulsar-lb -n chroma -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
50
+ export CHROMA_COORDINATOR_HOST=$(kubectl get svc coordinator-lb -n chroma -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
51
+ export CHROMA_SERVER_GRPC_PORT="50051"
52
+
53
+ echo "Chroma server host: $CHROMA_SERVER_HOST"
54
+ echo "Pulsar broker host: $PULSAR_BROKER_URL"
55
+ echo "Chroma coordinator host: $CHROMA_COORDINATOR_HOST"
56
+
57
+ echo testing: python -m pytest "$@"
58
+ python -m pytest "$@"
59
+
60
+ export CHROMA_KUBERNETES_INTEGRATION=1
61
+ cd go/coordinator
62
+ go test -timeout 30s -run ^TestNodeWatcher$ github.com/chroma/chroma-coordinator/internal/memberlist_manager
bin/docker_entrypoint.sh ADDED
@@ -0,0 +1,15 @@
1
+ #!/bin/bash
2
+ set -e
3
+
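+ # If the container args already start with "uvicorn", run them as-is (with a
+ # deprecation warning); otherwise treat the args as options for
+ # "uvicorn chromadb.app:app".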
4
+ export IS_PERSISTENT=1
5
+ export CHROMA_SERVER_NOFILE=65535
6
+ args="$@"
7
+
8
+ if [[ $args =~ ^uvicorn.* ]]; then
9
+ echo "Starting server with args: $(eval echo "$args")"
10
+ echo -e "\033[31mWARNING: Please remove 'uvicorn chromadb.app:app' from your command line arguments. This is now handled by the entrypoint script."
11
+ exec $(eval echo "$args")
12
+ else
13
+ echo "Starting 'uvicorn chromadb.app:app' with args: $(eval echo "$args")"
14
+ exec uvicorn chromadb.app:app $(eval echo "$args")
15
+ fi
bin/generate_cloudformation.py ADDED
@@ -0,0 +1,198 @@
1
+ import boto3
2
+ import json
3
+ import subprocess
4
+ import os
5
+ import re
6
+
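+ # Builds a CloudFormation template that runs Chroma on a single EC2 instance
+ # via docker-compose, then uploads the template to S3 (and to /latest when the
+ # version is a tagged release).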
7
+
8
+ def b64text(txt):
9
+ """Generate Base 64 encoded CF json for a multiline string, subbing in values where appropriate"""
10
+ lines = []
11
+ for line in txt.splitlines(True):
12
+ if "${" in line:
13
+ lines.append({"Fn::Sub": line})
14
+ else:
15
+ lines.append(line)
16
+ return {"Fn::Base64": {"Fn::Join": ["", lines]}}
17
+
18
+
19
+ path = os.path.dirname(os.path.realpath(__file__))
20
+ version = subprocess.check_output(f"{path}/version").decode("ascii").strip()
21
+
22
+ with open(f"{path}/templates/docker-compose.yml") as f:
23
+ docker_compose_file = str(f.read())
24
+
25
+
26
+ cloud_config_script = """
27
+ #cloud-config
28
+ cloud_final_modules:
29
+ - [scripts-user, always]
30
+ """
31
+
32
+ cloud_init_script = f"""
33
+ #!/bin/bash
34
+ amazon-linux-extras install docker
35
+ usermod -a -G docker ec2-user
36
+ curl -L https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
37
+ chmod +x /usr/local/bin/docker-compose
38
+ ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose
39
+ systemctl enable docker
40
+ systemctl start docker
41
+
42
+ cat << EOF > /home/ec2-user/docker-compose.yml
43
+ {docker_compose_file}
44
+ EOF
45
+
46
+ mkdir /home/ec2-user/config
47
+
48
+ docker-compose -f /home/ec2-user/docker-compose.yml up -d
49
+ """
50
+
51
+ userdata = f"""Content-Type: multipart/mixed; boundary="//"
52
+ MIME-Version: 1.0
53
+
54
+ --//
55
+ Content-Type: text/cloud-config; charset="us-ascii"
56
+ MIME-Version: 1.0
57
+ Content-Transfer-Encoding: 7bit
58
+ Content-Disposition: attachment; filename="cloud-config.txt"
59
+
60
+ {cloud_config_script}
61
+
62
+ --//
63
+ Content-Type: text/x-shellscript; charset="us-ascii"
64
+ MIME-Version: 1.0
65
+ Content-Transfer-Encoding: 7bit
66
+ Content-Disposition: attachment; filename="userdata.txt"
67
+
68
+ {cloud_init_script}
69
+ --//--
70
+ """
71
+
72
+ cf = {
73
+ "AWSTemplateFormatVersion": "2010-09-09",
74
+ "Description": "Create a stack that runs Chroma hosted on a single instance",
75
+ "Parameters": {
76
+ "KeyName": {
77
+ "Description": "Name of an existing EC2 KeyPair to enable SSH access to the instance",
78
+ "Type": "String",
79
+ "ConstraintDescription": "If present, must be the name of an existing EC2 KeyPair.",
80
+ "Default": "",
81
+ },
82
+ "InstanceType": {
83
+ "Description": "EC2 instance type",
84
+ "Type": "String",
85
+ "Default": "t3.small",
86
+ },
87
+ "ChromaVersion": {
88
+ "Description": "Chroma version to install",
89
+ "Type": "String",
90
+ "Default": version,
91
+ },
92
+ },
93
+ "Conditions": {
94
+ "HasKeyName": {"Fn::Not": [{"Fn::Equals": [{"Ref": "KeyName"}, ""]}]},
95
+ },
96
+ "Resources": {
97
+ "ChromaInstance": {
98
+ "Type": "AWS::EC2::Instance",
99
+ "Properties": {
100
+ "ImageId": {
101
+ "Fn::FindInMap": ["Region2AMI", {"Ref": "AWS::Region"}, "AMI"]
102
+ },
103
+ "InstanceType": {"Ref": "InstanceType"},
104
+ "UserData": b64text(userdata),
105
+ "SecurityGroupIds": [{"Ref": "ChromaInstanceSecurityGroup"}],
106
+ "KeyName": {
107
+ "Fn::If": [
108
+ "HasKeyName",
109
+ {"Ref": "KeyName"},
110
+ {"Ref": "AWS::NoValue"},
111
+ ]
112
+ },
113
+ "BlockDeviceMappings": [
114
+ {
115
+ "DeviceName": {
116
+ "Fn::FindInMap": [
117
+ "Region2AMI",
118
+ {"Ref": "AWS::Region"},
119
+ "RootDeviceName",
120
+ ]
121
+ },
122
+ "Ebs": {"VolumeSize": 24},
123
+ }
124
+ ],
125
+ },
126
+ },
127
+ "ChromaInstanceSecurityGroup": {
128
+ "Type": "AWS::EC2::SecurityGroup",
129
+ "Properties": {
130
+ "GroupDescription": "Chroma Instance Security Group",
131
+ "SecurityGroupIngress": [
132
+ {
133
+ "IpProtocol": "tcp",
134
+ "FromPort": "22",
135
+ "ToPort": "22",
136
+ "CidrIp": "0.0.0.0/0",
137
+ },
138
+ {
139
+ "IpProtocol": "tcp",
140
+ "FromPort": "8000",
141
+ "ToPort": "8000",
142
+ "CidrIp": "0.0.0.0/0",
143
+ },
144
+ ],
145
+ },
146
+ },
147
+ },
148
+ "Outputs": {
149
+ "ServerIp": {
150
+ "Description": "IP address of the Chroma server",
151
+ "Value": {"Fn::GetAtt": ["ChromaInstance", "PublicIp"]},
152
+ }
153
+ },
154
+ "Mappings": {"Region2AMI": {}},
155
+ }
156
+
157
+ # Populate the Region2AMI mappings
158
+ regions = boto3.client("ec2", region_name="us-east-1").describe_regions()["Regions"]
159
+ for region in regions:
160
+ region_name = region["RegionName"]
161
+ ami_result = boto3.client("ec2", region_name=region_name).describe_images(
162
+ Owners=["137112412989"],
163
+ Filters=[
164
+ {"Name": "name", "Values": ["amzn2-ami-kernel-5.10-hvm-*-x86_64-gp2"]},
165
+ {"Name": "root-device-type", "Values": ["ebs"]},
166
+ {"Name": "virtualization-type", "Values": ["hvm"]},
167
+ ],
168
+ )
169
+ img = ami_result["Images"][0]
170
+ ami_id = img["ImageId"]
171
+ root_device_name = img["BlockDeviceMappings"][0]["DeviceName"]
172
+ cf["Mappings"]["Region2AMI"][region_name] = {
173
+ "AMI": ami_id,
174
+ "RootDeviceName": root_device_name,
175
+ }
176
+
177
+
178
+ # Write the CF json to a file
179
+ json.dump(cf, open("/tmp/chroma.cf.json", "w"), indent=4)
180
+
181
+ # upload to S3
182
+ s3 = boto3.client("s3", region_name="us-east-1")
183
+ s3.upload_file(
184
+ "/tmp/chroma.cf.json",
185
+ "public.trychroma.com",
186
+ f"cloudformation/{version}/chroma.cf.json",
187
+ )
188
+
189
+ # Upload to s3 under /latest version only if this is a release
190
+ pattern = re.compile(r"^\d+\.\d+\.\d+$")
191
+ if pattern.match(version):
192
+ s3.upload_file(
193
+ "/tmp/chroma.cf.json",
194
+ "public.trychroma.com",
195
+ "cloudformation/latest/chroma.cf.json",
196
+ )
197
+ else:
198
+ print(f"Version {version} is not a 3-part semver, not uploading to /latest")
bin/integration-test ADDED
@@ -0,0 +1,75 @@
1
+ #!/usr/bin/env bash
2
+
3
+ set -e
4
+
5
+ export CHROMA_PORT=8000
6
+
7
+ function cleanup {
8
+ docker compose -f docker-compose.test.yml down --rmi local --volumes
9
+ rm -f server.htpasswd .chroma_env  # -f: the files may not exist if tests fail early
10
+ }
11
+
12
+ function setup_auth {
13
+ local auth_type="$1"
14
+ case "$auth_type" in
15
+ basic)
16
+ docker run --rm --entrypoint htpasswd httpd:2 -Bbn admin admin > server.htpasswd
17
+ cat <<EOF > .chroma_env
18
+ CHROMA_SERVER_AUTH_CREDENTIALS_FILE="/chroma/server.htpasswd"
19
+ CHROMA_SERVER_AUTH_CREDENTIALS_PROVIDER="chromadb.auth.providers.HtpasswdFileServerAuthCredentialsProvider"
20
+ CHROMA_SERVER_AUTH_PROVIDER="chromadb.auth.basic.BasicAuthServerProvider"
21
+ EOF
22
+ ;;
23
+ token)
24
+ cat <<EOF > .chroma_env
25
+ CHROMA_SERVER_AUTH_CREDENTIALS="test-token"
26
+ CHROMA_SERVER_AUTH_TOKEN_TRANSPORT_HEADER="AUTHORIZATION"
27
+ CHROMA_SERVER_AUTH_CREDENTIALS_PROVIDER="chromadb.auth.token.TokenConfigServerAuthCredentialsProvider"
28
+ CHROMA_SERVER_AUTH_PROVIDER="chromadb.auth.token.TokenAuthServerProvider"
29
+ EOF
30
+ ;;
31
+ xtoken)
32
+ cat <<EOF > .chroma_env
33
+ CHROMA_SERVER_AUTH_CREDENTIALS="test-token"
34
+ CHROMA_SERVER_AUTH_TOKEN_TRANSPORT_HEADER="X_CHROMA_TOKEN"
35
+ CHROMA_SERVER_AUTH_CREDENTIALS_PROVIDER="chromadb.auth.token.TokenConfigServerAuthCredentialsProvider"
36
+ CHROMA_SERVER_AUTH_PROVIDER="chromadb.auth.token.TokenAuthServerProvider"
37
+ EOF
38
+ ;;
39
+ *)
40
+ echo "Unknown auth type: $auth_type"
41
+ exit 1
42
+ ;;
43
+ esac
44
+ }
45
+
46
+ trap cleanup EXIT
47
+
48
+ docker compose -f docker-compose.test.yml up --build -d
49
+
50
+ export CHROMA_INTEGRATION_TEST_ONLY=1
51
+ export CHROMA_API_IMPL=chromadb.api.fastapi.FastAPI
52
+ export CHROMA_SERVER_HOST=localhost
53
+ export CHROMA_SERVER_HTTP_PORT=8000
54
+ export CHROMA_SERVER_NOFILE=65535
55
+
56
+ echo testing: python -m pytest "$@"
57
+ python -m pytest "$@"
58
+
59
+ cd clients/js
60
+
61
+ # moved off of yarn to npm to fix issues with jackspeak/cliui/string-width versions #1314
62
+ npm install
63
+ npm run test:run
64
+
65
+ docker compose down
66
+ cd ../..
67
+ for auth_type in basic token xtoken; do
68
+ echo "Testing $auth_type auth"
69
+ setup_auth "$auth_type"
70
+ cd clients/js
71
+ docker compose --env-file ../../.chroma_env -f ../../docker-compose.test-auth.yml up --build -d
72
+ yarn test:run-auth-"$auth_type"
73
+ cd ../..
74
+ docker compose down
75
+ done
bin/reset.sh ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env bash
2
+
3
+ eval $(minikube -p chroma-test docker-env)
4
+
5
+ docker build -t chroma-coordinator:latest -f go/coordinator/Dockerfile .
6
+
7
+ kubectl delete deployment coordinator -n chroma
8
+
9
+ # Apply the kubernetes manifests
10
+ kubectl apply -f k8s/deployment
11
+ kubectl apply -f k8s/crd
12
+ kubectl apply -f k8s/cr
13
+ kubectl apply -f k8s/test
bin/templates/docker-compose.yml ADDED
@@ -0,0 +1,21 @@
1
+ version: '3.9'
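+ # Note: the ChromaVersion placeholder below is filled in by CloudFormation
+ # (via the Fn::Sub calls emitted in bin/generate_cloudformation.py).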
2
+
3
+ networks:
4
+ net:
5
+ driver: bridge
6
+
7
+ services:
8
+ server:
9
+ image: ghcr.io/chroma-core/chroma:${ChromaVersion}
10
+ volumes:
11
+ - index_data:/index_data
12
+ ports:
13
+ - 8000:8000
14
+ networks:
15
+ - net
16
+
17
+ volumes:
18
+ index_data:
19
+ driver: local
20
+ backups:
21
+ driver: local
bin/test-package.sh ADDED
@@ -0,0 +1,24 @@
1
+ #!/bin/bash
2
+
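+ # Usage (illustrative): bin/test-package.sh dist/chromadb-<version>.tar.gz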
3
+ # Verify PIP tarball
4
+ tarball=$(readlink -f $1)
5
+ if [ -f "$tarball" ]; then
6
+ echo "Testing PIP package from tarball: $tarball"
7
+ else
8
+ echo "Could not find PIP package: $tarball"
+ exit 1
9
+ fi
10
+
11
+ # Create temporary project dir
12
+ dir=$(mktemp -d)
13
+
14
+ echo "Building python project dir at $dir ..."
15
+
16
+ cd $dir
17
+
18
+ python3 -m venv venv
19
+
20
+ source venv/bin/activate
21
+
22
+ pip install $tarball
23
+
24
+ python -c "import chromadb; api = chromadb.Client(); print(api.heartbeat())"
bin/test-remote ADDED
@@ -0,0 +1,16 @@
1
+ #!/usr/bin/env bash
2
+
3
+ set -e
4
+
5
+ # Assert first argument is present
6
+ if [ -z "$1" ]; then
7
+ echo "Usage: bin/test-remote <remote-host>"
8
+ exit 1
9
+ fi
10
+
11
+ export CHROMA_INTEGRATION_TEST_ONLY=1
12
+ export CHROMA_SERVER_HOST=$1
13
+ export CHROMA_API_IMPL=chromadb.api.fastapi.FastAPI
14
+ export CHROMA_SERVER_HTTP_PORT=8000
15
+
16
+ python -m pytest
bin/test.py ADDED
@@ -0,0 +1,7 @@
1
+ # Sanity check script to ensure that the Chroma client can connect
2
+ # and is capable of receiving data.
3
+ import chromadb
4
+
5
+ # run in in-memory mode
6
+ chroma_api = chromadb.Client()
7
+ print(chroma_api.heartbeat())
bin/version ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
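+ # Derives the version from git metadata via setuptools_scm and appends
+ # "-dirty" when the working tree has uncommitted changes.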
2
+ export VERSION=`python -m setuptools_scm`
3
+
4
+ if [[ -n `git status --porcelain` ]]; then
5
+ VERSION=$VERSION-dirty
6
+ fi
7
+
8
+ echo $VERSION
bin/windows_upgrade_sqlite.py ADDED
@@ -0,0 +1,20 @@
1
+ import requests
2
+ import zipfile
3
+ import io
4
+ import os
5
+ import sys
6
+ import shutil
7
+
8
+ # Used by GitHub Actions runners to upgrade the bundled sqlite3 to 3.42.0
9
+ DLL_URL = "https://www.sqlite.org/2023/sqlite-dll-win64-x64-3420000.zip"
10
+
11
+ if __name__ == "__main__":
12
+ # Download and extract the DLL
13
+ r = requests.get(DLL_URL)
14
+ z = zipfile.ZipFile(io.BytesIO(r.content))
15
+ z.extractall(".")
16
+ # Locate the DLLs folder of the current Python installation
17
+ exec_path = os.path.dirname(sys.executable)
18
+ dlls_path = os.path.join(exec_path, "DLLs")
19
+ # Copy the DLL to the Python DLLs folder
20
+ shutil.copy("sqlite3.dll", dlls_path)
chromadb/__init__.py ADDED
@@ -0,0 +1,257 @@
1
+ from typing import Dict, Optional
2
+ import logging
3
+ from chromadb.api.client import Client as ClientCreator
4
+ from chromadb.api.client import AdminClient as AdminClientCreator
5
+ from chromadb.auth.token import TokenTransportHeader
6
+ import chromadb.config
7
+ from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Settings
8
+ from chromadb.api import AdminAPI, ClientAPI
9
+ from chromadb.api.models.Collection import Collection
10
+ from chromadb.api.types import (
11
+ CollectionMetadata,
12
+ Documents,
13
+ EmbeddingFunction,
14
+ Embeddings,
15
+ IDs,
16
+ Include,
17
+ Metadata,
18
+ Where,
19
+ QueryResult,
20
+ GetResult,
21
+ WhereDocument,
22
+ UpdateCollectionMetadata,
23
+ )
24
+
25
+ # Re-export types from chromadb.types
26
+ __all__ = [
27
+ "Collection",
28
+ "Metadata",
29
+ "Where",
30
+ "WhereDocument",
31
+ "Documents",
32
+ "IDs",
33
+ "Embeddings",
34
+ "EmbeddingFunction",
35
+ "Include",
36
+ "CollectionMetadata",
37
+ "UpdateCollectionMetadata",
38
+ "QueryResult",
39
+ "GetResult",
40
+ ]
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+ __settings = Settings()
45
+
46
+ __version__ = "0.4.22"
47
+
48
+ # Workaround to deal with Colab's old sqlite3 version
49
+ try:
50
+ import google.colab # noqa: F401
51
+
52
+ IN_COLAB = True
53
+ except ImportError:
54
+ IN_COLAB = False
55
+
56
+ is_client = False
57
+ try:
58
+ from chromadb.is_thin_client import is_thin_client
59
+
60
+ is_client = is_thin_client
61
+ except ImportError:
62
+ is_client = False
63
+
64
+ if not is_client:
65
+ import sqlite3
66
+
67
+ if sqlite3.sqlite_version_info < (3, 35, 0):
68
+ if IN_COLAB:
69
+ # In Colab, hotswap to pysqlite-binary if it's too old
70
+ import subprocess
71
+ import sys
72
+
73
+ subprocess.check_call(
74
+ [sys.executable, "-m", "pip", "install", "pysqlite3-binary"]
75
+ )
76
+ __import__("pysqlite3")
77
+ sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
78
+ else:
79
+ raise RuntimeError(
80
+ "\033[91mYour system has an unsupported version of sqlite3. Chroma \
81
+ requires sqlite3 >= 3.35.0.\033[0m\n"
82
+ "\033[94mPlease visit \
83
+ https://docs.trychroma.com/troubleshooting#sqlite to learn how \
84
+ to upgrade.\033[0m"
85
+ )
86
+
87
+
88
+ def configure(**kwargs) -> None: # type: ignore
89
+ """Override Chroma's default settings, environment variables or .env files"""
90
+ global __settings
91
+ __settings = chromadb.config.Settings(**kwargs)
92
+
93
+
94
+ def get_settings() -> Settings:
95
+ return __settings
96
+
97
+
98
+ def EphemeralClient(
99
+ settings: Optional[Settings] = None,
100
+ tenant: str = DEFAULT_TENANT,
101
+ database: str = DEFAULT_DATABASE,
102
+ ) -> ClientAPI:
103
+ """
104
+ Creates an in-memory instance of Chroma. This is useful for testing and
105
+ development, but not recommended for production use.
106
+
107
+ Args:
108
+ tenant: The tenant to use for this client. Defaults to the default tenant.
109
+ database: The database to use for this client. Defaults to the default database.
110
+ """
111
+ if settings is None:
112
+ settings = Settings()
113
+ settings.is_persistent = False
114
+
115
+ return ClientCreator(settings=settings, tenant=tenant, database=database)
116
+
117
+
118
+ def PersistentClient(
119
+ path: str = "./chroma",
120
+ settings: Optional[Settings] = None,
121
+ tenant: str = DEFAULT_TENANT,
122
+ database: str = DEFAULT_DATABASE,
123
+ ) -> ClientAPI:
124
+ """
125
+ Creates a persistent instance of Chroma that saves to disk. This is useful for
126
+ testing and development, but not recommended for production use.
127
+
128
+ Args:
129
+ path: The directory to save Chroma's data to. Defaults to "./chroma".
130
+ tenant: The tenant to use for this client. Defaults to the default tenant.
131
+ database: The database to use for this client. Defaults to the default database.
132
+ """
133
+ if settings is None:
134
+ settings = Settings()
135
+ settings.persist_directory = path
136
+ settings.is_persistent = True
137
+
138
+ return ClientCreator(tenant=tenant, database=database, settings=settings)
139
+
140
+
141
+ def HttpClient(
142
+ host: str = "localhost",
143
+ port: str = "8000",
144
+ ssl: bool = False,
145
+ headers: Optional[Dict[str, str]] = None,
146
+ settings: Optional[Settings] = None,
147
+ tenant: str = DEFAULT_TENANT,
148
+ database: str = DEFAULT_DATABASE,
149
+ ) -> ClientAPI:
150
+ """
151
+ Creates a client that connects to a remote Chroma server. This supports
152
+ many clients connecting to the same server, and is the recommended way to
153
+ use Chroma in production.
154
+
155
+ Args:
156
+ host: The hostname of the Chroma server. Defaults to "localhost".
157
+ port: The port of the Chroma server. Defaults to "8000".
158
+ ssl: Whether to use SSL to connect to the Chroma server. Defaults to False.
159
+ headers: A dictionary of headers to send to the Chroma server. Defaults to {}.
160
+ settings: A dictionary of settings to communicate with the chroma server.
161
+ tenant: The tenant to use for this client. Defaults to the default tenant.
162
+ database: The database to use for this client. Defaults to the default database.
163
+ """
164
+
165
+ if settings is None:
166
+ settings = Settings()
167
+
168
+ settings.chroma_api_impl = "chromadb.api.fastapi.FastAPI"
169
+ if settings.chroma_server_host and settings.chroma_server_host != host:
170
+ raise ValueError(
171
+ f"Chroma server host provided in settings[{settings.chroma_server_host}] is different to the one provided in HttpClient: [{host}]"
172
+ )
173
+ settings.chroma_server_host = host
174
+ if settings.chroma_server_http_port and settings.chroma_server_http_port != port:
175
+ raise ValueError(
176
+ f"Chroma server http port provided in settings[{settings.chroma_server_http_port}] is different to the one provided in HttpClient: [{port}]"
177
+ )
178
+ settings.chroma_server_http_port = port
179
+ settings.chroma_server_ssl_enabled = ssl
180
+ settings.chroma_server_headers = headers
181
+
182
+ return ClientCreator(tenant=tenant, database=database, settings=settings)
183
+
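+ # Example (illustrative; assumes a server started with `chroma run --path /chroma_db_path`):
+ # client = chromadb.HttpClient(host="localhost", port="8000")
+ # client.heartbeat()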
184
+
185
+ def CloudClient(
186
+ tenant: str,
187
+ database: str,
188
+ api_key: Optional[str] = None,
189
+ settings: Optional[Settings] = None,
190
+ *, # Following arguments are keyword-only, intended for testing only.
191
+ cloud_host: str = "api.trychroma.com",
192
+ cloud_port: str = "8000",
193
+ enable_ssl: bool = True,
194
+ ) -> ClientAPI:
195
+ """
196
+ Creates a client to connect to a tenant and database on Chroma Cloud.
197
+
198
+ Args:
199
+ tenant: The tenant to use for this client.
200
+ database: The database to use for this client.
201
+ api_key: The api key to use for this client.
202
+ """
203
+
204
+ # If no API key is provided, try to load it from the environment variable
205
+ if api_key is None:
206
+ import os
207
+
208
+ api_key = os.environ.get("CHROMA_API_KEY")
209
+
210
+ # If the API key is still not provided, prompt the user
211
+ if api_key is None:
212
+ print(
213
+ "\033[93mDon't have an API key?\033[0m Get one at https://app.trychroma.com"
214
+ )
215
+ api_key = input("Please enter your Chroma API key: ")
216
+
217
+ if settings is None:
218
+ settings = Settings()
219
+
220
+ settings.chroma_api_impl = "chromadb.api.fastapi.FastAPI"
221
+ settings.chroma_server_host = cloud_host
222
+ settings.chroma_server_http_port = cloud_port
223
+ # Always use SSL for cloud
224
+ settings.chroma_server_ssl_enabled = enable_ssl
225
+
226
+ settings.chroma_client_auth_provider = "chromadb.auth.token.TokenAuthClientProvider"
227
+ settings.chroma_client_auth_credentials = api_key
228
+ settings.chroma_client_auth_token_transport_header = (
229
+ TokenTransportHeader.X_CHROMA_TOKEN.name
230
+ )
231
+
232
+ return ClientCreator(tenant=tenant, database=database, settings=settings)
233
+
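+ # Example (illustrative; the tenant and database names are placeholders):
+ # client = chromadb.CloudClient(tenant="my-tenant", database="my-db", api_key="...")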
234
+
235
+ def Client(
236
+ settings: Settings = __settings,
237
+ tenant: str = DEFAULT_TENANT,
238
+ database: str = DEFAULT_DATABASE,
239
+ ) -> ClientAPI:
240
+ """
241
+ Return a running chroma.API instance
242
+
243
+ tenant: The tenant to use for this client. Defaults to the default tenant.
244
+ database: The database to use for this client. Defaults to the default database.
245
+
246
+ """
247
+
248
+ return ClientCreator(tenant=tenant, database=database, settings=settings)
249
+
250
+
251
+ def AdminClient(settings: Settings = Settings()) -> AdminAPI:
252
+ """Creates an admin client that can be used to create tenants and databases."""
257
+ return AdminClientCreator(settings=settings)
chromadb/api/__init__.py ADDED
@@ -0,0 +1,596 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import Sequence, Optional
3
+ from uuid import UUID
4
+
5
+ from overrides import override
6
+ from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT
7
+ from chromadb.api.models.Collection import Collection
8
+ from chromadb.api.types import (
9
+ CollectionMetadata,
10
+ Documents,
11
+ Embeddable,
12
+ EmbeddingFunction,
13
+ DataLoader,
14
+ Embeddings,
15
+ IDs,
16
+ Include,
17
+ Loadable,
18
+ Metadatas,
19
+ URIs,
20
+ Where,
21
+ QueryResult,
22
+ GetResult,
23
+ WhereDocument,
24
+ )
25
+ from chromadb.config import Component, Settings
26
+ from chromadb.types import Database, Tenant
27
+ import chromadb.utils.embedding_functions as ef
28
+
29
+
30
+ class BaseAPI(ABC):
31
+ @abstractmethod
32
+ def heartbeat(self) -> int:
33
+ """Get the current time in nanoseconds since epoch.
34
+ Used to check if the server is alive.
35
+
36
+ Returns:
37
+ int: The current time in nanoseconds since epoch
38
+
39
+ """
40
+ pass
41
+
42
+ #
43
+ # COLLECTION METHODS
44
+ #
45
+
46
+ @abstractmethod
47
+ def list_collections(
48
+ self,
49
+ limit: Optional[int] = None,
50
+ offset: Optional[int] = None,
51
+ ) -> Sequence[Collection]:
52
+ """List all collections.
53
+ Args:
54
+ limit: The maximum number of entries to return. Defaults to None.
55
+ offset: The number of entries to skip before returning. Defaults to None.
56
+
57
+ Returns:
58
+ Sequence[Collection]: A list of collections
59
+
60
+ Examples:
61
+ ```python
62
+ client.list_collections()
63
+ # [collection(name="my_collection", metadata={})]
64
+ ```
65
+ """
66
+ pass
67
+
68
+ @abstractmethod
69
+ def count_collections(self) -> int:
70
+ """Count the number of collections.
71
+
72
+ Returns:
73
+ int: The number of collections.
74
+
75
+ Examples:
76
+ ```python
77
+ client.count_collections()
78
+ # 1
79
+ ```
80
+ """
81
+ pass
82
+
83
+ @abstractmethod
84
+ def create_collection(
85
+ self,
86
+ name: str,
87
+ metadata: Optional[CollectionMetadata] = None,
88
+ embedding_function: Optional[
89
+ EmbeddingFunction[Embeddable]
90
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
91
+ data_loader: Optional[DataLoader[Loadable]] = None,
92
+ get_or_create: bool = False,
93
+ ) -> Collection:
94
+ """Create a new collection with the given name and metadata.
95
+ Args:
96
+ name: The name of the collection to create.
97
+ metadata: Optional metadata to associate with the collection.
98
+ embedding_function: Optional function to use to embed documents.
99
+ Uses the default embedding function if not provided.
100
+ get_or_create: If True, return the existing collection if it exists.
101
+ data_loader: Optional function to use to load records (documents, images, etc.)
102
+
103
+ Returns:
104
+ Collection: The newly created collection.
105
+
106
+ Raises:
107
+ ValueError: If the collection already exists and get_or_create is False.
108
+ ValueError: If the collection name is invalid.
109
+
110
+ Examples:
111
+ ```python
112
+ client.create_collection("my_collection")
113
+ # collection(name="my_collection", metadata={})
114
+
115
+ client.create_collection("my_collection", metadata={"foo": "bar"})
116
+ # collection(name="my_collection", metadata={"foo": "bar"})
117
+ ```
118
+ """
119
+ pass
120
+
121
+ @abstractmethod
122
+ def get_collection(
123
+ self,
124
+ name: str,
125
+ id: Optional[UUID] = None,
126
+ embedding_function: Optional[
127
+ EmbeddingFunction[Embeddable]
128
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
129
+ data_loader: Optional[DataLoader[Loadable]] = None,
130
+ ) -> Collection:
131
+ """Get a collection with the given name.
132
+ Args:
133
+ id: The UUID of the collection to get. If provided, it is used together with the name for the lookup.
134
+ name: The name of the collection to get
135
+ embedding_function: Optional function to use to embed documents.
136
+ Uses the default embedding function if not provided.
137
+ data_loader: Optional function to use to load records (documents, images, etc.)
138
+
139
+ Returns:
140
+ Collection: The collection
141
+
142
+ Raises:
143
+ ValueError: If the collection does not exist
144
+
145
+ Examples:
146
+ ```python
147
+ client.get_collection("my_collection")
148
+ # collection(name="my_collection", metadata={})
149
+ ```
150
+ """
151
+ pass
152
+
153
+ @abstractmethod
154
+ def get_or_create_collection(
155
+ self,
156
+ name: str,
157
+ metadata: Optional[CollectionMetadata] = None,
158
+ embedding_function: Optional[
159
+ EmbeddingFunction[Embeddable]
160
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
161
+ data_loader: Optional[DataLoader[Loadable]] = None,
162
+ ) -> Collection:
163
+ """Get or create a collection with the given name and metadata.
164
+ Args:
165
+ name: The name of the collection to get or create
166
+ metadata: Optional metadata to associate with the collection. If
167
+ the collection already exists, the metadata will be updated if
168
+ provided and not None. If the collection does not exist, the
169
+ new collection will be created with the provided metadata.
170
+ embedding_function: Optional function to use to embed documents
171
+ data_loader: Optional function to use to load records (documents, images, etc.)
172
+
173
+ Returns:
174
+ The collection
175
+
176
+ Examples:
177
+ ```python
178
+ client.get_or_create_collection("my_collection")
179
+ # collection(name="my_collection", metadata={})
180
+ ```
181
+ """
182
+ pass
183
+
184
+ def _modify(
185
+ self,
186
+ id: UUID,
187
+ new_name: Optional[str] = None,
188
+ new_metadata: Optional[CollectionMetadata] = None,
189
+ ) -> None:
190
+ """[Internal] Modify a collection by UUID. Can update the name and/or metadata.
191
+
192
+ Args:
193
+ id: The internal UUID of the collection to modify.
194
+ new_name: The new name of the collection.
195
+ If None, the existing name will remain. Defaults to None.
196
+ new_metadata: The new metadata to associate with the collection.
197
+ Defaults to None.
198
+ """
199
+ pass
200
+
201
+ @abstractmethod
202
+ def delete_collection(
203
+ self,
204
+ name: str,
205
+ ) -> None:
206
+ """Delete a collection with the given name.
207
+ Args:
208
+ name: The name of the collection to delete.
209
+
210
+ Raises:
211
+ ValueError: If the collection does not exist.
212
+
213
+ Examples:
214
+ ```python
215
+ client.delete_collection("my_collection")
216
+ ```
217
+ """
218
+ pass
219
+
220
+ #
221
+ # ITEM METHODS
222
+ #
223
+
224
+ @abstractmethod
225
+ def _add(
226
+ self,
227
+ ids: IDs,
228
+ collection_id: UUID,
229
+ embeddings: Embeddings,
230
+ metadatas: Optional[Metadatas] = None,
231
+ documents: Optional[Documents] = None,
232
+ uris: Optional[URIs] = None,
233
+ ) -> bool:
234
+ """[Internal] Add embeddings to a collection specified by UUID.
235
+ If (some) ids already exist, only the new embeddings will be added.
236
+
237
+ Args:
238
+ ids: The ids to associate with the embeddings.
239
+ collection_id: The UUID of the collection to add the embeddings to.
240
+ embeddings: The sequence of embeddings to add.
241
+ metadatas: The metadata to associate with the embeddings. Defaults to None.
242
+ documents: The documents to associate with the embeddings. Defaults to None.
243
+ uris: URIs of data sources for each embedding. Defaults to None.
244
+
245
+ Returns:
246
+ True if the embeddings were added successfully.
247
+ """
248
+ pass
249
+
250
+ @abstractmethod
251
+ def _update(
252
+ self,
253
+ collection_id: UUID,
254
+ ids: IDs,
255
+ embeddings: Optional[Embeddings] = None,
256
+ metadatas: Optional[Metadatas] = None,
257
+ documents: Optional[Documents] = None,
258
+ uris: Optional[URIs] = None,
259
+ ) -> bool:
260
+ """[Internal] Update entries in a collection specified by UUID.
261
+
262
+ Args:
263
+ collection_id: The UUID of the collection to update the embeddings in.
264
+ ids: The IDs of the entries to update.
265
+ embeddings: The sequence of embeddings to update. Defaults to None.
266
+ metadatas: The metadata to associate with the embeddings. Defaults to None.
267
+ documents: The documents to associate with the embeddings. Defaults to None.
268
+ uris: URIs of data sources for each embedding. Defaults to None.
269
+ Returns:
270
+ True if the embeddings were updated successfully.
271
+ """
272
+ pass
273
+
274
+ @abstractmethod
275
+ def _upsert(
276
+ self,
277
+ collection_id: UUID,
278
+ ids: IDs,
279
+ embeddings: Embeddings,
280
+ metadatas: Optional[Metadatas] = None,
281
+ documents: Optional[Documents] = None,
282
+ uris: Optional[URIs] = None,
283
+ ) -> bool:
284
+ """[Internal] Add or update entries in a collection specified by UUID.
285
+ If an entry with the same id already exists, it will be updated,
286
+ otherwise it will be added.
287
+
288
+ Args:
289
+ collection_id: The collection to add the embeddings to
290
+ ids: The ids to associate with the embeddings.
291
+ embeddings: The sequence of embeddings to add
292
+ metadatas: The metadata to associate with the embeddings. Defaults to None.
293
+ documents: The documents to associate with the embeddings. Defaults to None.
294
+ uris: URIs of data sources for each embedding. Defaults to None.
295
+ """
296
+ pass
297
+
298
+ @abstractmethod
299
+ def _count(self, collection_id: UUID) -> int:
300
+ """[Internal] Returns the number of entries in a collection specified by UUID.
301
+
302
+ Args:
303
+ collection_id: The UUID of the collection to count the embeddings in.
304
+
305
+ Returns:
306
+ int: The number of embeddings in the collection
307
+
308
+ """
309
+ pass
310
+
311
+ @abstractmethod
312
+ def _peek(self, collection_id: UUID, n: int = 10) -> GetResult:
313
+ """[Internal] Returns the first n entries in a collection specified by UUID.
314
+
315
+ Args:
316
+ collection_id: The UUID of the collection to peek into.
317
+ n: The number of entries to peek. Defaults to 10.
318
+
319
+ Returns:
320
+ GetResult: The first n entries in the collection.
321
+
322
+ """
323
+
324
+ pass
325
+
326
+ @abstractmethod
327
+ def _get(
328
+ self,
329
+ collection_id: UUID,
330
+ ids: Optional[IDs] = None,
331
+ where: Optional[Where] = {},
332
+ sort: Optional[str] = None,
333
+ limit: Optional[int] = None,
334
+ offset: Optional[int] = None,
335
+ page: Optional[int] = None,
336
+ page_size: Optional[int] = None,
337
+ where_document: Optional[WhereDocument] = {},
338
+ include: Include = ["embeddings", "metadatas", "documents"],
339
+ ) -> GetResult:
340
+ """[Internal] Returns entries from a collection specified by UUID.
341
+
342
+ Args:
343
+ collection_id: The UUID of the collection to get entries from.
+ ids: The IDs of the entries to get. Defaults to None.
344
+ where: Conditional filtering on metadata. Defaults to {}.
345
+ sort: The column to sort the entries by. Defaults to None.
346
+ limit: The maximum number of entries to return. Defaults to None.
347
+ offset: The number of entries to skip before returning. Defaults to None.
348
+ page: The page number to return. Defaults to None.
349
+ page_size: The number of entries to return per page. Defaults to None.
350
+ where_document: Conditional filtering on documents. Defaults to {}.
351
+ include: The fields to include in the response.
352
+ Defaults to ["embeddings", "metadatas", "documents"].
353
+ Returns:
354
+ GetResult: The entries in the collection that match the query.
355
+
356
+ """
357
+ pass
358
+
359
+ @abstractmethod
360
+ def _delete(
361
+ self,
362
+ collection_id: UUID,
363
+ ids: Optional[IDs],
364
+ where: Optional[Where] = {},
365
+ where_document: Optional[WhereDocument] = {},
366
+ ) -> IDs:
367
+ """[Internal] Deletes entries from a collection specified by UUID.
368
+
369
+ Args:
370
+ collection_id: The UUID of the collection to delete the entries from.
371
+ ids: The IDs of the entries to delete. Defaults to None.
372
+ where: Conditional filtering on metadata. Defaults to {}.
373
+ where_document: Conditional filtering on documents. Defaults to {}.
374
+
375
+ Returns:
376
+ IDs: The list of IDs of the entries that were deleted.
377
+ """
378
+ pass
379
+
380
+ @abstractmethod
381
+ def _query(
382
+ self,
383
+ collection_id: UUID,
384
+ query_embeddings: Embeddings,
385
+ n_results: int = 10,
386
+ where: Where = {},
387
+ where_document: WhereDocument = {},
388
+ include: Include = ["embeddings", "metadatas", "documents", "distances"],
389
+ ) -> QueryResult:
390
+ """[Internal] Performs a nearest neighbors query on a collection specified by UUID.
391
+
392
+ Args:
393
+ collection_id: The UUID of the collection to query.
394
+ query_embeddings: The embeddings to use as the query.
395
+ n_results: The number of results to return. Defaults to 10.
396
+ where: Conditional filtering on metadata. Defaults to {}.
397
+ where_document: Conditional filtering on documents. Defaults to {}.
398
+ include: The fields to include in the response.
399
+ Defaults to ["embeddings", "metadatas", "documents", "distances"].
400
+
401
+ Returns:
402
+ QueryResult: The results of the query.
403
+ """
404
+ pass
405
+
406
+ @abstractmethod
407
+ def reset(self) -> bool:
408
+ """Resets the database. This will delete all collections and entries.
409
+
410
+ Returns:
411
+ bool: True if the database was reset successfully.
412
+ """
413
+ pass
414
+
415
+ @abstractmethod
416
+ def get_version(self) -> str:
417
+ """Get the version of Chroma.
418
+
419
+ Returns:
420
+ str: The version of Chroma
421
+
422
+ """
423
+ pass
424
+
425
+ @abstractmethod
426
+ def get_settings(self) -> Settings:
427
+ """Get the settings used to initialize.
428
+
429
+ Returns:
430
+ Settings: The settings used to initialize.
431
+
432
+ """
433
+ pass
434
+
435
+ @property
436
+ @abstractmethod
437
+ def max_batch_size(self) -> int:
438
+ """Return the maximum number of records that can be submitted in a single call
439
+ to submit_embeddings."""
440
+ pass
441
+
442
+
443
+ class ClientAPI(BaseAPI, ABC):
444
+ tenant: str
445
+ database: str
446
+
447
+ @abstractmethod
448
+ def set_tenant(self, tenant: str, database: str = DEFAULT_DATABASE) -> None:
449
+ """Set the tenant and database for the client. Raises an error if the tenant or
450
+ database does not exist.
451
+
452
+ Args:
453
+ tenant: The tenant to set.
454
+ database: The database to set.
455
+
456
+ """
457
+ pass
458
+
459
+ @abstractmethod
460
+ def set_database(self, database: str) -> None:
461
+ """Set the database for the client. Raises an error if the database does not exist.
462
+
463
+ Args:
464
+ database: The database to set.
465
+
466
+ """
467
+ pass
468
+
469
+ @staticmethod
470
+ @abstractmethod
471
+ def clear_system_cache() -> None:
472
+ """Clear the system cache so that new systems can be created for an existing path.
473
+ This should only be used for testing purposes."""
474
+ pass
475
+
476
+
477
+ class AdminAPI(ABC):
478
+ @abstractmethod
479
+ def create_database(self, name: str, tenant: str = DEFAULT_TENANT) -> None:
480
+ """Create a new database. Raises an error if the database already exists.
481
+
482
+ Args:
483
+ name: The name of the database to create.
484
+
485
+ """
486
+ pass
487
+
488
+ @abstractmethod
489
+ def get_database(self, name: str, tenant: str = DEFAULT_TENANT) -> Database:
490
+ """Get a database. Raises an error if the database does not exist.
491
+
492
+ Args:
493
+ name: The name of the database to get.
494
+ tenant: The tenant of the database to get.
495
+
496
+ """
497
+ pass
498
+
499
+ @abstractmethod
500
+ def create_tenant(self, name: str) -> None:
501
+ """Create a new tenant. Raises an error if the tenant already exists.
502
+
503
+ Args:
504
+ name: The name of the tenant to create.
505
+
506
+ """
507
+ pass
508
+
509
+ @abstractmethod
510
+ def get_tenant(self, name: str) -> Tenant:
511
+ """Get a tenant. Raises an error if the tenant does not exist.
512
+
513
+ Args:
514
+ name: The name of the tenant to get.
515
+
516
+ """
517
+ pass
518
+
519
+
520
+ class ServerAPI(BaseAPI, AdminAPI, Component):
521
+ """An API instance that extends the relevant Base API methods by passing
522
+ in a tenant and database. This is the root component of the Chroma System"""
523
+
524
+ @abstractmethod
525
+ @override
526
+ def list_collections(
527
+ self,
528
+ limit: Optional[int] = None,
529
+ offset: Optional[int] = None,
530
+ tenant: str = DEFAULT_TENANT,
531
+ database: str = DEFAULT_DATABASE,
532
+ ) -> Sequence[Collection]:
533
+ pass
534
+
535
+ @abstractmethod
536
+ @override
537
+ def count_collections(
538
+ self, tenant: str = DEFAULT_TENANT, database: str = DEFAULT_DATABASE
539
+ ) -> int:
540
+ pass
541
+
542
+ @abstractmethod
543
+ @override
544
+ def create_collection(
545
+ self,
546
+ name: str,
547
+ metadata: Optional[CollectionMetadata] = None,
548
+ embedding_function: Optional[
549
+ EmbeddingFunction[Embeddable]
550
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
551
+ data_loader: Optional[DataLoader[Loadable]] = None,
552
+ get_or_create: bool = False,
553
+ tenant: str = DEFAULT_TENANT,
554
+ database: str = DEFAULT_DATABASE,
555
+ ) -> Collection:
556
+ pass
557
+
558
+ @abstractmethod
559
+ @override
560
+ def get_collection(
561
+ self,
562
+ name: str,
563
+ id: Optional[UUID] = None,
564
+ embedding_function: Optional[
565
+ EmbeddingFunction[Embeddable]
566
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
567
+ data_loader: Optional[DataLoader[Loadable]] = None,
568
+ tenant: str = DEFAULT_TENANT,
569
+ database: str = DEFAULT_DATABASE,
570
+ ) -> Collection:
571
+ pass
572
+
573
+ @abstractmethod
574
+ @override
575
+ def get_or_create_collection(
576
+ self,
577
+ name: str,
578
+ metadata: Optional[CollectionMetadata] = None,
579
+ embedding_function: Optional[
580
+ EmbeddingFunction[Embeddable]
581
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
582
+ data_loader: Optional[DataLoader[Loadable]] = None,
583
+ tenant: str = DEFAULT_TENANT,
584
+ database: str = DEFAULT_DATABASE,
585
+ ) -> Collection:
586
+ pass
587
+
588
+ @abstractmethod
589
+ @override
590
+ def delete_collection(
591
+ self,
592
+ name: str,
593
+ tenant: str = DEFAULT_TENANT,
594
+ database: str = DEFAULT_DATABASE,
595
+ ) -> None:
596
+ pass
chromadb/api/client.py ADDED
@@ -0,0 +1,496 @@
1
+ from typing import ClassVar, Dict, Optional, Sequence
2
+ from uuid import UUID
3
+ import uuid
4
+
5
+ from overrides import override
6
+ import requests
7
+ from chromadb.api import AdminAPI, ClientAPI, ServerAPI
8
+ from chromadb.api.types import (
9
+ CollectionMetadata,
10
+ DataLoader,
11
+ Documents,
12
+ Embeddable,
13
+ EmbeddingFunction,
14
+ Embeddings,
15
+ GetResult,
16
+ IDs,
17
+ Include,
18
+ Loadable,
19
+ Metadatas,
20
+ QueryResult,
21
+ URIs,
22
+ )
23
+ from chromadb.config import Settings, System
24
+ from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE
25
+ from chromadb.api.models.Collection import Collection
26
+ from chromadb.errors import ChromaError
27
+ from chromadb.telemetry.product import ProductTelemetryClient
28
+ from chromadb.telemetry.product.events import ClientStartEvent
29
+ from chromadb.types import Database, Tenant, Where, WhereDocument
30
+ import chromadb.utils.embedding_functions as ef
31
+
32
+
33
+ class SharedSystemClient:
34
+ _identifer_to_system: ClassVar[Dict[str, System]] = {}
35
+ _identifier: str
36
+
37
+ # region Initialization
38
+ def __init__(
39
+ self,
40
+ settings: Settings = Settings(),
41
+ ) -> None:
42
+ self._identifier = SharedSystemClient._get_identifier_from_settings(settings)
43
+ SharedSystemClient._create_system_if_not_exists(self._identifier, settings)
44
+
45
+ @classmethod
46
+ def _create_system_if_not_exists(
47
+ cls, identifier: str, settings: Settings
48
+ ) -> System:
49
+ if identifier not in cls._identifer_to_system:
50
+ new_system = System(settings)
51
+ cls._identifer_to_system[identifier] = new_system
52
+
53
+ new_system.instance(ProductTelemetryClient)
54
+ new_system.instance(ServerAPI)
55
+
56
+ new_system.start()
57
+ else:
58
+ previous_system = cls._identifer_to_system[identifier]
59
+
60
+ # For now, the settings must match
61
+ if previous_system.settings != settings:
62
+ raise ValueError(
63
+ f"An instance of Chroma already exists for {identifier} with different settings"
64
+ )
65
+
66
+ return cls._identifer_to_system[identifier]
67
+
68
+ @staticmethod
69
+ def _get_identifier_from_settings(settings: Settings) -> str:
70
+ identifier = ""
71
+ api_impl = settings.chroma_api_impl
72
+
73
+ if api_impl is None:
74
+ raise ValueError("Chroma API implementation must be set in settings")
75
+ elif api_impl == "chromadb.api.segment.SegmentAPI":
76
+ if settings.is_persistent:
77
+ identifier = settings.persist_directory
78
+ else:
79
+ identifier = (
80
+ "ephemeral" # TODO: support pathing and multiple ephemeral clients
81
+ )
82
+ elif api_impl == "chromadb.api.fastapi.FastAPI":
83
+ # FastAPI clients can all use unique system identifiers since their configurations can be independent, e.g. different auth tokens
84
+ identifier = str(uuid.uuid4())
85
+ else:
86
+ raise ValueError(f"Unsupported Chroma API implementation {api_impl}")
87
+
88
+ return identifier
89
+
90
+ @staticmethod
91
+ def _populate_data_from_system(system: System) -> str:
92
+ identifier = SharedSystemClient._get_identifier_from_settings(system.settings)
93
+ SharedSystemClient._identifer_to_system[identifier] = system
94
+ return identifier
95
+
96
+ @classmethod
97
+ def from_system(cls, system: System) -> "SharedSystemClient":
98
+ """Create a client from an existing system. This is useful for testing and debugging."""
99
+
100
+ SharedSystemClient._populate_data_from_system(system)
101
+ instance = cls(system.settings)
102
+ return instance
103
+
104
+ @staticmethod
105
+ def clear_system_cache() -> None:
106
+ SharedSystemClient._identifer_to_system = {}
107
+
108
+ @property
109
+ def _system(self) -> System:
110
+ return SharedSystemClient._identifer_to_system[self._identifier]
111
+
112
+ # endregion
113
+
114
+
115
+ class Client(SharedSystemClient, ClientAPI):
116
+ """A client for Chroma. This is the main entrypoint for interacting with Chroma.
117
+ A client internally stores its tenant and database and proxies calls to a
118
+ Server API instance of Chroma. It treats the Server API and corresponding System
119
+ as a singleton, so multiple clients connecting to the same resource will share the
120
+ same API instance.
121
+
122
+ Client implementations should implement their own API-caching strategies.
123
+ """
124
+
125
+ tenant: str = DEFAULT_TENANT
126
+ database: str = DEFAULT_DATABASE
127
+
128
+ _server: ServerAPI
129
+ # An internal admin client for verifying that databases and tenants exist
130
+ _admin_client: AdminAPI
131
+
132
+ # region Initialization
133
+ def __init__(
134
+ self,
135
+ tenant: str = DEFAULT_TENANT,
136
+ database: str = DEFAULT_DATABASE,
137
+ settings: Settings = Settings(),
138
+ ) -> None:
139
+ super().__init__(settings=settings)
140
+ self.tenant = tenant
141
+ self.database = database
142
+ # Create an admin client for verifying that databases and tenants exist
143
+ self._admin_client = AdminClient.from_system(self._system)
144
+ self._validate_tenant_database(tenant=tenant, database=database)
145
+
146
+ # Get the root system component we want to interact with
147
+ self._server = self._system.instance(ServerAPI)
148
+
149
+ # Submit event for a client start
150
+ telemetry_client = self._system.instance(ProductTelemetryClient)
151
+ telemetry_client.capture(ClientStartEvent())
152
+
153
+ @classmethod
154
+ @override
155
+ def from_system(
156
+ cls,
157
+ system: System,
158
+ tenant: str = DEFAULT_TENANT,
159
+ database: str = DEFAULT_DATABASE,
160
+ ) -> "Client":
161
+ SharedSystemClient._populate_data_from_system(system)
162
+ instance = cls(tenant=tenant, database=database, settings=system.settings)
163
+ return instance
164
+
165
+ # endregion
166
+
167
+ # region BaseAPI Methods
168
+ # Note - we could do this in less verbose ways, but they break type checking
169
+ @override
170
+ def heartbeat(self) -> int:
171
+ return self._server.heartbeat()
172
+
173
+ @override
174
+ def list_collections(
175
+ self, limit: Optional[int] = None, offset: Optional[int] = None
176
+ ) -> Sequence[Collection]:
177
+ return self._server.list_collections(
178
+ limit, offset, tenant=self.tenant, database=self.database
179
+ )
180
+
181
+ @override
182
+ def count_collections(self) -> int:
183
+ return self._server.count_collections(
184
+ tenant=self.tenant, database=self.database
185
+ )
186
+
187
+ @override
188
+ def create_collection(
189
+ self,
190
+ name: str,
191
+ metadata: Optional[CollectionMetadata] = None,
192
+ embedding_function: Optional[
193
+ EmbeddingFunction[Embeddable]
194
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
195
+ data_loader: Optional[DataLoader[Loadable]] = None,
196
+ get_or_create: bool = False,
197
+ ) -> Collection:
198
+ return self._server.create_collection(
199
+ name=name,
200
+ metadata=metadata,
201
+ embedding_function=embedding_function,
202
+ data_loader=data_loader,
203
+ tenant=self.tenant,
204
+ database=self.database,
205
+ get_or_create=get_or_create,
206
+ )
207
+
208
+ @override
209
+ def get_collection(
210
+ self,
211
+ name: str,
212
+ id: Optional[UUID] = None,
213
+ embedding_function: Optional[
214
+ EmbeddingFunction[Embeddable]
215
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
216
+ data_loader: Optional[DataLoader[Loadable]] = None,
217
+ ) -> Collection:
218
+ return self._server.get_collection(
219
+ id=id,
220
+ name=name,
221
+ embedding_function=embedding_function,
222
+ data_loader=data_loader,
223
+ tenant=self.tenant,
224
+ database=self.database,
225
+ )
226
+
227
+ @override
228
+ def get_or_create_collection(
229
+ self,
230
+ name: str,
231
+ metadata: Optional[CollectionMetadata] = None,
232
+ embedding_function: Optional[
233
+ EmbeddingFunction[Embeddable]
234
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
235
+ data_loader: Optional[DataLoader[Loadable]] = None,
236
+ ) -> Collection:
237
+ return self._server.get_or_create_collection(
238
+ name=name,
239
+ metadata=metadata,
240
+ embedding_function=embedding_function,
241
+ data_loader=data_loader,
242
+ tenant=self.tenant,
243
+ database=self.database,
244
+ )
245
+
246
+ @override
247
+ def _modify(
248
+ self,
249
+ id: UUID,
250
+ new_name: Optional[str] = None,
251
+ new_metadata: Optional[CollectionMetadata] = None,
252
+ ) -> None:
253
+ return self._server._modify(
254
+ id=id,
255
+ new_name=new_name,
256
+ new_metadata=new_metadata,
257
+ )
258
+
259
+ @override
260
+ def delete_collection(
261
+ self,
262
+ name: str,
263
+ ) -> None:
264
+ return self._server.delete_collection(
265
+ name=name,
266
+ tenant=self.tenant,
267
+ database=self.database,
268
+ )
269
+
270
+ #
271
+ # ITEM METHODS
272
+ #
273
+
274
+ @override
275
+ def _add(
276
+ self,
277
+ ids: IDs,
278
+ collection_id: UUID,
279
+ embeddings: Embeddings,
280
+ metadatas: Optional[Metadatas] = None,
281
+ documents: Optional[Documents] = None,
282
+ uris: Optional[URIs] = None,
283
+ ) -> bool:
284
+ return self._server._add(
285
+ ids=ids,
286
+ collection_id=collection_id,
287
+ embeddings=embeddings,
288
+ metadatas=metadatas,
289
+ documents=documents,
290
+ uris=uris,
291
+ )
292
+
293
+ @override
294
+ def _update(
295
+ self,
296
+ collection_id: UUID,
297
+ ids: IDs,
298
+ embeddings: Optional[Embeddings] = None,
299
+ metadatas: Optional[Metadatas] = None,
300
+ documents: Optional[Documents] = None,
301
+ uris: Optional[URIs] = None,
302
+ ) -> bool:
303
+ return self._server._update(
304
+ collection_id=collection_id,
305
+ ids=ids,
306
+ embeddings=embeddings,
307
+ metadatas=metadatas,
308
+ documents=documents,
309
+ uris=uris,
310
+ )
311
+
312
+ @override
313
+ def _upsert(
314
+ self,
315
+ collection_id: UUID,
316
+ ids: IDs,
317
+ embeddings: Embeddings,
318
+ metadatas: Optional[Metadatas] = None,
319
+ documents: Optional[Documents] = None,
320
+ uris: Optional[URIs] = None,
321
+ ) -> bool:
322
+ return self._server._upsert(
323
+ collection_id=collection_id,
324
+ ids=ids,
325
+ embeddings=embeddings,
326
+ metadatas=metadatas,
327
+ documents=documents,
328
+ uris=uris,
329
+ )
330
+
331
+ @override
332
+ def _count(self, collection_id: UUID) -> int:
333
+ return self._server._count(
334
+ collection_id=collection_id,
335
+ )
336
+
337
+ @override
338
+ def _peek(self, collection_id: UUID, n: int = 10) -> GetResult:
339
+ return self._server._peek(
340
+ collection_id=collection_id,
341
+ n=n,
342
+ )
343
+
344
+ @override
345
+ def _get(
346
+ self,
347
+ collection_id: UUID,
348
+ ids: Optional[IDs] = None,
349
+ where: Optional[Where] = {},
350
+ sort: Optional[str] = None,
351
+ limit: Optional[int] = None,
352
+ offset: Optional[int] = None,
353
+ page: Optional[int] = None,
354
+ page_size: Optional[int] = None,
355
+ where_document: Optional[WhereDocument] = {},
356
+ include: Include = ["embeddings", "metadatas", "documents"],
357
+ ) -> GetResult:
358
+ return self._server._get(
359
+ collection_id=collection_id,
360
+ ids=ids,
361
+ where=where,
362
+ sort=sort,
363
+ limit=limit,
364
+ offset=offset,
365
+ page=page,
366
+ page_size=page_size,
367
+ where_document=where_document,
368
+ include=include,
369
+ )
370
+
371
+ @override
+ def _delete(
372
+ self,
373
+ collection_id: UUID,
374
+ ids: Optional[IDs],
375
+ where: Optional[Where] = {},
376
+ where_document: Optional[WhereDocument] = {},
377
+ ) -> IDs:
378
+ return self._server._delete(
379
+ collection_id=collection_id,
380
+ ids=ids,
381
+ where=where,
382
+ where_document=where_document,
383
+ )
384
+
385
+ @override
386
+ def _query(
387
+ self,
388
+ collection_id: UUID,
389
+ query_embeddings: Embeddings,
390
+ n_results: int = 10,
391
+ where: Where = {},
392
+ where_document: WhereDocument = {},
393
+ include: Include = ["embeddings", "metadatas", "documents", "distances"],
394
+ ) -> QueryResult:
395
+ return self._server._query(
396
+ collection_id=collection_id,
397
+ query_embeddings=query_embeddings,
398
+ n_results=n_results,
399
+ where=where,
400
+ where_document=where_document,
401
+ include=include,
402
+ )
403
+
404
+ @override
405
+ def reset(self) -> bool:
406
+ return self._server.reset()
407
+
408
+ @override
409
+ def get_version(self) -> str:
410
+ return self._server.get_version()
411
+
412
+ @override
413
+ def get_settings(self) -> Settings:
414
+ return self._server.get_settings()
415
+
416
+ @property
417
+ @override
418
+ def max_batch_size(self) -> int:
419
+ return self._server.max_batch_size
420
+
421
+ # endregion
422
+
423
+ # region ClientAPI Methods
424
+
425
+ @override
426
+ def set_tenant(self, tenant: str, database: str = DEFAULT_DATABASE) -> None:
427
+ self._validate_tenant_database(tenant=tenant, database=database)
428
+ self.tenant = tenant
429
+ self.database = database
430
+
431
+ @override
432
+ def set_database(self, database: str) -> None:
433
+ self._validate_tenant_database(tenant=self.tenant, database=database)
434
+ self.database = database
435
+
436
+ def _validate_tenant_database(self, tenant: str, database: str) -> None:
437
+ try:
438
+ self._admin_client.get_tenant(name=tenant)
439
+ except requests.exceptions.ConnectionError:
440
+ raise ValueError(
441
+ "Could not connect to a Chroma server. Are you sure it is running?"
442
+ )
443
+ # Propagate ChromaErrors
444
+ except ChromaError as e:
445
+ raise e
446
+ except Exception:
447
+ raise ValueError(
448
+ f"Could not connect to tenant {tenant}. Are you sure it exists?"
449
+ )
450
+
451
+ try:
452
+ self._admin_client.get_database(name=database, tenant=tenant)
453
+ except requests.exceptions.ConnectionError:
454
+ raise ValueError(
455
+ "Could not connect to a Chroma server. Are you sure it is running?"
456
+ )
457
+ except Exception:
458
+ raise ValueError(
459
+ f"Could not connect to database {database} for tenant {tenant}. Are you sure it exists?"
460
+ )
461
+
462
+ # endregion
463
+
464
+
465
+ class AdminClient(SharedSystemClient, AdminAPI):
466
+ _server: ServerAPI
467
+
468
+ def __init__(self, settings: Settings = Settings()) -> None:
469
+ super().__init__(settings)
470
+ self._server = self._system.instance(ServerAPI)
471
+
472
+ @override
473
+ def create_database(self, name: str, tenant: str = DEFAULT_TENANT) -> None:
474
+ return self._server.create_database(name=name, tenant=tenant)
475
+
476
+ @override
477
+ def get_database(self, name: str, tenant: str = DEFAULT_TENANT) -> Database:
478
+ return self._server.get_database(name=name, tenant=tenant)
479
+
480
+ @override
481
+ def create_tenant(self, name: str) -> None:
482
+ return self._server.create_tenant(name=name)
483
+
484
+ @override
485
+ def get_tenant(self, name: str) -> Tenant:
486
+ return self._server.get_tenant(name=name)
487
+
488
+ @classmethod
489
+ @override
490
+ def from_system(
491
+ cls,
492
+ system: System,
493
+ ) -> "AdminClient":
494
+ SharedSystemClient._populate_data_from_system(system)
495
+ instance = cls(settings=system.settings)
496
+ return instance
chromadb/api/fastapi.py ADDED
@@ -0,0 +1,654 @@
1
+ import json
2
+ import logging
3
+ from typing import Optional, cast, Tuple
4
+ from typing import Sequence
5
+ from uuid import UUID
6
+
7
+ import requests
8
+ from overrides import override
9
+
10
+ import chromadb.errors as errors
11
+ from chromadb.types import Database, Tenant
12
+ import chromadb.utils.embedding_functions as ef
13
+ from chromadb.api import ServerAPI
14
+ from chromadb.api.models.Collection import Collection
15
+ from chromadb.api.types import (
16
+ DataLoader,
17
+ Documents,
18
+ Embeddable,
19
+ Embeddings,
20
+ EmbeddingFunction,
21
+ IDs,
22
+ Include,
23
+ Loadable,
24
+ Metadatas,
25
+ URIs,
26
+ Where,
27
+ WhereDocument,
28
+ GetResult,
29
+ QueryResult,
30
+ CollectionMetadata,
31
+ validate_batch,
32
+ )
33
+ from chromadb.auth import (
34
+ ClientAuthProvider,
35
+ )
36
+ from chromadb.auth.providers import RequestsClientAuthProtocolAdapter
37
+ from chromadb.auth.registry import resolve_provider
38
+ from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Settings, System
39
+ from chromadb.telemetry.opentelemetry import (
40
+ OpenTelemetryClient,
41
+ OpenTelemetryGranularity,
42
+ trace_method,
43
+ )
44
+ from chromadb.telemetry.product import ProductTelemetryClient
45
+ from urllib.parse import urlparse, urlunparse, quote
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
+ class FastAPI(ServerAPI):
51
+ _settings: Settings
52
+ _max_batch_size: int = -1
53
+
54
+ @staticmethod
55
+ def _validate_host(host: str) -> None:
56
+ parsed = urlparse(host)
57
+ if "/" in host and parsed.scheme not in {"http", "https"}:
58
+ raise ValueError(
59
+ "Invalid URL. " f"Unrecognized protocol - {parsed.scheme}."
60
+ )
61
+ if "/" in host and (not host.startswith("http")):
62
+ raise ValueError(
63
+ "Invalid URL. "
64
+ "Seems that you are trying to pass URL as a host but without \
65
+ specifying the protocol. "
66
+ "Please add http:// or https:// to the host."
67
+ )
68
+
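+ # e.g. "localhost" and "http://example.com" pass the validation above,
+ # while "example.com/api" raises because it contains a path but no
+ # recognized protocol.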
69
+ @staticmethod
70
+ def resolve_url(
71
+ chroma_server_host: str,
72
+ chroma_server_ssl_enabled: Optional[bool] = False,
73
+ default_api_path: Optional[str] = "",
74
+ chroma_server_http_port: Optional[int] = 8000,
75
+ ) -> str:
76
+ _skip_port = False
77
+ _chroma_server_host = chroma_server_host
78
+ FastAPI._validate_host(_chroma_server_host)
79
+ if _chroma_server_host.startswith("http"):
80
+ logger.debug("Skipping port as the user is passing a full URL")
81
+ _skip_port = True
82
+ parsed = urlparse(_chroma_server_host)
83
+
84
+ scheme = "https" if chroma_server_ssl_enabled else parsed.scheme or "http"
85
+ net_loc = parsed.netloc or parsed.hostname or chroma_server_host
86
+ port = (
87
+ ":" + str(parsed.port or chroma_server_http_port) if not _skip_port else ""
88
+ )
89
+ path = parsed.path or default_api_path
90
+
91
+ if not path or path == net_loc:
92
+ path = default_api_path if default_api_path else ""
93
+ if not path.endswith(default_api_path or ""):
94
+ path = path + default_api_path if default_api_path else ""
95
+ full_url = urlunparse(
96
+ (scheme, f"{net_loc}{port}", quote(path.replace("//", "/")), "", "", "")
97
+ )
98
+
99
+ return full_url
100
+
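+ # Worked examples of the resolver above, assuming the defaults shown
+ # (port 8000, empty default_api_path):
+ #   resolve_url("localhost")                    -> "http://localhost:8000"
+ #   resolve_url("localhost", chroma_server_ssl_enabled=True)
+ #                                               -> "https://localhost:8000"
+ #   resolve_url("http://localhost:9000/api/v1") -> "http://localhost:9000/api/v1"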
101
+ def __init__(self, system: System):
102
+ super().__init__(system)
103
+ system.settings.require("chroma_server_host")
104
+ system.settings.require("chroma_server_http_port")
105
+
106
+ self._opentelemetry_client = self.require(OpenTelemetryClient)
107
+ self._product_telemetry_client = self.require(ProductTelemetryClient)
108
+ self._settings = system.settings
109
+
110
+ self._api_url = FastAPI.resolve_url(
111
+ chroma_server_host=str(system.settings.chroma_server_host),
112
+ chroma_server_http_port=int(str(system.settings.chroma_server_http_port)),
113
+ chroma_server_ssl_enabled=system.settings.chroma_server_ssl_enabled,
114
+ default_api_path=system.settings.chroma_server_api_default_path,
115
+ )
116
+
117
+ self._header = system.settings.chroma_server_headers
118
+ if (
119
+ system.settings.chroma_client_auth_provider
120
+ and system.settings.chroma_client_auth_protocol_adapter
121
+ ):
122
+ self._auth_provider = self.require(
123
+ resolve_provider(
124
+ system.settings.chroma_client_auth_provider, ClientAuthProvider
125
+ )
126
+ )
127
+ self._adapter = cast(
128
+ RequestsClientAuthProtocolAdapter,
129
+ system.require(
130
+ resolve_provider(
131
+ system.settings.chroma_client_auth_protocol_adapter,
132
+ RequestsClientAuthProtocolAdapter,
133
+ )
134
+ ),
135
+ )
136
+ self._session = self._adapter.session
137
+ else:
138
+ self._session = requests.Session()
139
+ if self._header is not None:
140
+ self._session.headers.update(self._header)
141
+ if self._settings.chroma_server_ssl_verify is not None:
142
+ self._session.verify = self._settings.chroma_server_ssl_verify
143
+
144
+ @trace_method("FastAPI.heartbeat", OpenTelemetryGranularity.OPERATION)
145
+ @override
146
+ def heartbeat(self) -> int:
147
+ """Returns the current server time in nanoseconds to check if the server is alive"""
148
+ resp = self._session.get(self._api_url)
149
+ raise_chroma_error(resp)
150
+ return int(resp.json()["nanosecond heartbeat"])
151
+
152
+ @trace_method("FastAPI.create_database", OpenTelemetryGranularity.OPERATION)
153
+ @override
154
+ def create_database(
155
+ self,
156
+ name: str,
157
+ tenant: str = DEFAULT_TENANT,
158
+ ) -> None:
159
+ """Creates a database"""
160
+ resp = self._session.post(
161
+ self._api_url + "/databases",
162
+ data=json.dumps({"name": name}),
163
+ params={"tenant": tenant},
164
+ )
165
+ raise_chroma_error(resp)
166
+
167
+ @trace_method("FastAPI.get_database", OpenTelemetryGranularity.OPERATION)
168
+ @override
169
+ def get_database(
170
+ self,
171
+ name: str,
172
+ tenant: str = DEFAULT_TENANT,
173
+ ) -> Database:
174
+ """Returns a database"""
175
+ resp = self._session.get(
176
+ self._api_url + "/databases/" + name,
177
+ params={"tenant": tenant},
178
+ )
179
+ raise_chroma_error(resp)
180
+ resp_json = resp.json()
181
+ return Database(
182
+ id=resp_json["id"], name=resp_json["name"], tenant=resp_json["tenant"]
183
+ )
184
+
185
+ @trace_method("FastAPI.create_tenant", OpenTelemetryGranularity.OPERATION)
186
+ @override
187
+ def create_tenant(self, name: str) -> None:
188
+ resp = self._session.post(
189
+ self._api_url + "/tenants",
190
+ data=json.dumps({"name": name}),
191
+ )
192
+ raise_chroma_error(resp)
193
+
194
+ @trace_method("FastAPI.get_tenant", OpenTelemetryGranularity.OPERATION)
195
+ @override
196
+ def get_tenant(self, name: str) -> Tenant:
197
+ resp = self._session.get(
198
+ self._api_url + "/tenants/" + name,
199
+ )
200
+ raise_chroma_error(resp)
201
+ resp_json = resp.json()
202
+ return Tenant(name=resp_json["name"])
203
+
204
+ @trace_method("FastAPI.list_collections", OpenTelemetryGranularity.OPERATION)
205
+ @override
206
+ def list_collections(
207
+ self,
208
+ limit: Optional[int] = None,
209
+ offset: Optional[int] = None,
210
+ tenant: str = DEFAULT_TENANT,
211
+ database: str = DEFAULT_DATABASE,
212
+ ) -> Sequence[Collection]:
213
+ """Returns a list of all collections"""
214
+ resp = self._session.get(
215
+ self._api_url + "/collections",
216
+ params={
217
+ "tenant": tenant,
218
+ "database": database,
219
+ "limit": limit,
220
+ "offset": offset,
221
+ },
222
+ )
223
+ raise_chroma_error(resp)
224
+ json_collections = resp.json()
225
+ collections = []
226
+ for json_collection in json_collections:
227
+ collections.append(Collection(self, **json_collection))
228
+
229
+ return collections
230
+
231
+ @trace_method("FastAPI.count_collections", OpenTelemetryGranularity.OPERATION)
232
+ @override
233
+ def count_collections(
234
+ self, tenant: str = DEFAULT_TENANT, database: str = DEFAULT_DATABASE
235
+ ) -> int:
236
+ """Returns a count of collections"""
237
+ resp = self._session.get(
238
+ self._api_url + "/count_collections",
239
+ params={"tenant": tenant, "database": database},
240
+ )
241
+ raise_chroma_error(resp)
242
+ return cast(int, resp.json())
243
+
244
+ @trace_method("FastAPI.create_collection", OpenTelemetryGranularity.OPERATION)
245
+ @override
246
+ def create_collection(
247
+ self,
248
+ name: str,
249
+ metadata: Optional[CollectionMetadata] = None,
250
+ embedding_function: Optional[
251
+ EmbeddingFunction[Embeddable]
252
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
253
+ data_loader: Optional[DataLoader[Loadable]] = None,
254
+ get_or_create: bool = False,
255
+ tenant: str = DEFAULT_TENANT,
256
+ database: str = DEFAULT_DATABASE,
257
+ ) -> Collection:
258
+ """Creates a collection"""
259
+ resp = self._session.post(
260
+ self._api_url + "/collections",
261
+ data=json.dumps(
262
+ {
263
+ "name": name,
264
+ "metadata": metadata,
265
+ "get_or_create": get_or_create,
266
+ }
267
+ ),
268
+ params={"tenant": tenant, "database": database},
269
+ )
270
+ raise_chroma_error(resp)
271
+ resp_json = resp.json()
272
+ return Collection(
273
+ client=self,
274
+ id=resp_json["id"],
275
+ name=resp_json["name"],
276
+ embedding_function=embedding_function,
277
+ data_loader=data_loader,
278
+ metadata=resp_json["metadata"],
279
+ )
280
+
281
+ @trace_method("FastAPI.get_collection", OpenTelemetryGranularity.OPERATION)
282
+ @override
283
+ def get_collection(
284
+ self,
285
+ name: str,
286
+ id: Optional[UUID] = None,
287
+ embedding_function: Optional[
288
+ EmbeddingFunction[Embeddable]
289
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
290
+ data_loader: Optional[DataLoader[Loadable]] = None,
291
+ tenant: str = DEFAULT_TENANT,
292
+ database: str = DEFAULT_DATABASE,
293
+ ) -> Collection:
294
+ """Returns a collection"""
295
+ if (name is None and id is None) or (name is not None and id is not None):
296
+ raise ValueError("Name or id must be specified, but not both")
297
+
298
+ _params = {"tenant": tenant, "database": database}
299
+ if id is not None:
300
+ _params["type"] = str(id)
301
+ resp = self._session.get(
302
+ self._api_url + "/collections/" + name if name else str(id), params=_params
303
+ )
304
+ raise_chroma_error(resp)
305
+ resp_json = resp.json()
306
+ return Collection(
307
+ client=self,
308
+ name=resp_json["name"],
309
+ id=resp_json["id"],
310
+ embedding_function=embedding_function,
311
+ data_loader=data_loader,
312
+ metadata=resp_json["metadata"],
313
+ )
314
+
315
+ @trace_method(
316
+ "FastAPI.get_or_create_collection", OpenTelemetryGranularity.OPERATION
317
+ )
318
+ @override
319
+ def get_or_create_collection(
320
+ self,
321
+ name: str,
322
+ metadata: Optional[CollectionMetadata] = None,
323
+ embedding_function: Optional[
324
+ EmbeddingFunction[Embeddable]
325
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
326
+ data_loader: Optional[DataLoader[Loadable]] = None,
327
+ tenant: str = DEFAULT_TENANT,
328
+ database: str = DEFAULT_DATABASE,
329
+ ) -> Collection:
330
+ return cast(
331
+ Collection,
332
+ self.create_collection(
333
+ name=name,
334
+ metadata=metadata,
335
+ embedding_function=embedding_function,
336
+ data_loader=data_loader,
337
+ get_or_create=True,
338
+ tenant=tenant,
339
+ database=database,
340
+ ),
341
+ )
342
+
343
+ @trace_method("FastAPI._modify", OpenTelemetryGranularity.OPERATION)
344
+ @override
345
+ def _modify(
346
+ self,
347
+ id: UUID,
348
+ new_name: Optional[str] = None,
349
+ new_metadata: Optional[CollectionMetadata] = None,
350
+ ) -> None:
351
+ """Updates a collection"""
352
+ resp = self._session.put(
353
+ self._api_url + "/collections/" + str(id),
354
+ data=json.dumps({"new_metadata": new_metadata, "new_name": new_name}),
355
+ )
356
+ raise_chroma_error(resp)
357
+
358
+ @trace_method("FastAPI.delete_collection", OpenTelemetryGranularity.OPERATION)
359
+ @override
360
+ def delete_collection(
361
+ self,
362
+ name: str,
363
+ tenant: str = DEFAULT_TENANT,
364
+ database: str = DEFAULT_DATABASE,
365
+ ) -> None:
366
+ """Deletes a collection"""
367
+ resp = self._session.delete(
368
+ self._api_url + "/collections/" + name,
369
+ params={"tenant": tenant, "database": database},
370
+ )
371
+ raise_chroma_error(resp)
372
+
373
+ @trace_method("FastAPI._count", OpenTelemetryGranularity.OPERATION)
374
+ @override
375
+ def _count(
376
+ self,
377
+ collection_id: UUID,
378
+ ) -> int:
379
+ """Returns the number of embeddings in the database"""
380
+ resp = self._session.get(
381
+ self._api_url + "/collections/" + str(collection_id) + "/count"
382
+ )
383
+ raise_chroma_error(resp)
384
+ return cast(int, resp.json())
385
+
386
+ @trace_method("FastAPI._peek", OpenTelemetryGranularity.OPERATION)
387
+ @override
388
+ def _peek(
389
+ self,
390
+ collection_id: UUID,
391
+ n: int = 10,
392
+ ) -> GetResult:
393
+ return cast(
394
+ GetResult,
395
+ self._get(
396
+ collection_id,
397
+ limit=n,
398
+ include=["embeddings", "documents", "metadatas"],
399
+ ),
400
+ )
401
+
402
+ @trace_method("FastAPI._get", OpenTelemetryGranularity.OPERATION)
403
+ @override
404
+ def _get(
405
+ self,
406
+ collection_id: UUID,
407
+ ids: Optional[IDs] = None,
408
+ where: Optional[Where] = {},
409
+ sort: Optional[str] = None,
410
+ limit: Optional[int] = None,
411
+ offset: Optional[int] = None,
412
+ page: Optional[int] = None,
413
+ page_size: Optional[int] = None,
414
+ where_document: Optional[WhereDocument] = {},
415
+ include: Include = ["metadatas", "documents"],
416
+ ) -> GetResult:
417
+ if page and page_size:
418
+ offset = (page - 1) * page_size
419
+ limit = page_size
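+ # e.g. page=3, page_size=20 -> offset=40, limit=20 (pages are 1-indexed)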
420
+
421
+ resp = self._session.post(
422
+ self._api_url + "/collections/" + str(collection_id) + "/get",
423
+ data=json.dumps(
424
+ {
425
+ "ids": ids,
426
+ "where": where,
427
+ "sort": sort,
428
+ "limit": limit,
429
+ "offset": offset,
430
+ "where_document": where_document,
431
+ "include": include,
432
+ }
433
+ ),
434
+ )
435
+
436
+ raise_chroma_error(resp)
437
+ body = resp.json()
438
+ return GetResult(
439
+ ids=body["ids"],
440
+ embeddings=body.get("embeddings", None),
441
+ metadatas=body.get("metadatas", None),
442
+ documents=body.get("documents", None),
443
+ data=None,
444
+ uris=body.get("uris", None),
445
+ )
446
+
447
+ @trace_method("FastAPI._delete", OpenTelemetryGranularity.OPERATION)
448
+ @override
449
+ def _delete(
450
+ self,
451
+ collection_id: UUID,
452
+ ids: Optional[IDs] = None,
453
+ where: Optional[Where] = {},
454
+ where_document: Optional[WhereDocument] = {},
455
+ ) -> IDs:
456
+ """Deletes embeddings from the database"""
457
+ resp = self._session.post(
458
+ self._api_url + "/collections/" + str(collection_id) + "/delete",
459
+ data=json.dumps(
460
+ {"where": where, "ids": ids, "where_document": where_document}
461
+ ),
462
+ )
463
+
464
+ raise_chroma_error(resp)
465
+ return cast(IDs, resp.json())
466
+
467
+ @trace_method("FastAPI._submit_batch", OpenTelemetryGranularity.ALL)
468
+ def _submit_batch(
469
+ self,
470
+ batch: Tuple[
471
+ IDs,
472
+ Optional[Embeddings],
473
+ Optional[Metadatas],
474
+ Optional[Documents],
475
+ Optional[URIs],
476
+ ],
477
+ url: str,
478
+ ) -> requests.Response:
479
+ """
480
+ Submits a batch of embeddings to the database
481
+ """
482
+ resp = self._session.post(
483
+ self._api_url + url,
484
+ data=json.dumps(
485
+ {
486
+ "ids": batch[0],
487
+ "embeddings": batch[1],
488
+ "metadatas": batch[2],
489
+ "documents": batch[3],
490
+ "uris": batch[4],
491
+ }
492
+ ),
493
+ )
494
+ return resp
495
+
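+ # The batch tuple above is column-oriented; a hypothetical two-record batch:
+ #   (["id1", "id2"],            # ids
+ #    [[0.1, 0.2], [0.3, 0.4]],  # embeddings
+ #    [{"k": "a"}, {"k": "b"}],  # metadatas
+ #    ["doc one", "doc two"],    # documents
+ #    None)                      # uris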
496
+ @trace_method("FastAPI._add", OpenTelemetryGranularity.ALL)
497
+ @override
498
+ def _add(
499
+ self,
500
+ ids: IDs,
501
+ collection_id: UUID,
502
+ embeddings: Embeddings,
503
+ metadatas: Optional[Metadatas] = None,
504
+ documents: Optional[Documents] = None,
505
+ uris: Optional[URIs] = None,
506
+ ) -> bool:
507
+ """
508
+ Adds a batch of embeddings to the database
509
+ - pass in column oriented data lists
510
+ """
511
+ batch = (ids, embeddings, metadatas, documents, uris)
512
+ validate_batch(batch, {"max_batch_size": self.max_batch_size})
513
+ resp = self._submit_batch(batch, "/collections/" + str(collection_id) + "/add")
514
+ raise_chroma_error(resp)
515
+ return True
516
+
517
+ @trace_method("FastAPI._update", OpenTelemetryGranularity.ALL)
518
+ @override
519
+ def _update(
520
+ self,
521
+ collection_id: UUID,
522
+ ids: IDs,
523
+ embeddings: Optional[Embeddings] = None,
524
+ metadatas: Optional[Metadatas] = None,
525
+ documents: Optional[Documents] = None,
526
+ uris: Optional[URIs] = None,
527
+ ) -> bool:
528
+ """
529
+ Updates a batch of embeddings in the database
530
+ - pass in column oriented data lists
531
+ """
532
+ batch = (ids, embeddings, metadatas, documents, uris)
533
+ validate_batch(batch, {"max_batch_size": self.max_batch_size})
534
+ resp = self._submit_batch(
535
+ batch, "/collections/" + str(collection_id) + "/update"
536
+ )
537
+ raise_chroma_error(resp)
538
+ return True
539
+
540
+ @trace_method("FastAPI._upsert", OpenTelemetryGranularity.ALL)
541
+ @override
542
+ def _upsert(
543
+ self,
544
+ collection_id: UUID,
545
+ ids: IDs,
546
+ embeddings: Embeddings,
547
+ metadatas: Optional[Metadatas] = None,
548
+ documents: Optional[Documents] = None,
549
+ uris: Optional[URIs] = None,
550
+ ) -> bool:
551
+ """
552
+ Upserts a batch of embeddings in the database
553
+ - pass in column oriented data lists
554
+ """
555
+ batch = (ids, embeddings, metadatas, documents, uris)
556
+ validate_batch(batch, {"max_batch_size": self.max_batch_size})
557
+ resp = self._submit_batch(
558
+ batch, "/collections/" + str(collection_id) + "/upsert"
559
+ )
560
+ raise_chroma_error(resp)
561
+ return True
562
+
563
+ @trace_method("FastAPI._query", OpenTelemetryGranularity.ALL)
564
+ @override
565
+ def _query(
566
+ self,
567
+ collection_id: UUID,
568
+ query_embeddings: Embeddings,
569
+ n_results: int = 10,
570
+ where: Optional[Where] = {},
571
+ where_document: Optional[WhereDocument] = {},
572
+ include: Include = ["metadatas", "documents", "distances"],
573
+ ) -> QueryResult:
574
+ """Gets the nearest neighbors of a single embedding"""
575
+ resp = self._session.post(
576
+ self._api_url + "/collections/" + str(collection_id) + "/query",
577
+ data=json.dumps(
578
+ {
579
+ "query_embeddings": query_embeddings,
580
+ "n_results": n_results,
581
+ "where": where,
582
+ "where_document": where_document,
583
+ "include": include,
584
+ }
585
+ ),
586
+ )
587
+
588
+ raise_chroma_error(resp)
589
+ body = resp.json()
590
+
591
+ return QueryResult(
592
+ ids=body["ids"],
593
+ distances=body.get("distances", None),
594
+ embeddings=body.get("embeddings", None),
595
+ metadatas=body.get("metadatas", None),
596
+ documents=body.get("documents", None),
597
+ uris=body.get("uris", None),
598
+ data=None,
599
+ )
600
+
601
+ @trace_method("FastAPI.reset", OpenTelemetryGranularity.ALL)
602
+ @override
603
+ def reset(self) -> bool:
604
+ """Resets the database"""
605
+ resp = self._session.post(self._api_url + "/reset")
606
+ raise_chroma_error(resp)
607
+ return cast(bool, resp.json())
608
+
609
+ @trace_method("FastAPI.get_version", OpenTelemetryGranularity.OPERATION)
610
+ @override
611
+ def get_version(self) -> str:
612
+ """Returns the version of the server"""
613
+ resp = self._session.get(self._api_url + "/version")
614
+ raise_chroma_error(resp)
615
+ return cast(str, resp.json())
616
+
617
+ @override
618
+ def get_settings(self) -> Settings:
619
+ """Returns the settings of the client"""
620
+ return self._settings
621
+
622
+ @property
623
+ @trace_method("FastAPI.max_batch_size", OpenTelemetryGranularity.OPERATION)
624
+ @override
625
+ def max_batch_size(self) -> int:
626
+ if self._max_batch_size == -1:
627
+ resp = self._session.get(self._api_url + "/pre-flight-checks")
628
+ raise_chroma_error(resp)
629
+ self._max_batch_size = cast(int, resp.json()["max_batch_size"])
630
+ return self._max_batch_size
631
+
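+ # The property above fetches /pre-flight-checks once and caches the result,
+ # so callers can cheaply chunk large writes. A sketch (hypothetical ids and
+ # docs, not part of this diff):
+ #   n = client.max_batch_size
+ #   for i in range(0, len(ids), n):
+ #       collection.add(ids=ids[i : i + n], documents=docs[i : i + n])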
632
+
633
+ def raise_chroma_error(resp: requests.Response) -> None:
634
+ """Raises an error if the response is not ok, using a ChromaError if possible"""
635
+ if resp.ok:
636
+ return
637
+
638
+ chroma_error = None
639
+ try:
640
+ body = resp.json()
641
+ if "error" in body:
642
+ if body["error"] in errors.error_types:
643
+ chroma_error = errors.error_types[body["error"]](body["message"])
644
+
645
+ except BaseException:
646
+ pass
647
+
648
+ if chroma_error:
649
+ raise chroma_error
650
+
651
+ try:
652
+ resp.raise_for_status()
653
+ except requests.HTTPError:
654
+ raise Exception(resp.text)
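+ # Illustration (assumed server error shape): a JSON body such as
+ # {"error": "InvalidCollection", "message": "Collection foo does not exist"}
+ # is re-raised client-side as the matching ChromaError subclass, provided
+ # that error name is registered in chromadb.errors.error_types; anything
+ # else falls through to raise_for_status() and the generic Exception above.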
chromadb/api/models/Collection.py ADDED
@@ -0,0 +1,633 @@
1
+ from typing import TYPE_CHECKING, Optional, Tuple, Any, Union
2
+
3
+ import numpy as np
4
+ from pydantic import BaseModel, PrivateAttr
5
+
6
+ from uuid import UUID
7
+ import chromadb.utils.embedding_functions as ef
8
+
9
+ from chromadb.api.types import (
10
+ URI,
11
+ CollectionMetadata,
12
+ DataLoader,
13
+ Embedding,
14
+ Embeddings,
15
+ Embeddable,
16
+ Include,
17
+ Loadable,
18
+ Metadata,
19
+ Metadatas,
20
+ Document,
21
+ Documents,
22
+ Image,
23
+ Images,
24
+ URIs,
25
+ Where,
26
+ IDs,
27
+ EmbeddingFunction,
28
+ GetResult,
29
+ QueryResult,
30
+ ID,
31
+ OneOrMany,
32
+ WhereDocument,
33
+ maybe_cast_one_to_many_ids,
34
+ maybe_cast_one_to_many_embedding,
35
+ maybe_cast_one_to_many_metadata,
36
+ maybe_cast_one_to_many_document,
37
+ maybe_cast_one_to_many_image,
38
+ maybe_cast_one_to_many_uri,
39
+ validate_ids,
40
+ validate_include,
41
+ validate_metadata,
42
+ validate_metadatas,
43
+ validate_where,
44
+ validate_where_document,
45
+ validate_n_results,
46
+ validate_embeddings,
47
+ validate_embedding_function,
48
+ )
49
+ import logging
50
+
51
+ logger = logging.getLogger(__name__)
52
+
53
+ if TYPE_CHECKING:
54
+ from chromadb.api import ServerAPI
55
+
56
+
57
+ class Collection(BaseModel):
58
+ name: str
59
+ id: UUID
60
+ metadata: Optional[CollectionMetadata] = None
61
+ tenant: Optional[str] = None
62
+ database: Optional[str] = None
63
+ _client: "ServerAPI" = PrivateAttr()
64
+ _embedding_function: Optional[EmbeddingFunction[Embeddable]] = PrivateAttr()
65
+ _data_loader: Optional[DataLoader[Loadable]] = PrivateAttr()
66
+
67
+ def __init__(
68
+ self,
69
+ client: "ServerAPI",
70
+ name: str,
71
+ id: UUID,
72
+ embedding_function: Optional[
73
+ EmbeddingFunction[Embeddable]
74
+ ] = ef.DefaultEmbeddingFunction(), # type: ignore
75
+ data_loader: Optional[DataLoader[Loadable]] = None,
76
+ tenant: Optional[str] = None,
77
+ database: Optional[str] = None,
78
+ metadata: Optional[CollectionMetadata] = None,
79
+ ):
80
+ super().__init__(
81
+ name=name, metadata=metadata, id=id, tenant=tenant, database=database
82
+ )
83
+ self._client = client
84
+
85
+ # Check to make sure the embedding function has the right signature, as defined by the EmbeddingFunction protocol
86
+ if embedding_function is not None:
87
+ validate_embedding_function(embedding_function)
88
+
89
+ self._embedding_function = embedding_function
90
+ self._data_loader = data_loader
91
+
92
+ def __repr__(self) -> str:
93
+ return f"Collection(name={self.name})"
94
+
95
+ def count(self) -> int:
96
+ """The total number of embeddings added to the database
97
+
98
+ Returns:
99
+ int: The total number of embeddings added to the database
100
+
101
+ """
102
+ return self._client._count(collection_id=self.id)
103
+
104
+ def add(
105
+ self,
106
+ ids: OneOrMany[ID],
107
+ embeddings: Optional[
108
+ Union[
109
+ OneOrMany[Embedding],
110
+ OneOrMany[np.ndarray],
111
+ ]
112
+ ] = None,
113
+ metadatas: Optional[OneOrMany[Metadata]] = None,
114
+ documents: Optional[OneOrMany[Document]] = None,
115
+ images: Optional[OneOrMany[Image]] = None,
116
+ uris: Optional[OneOrMany[URI]] = None,
117
+ ) -> None:
118
+ """Add embeddings to the data store.
119
+ Args:
120
+ ids: The ids of the embeddings you wish to add
121
+ embeddings: The embeddings to add. If None, embeddings will be computed based on the documents or images using the embedding_function set for the Collection. Optional.
122
+ metadatas: The metadata to associate with the embeddings. When querying, you can filter on this metadata. Optional.
123
+ documents: The documents to associate with the embeddings. Optional.
124
+ images: The images to associate with the embeddings. Optional.
125
+ uris: The uris of the images to associate with the embeddings. Optional.
126
+
127
+ Returns:
128
+ None
129
+
130
+ Raises:
131
+ ValueError: If you don't provide one of embeddings, documents, images, or uris
131
+ ValueError: If the lengths of ids, embeddings, metadatas, or documents don't match
132
+ ValueError: If you don't provide an embedding function and don't provide embeddings
133
+ ValueError: If you provide both documents and images
135
+ ValueError: If you provide an id that already exists
136
+
137
+ """
138
+
139
+ (
140
+ ids,
141
+ embeddings,
142
+ metadatas,
143
+ documents,
144
+ images,
145
+ uris,
146
+ ) = self._validate_embedding_set(
147
+ ids, embeddings, metadatas, documents, images, uris
148
+ )
149
+
150
+ # We need to compute the embeddings if they're not provided
151
+ if embeddings is None:
152
+ # At this point, we know that one of documents or images are provided from the validation above
153
+ if documents is not None:
154
+ embeddings = self._embed(input=documents)
155
+ elif images is not None:
156
+ embeddings = self._embed(input=images)
157
+ else:
158
+ if uris is None:
159
+ raise ValueError(
160
+ "You must provide either embeddings, documents, images, or uris."
161
+ )
162
+ if self._data_loader is None:
163
+ raise ValueError(
164
+ "You must set a data loader on the collection if loading from URIs."
165
+ )
166
+ embeddings = self._embed(self._data_loader(uris))
167
+
168
+ self._client._add(ids, self.id, embeddings, metadatas, documents, uris)
169
+
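+ # A minimal usage sketch of add() (hypothetical data, relying on the
+ # collection's embedding function to compute embeddings from documents):
+ #   collection.add(
+ #       ids=["doc1", "doc2"],
+ #       documents=["first document", "second document"],
+ #       metadatas=[{"source": "a"}, {"source": "b"}],
+ #   )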
170
+ def get(
171
+ self,
172
+ ids: Optional[OneOrMany[ID]] = None,
173
+ where: Optional[Where] = None,
174
+ limit: Optional[int] = None,
175
+ offset: Optional[int] = None,
176
+ where_document: Optional[WhereDocument] = None,
177
+ include: Include = ["metadatas", "documents"],
178
+ ) -> GetResult:
179
+ """Get embeddings and their associate data from the data store. If no ids or where filter is provided returns
180
+ all embeddings up to limit starting at offset.
181
+
182
+ Args:
183
+ ids: The ids of the embeddings to get. Optional.
184
+ where: A Where type dict used to filter results by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
185
+ limit: The number of documents to return. Optional.
186
+ offset: The offset to start returning results from. Useful for paging results with limit. Optional.
187
+ where_document: A WhereDocument type dict used to filter by the documents. E.g. `{"$contains": "hello"}`. Optional.
188
+ include: A list of what to include in the results. Can contain `"embeddings"`, `"metadatas"`, `"documents"`. Ids are always included. Defaults to `["metadatas", "documents"]`. Optional.
189
+
190
+ Returns:
191
+ GetResult: A GetResult object containing the results.
192
+
193
+ """
194
+
195
+ valid_where = validate_where(where) if where else None
196
+ valid_where_document = (
197
+ validate_where_document(where_document) if where_document else None
198
+ )
199
+ valid_ids = validate_ids(maybe_cast_one_to_many_ids(ids)) if ids else None
200
+ valid_include = validate_include(include, allow_distances=False)
201
+
202
+ if "data" in include and self._data_loader is None:
203
+ raise ValueError(
204
+ "You must set a data loader on the collection if loading from URIs."
205
+ )
206
+
207
+ # We need to include uris in the result from the API to load datas
208
+ if "data" in include and "uris" not in include:
209
+ valid_include.append("uris")
210
+
211
+ get_results = self._client._get(
212
+ self.id,
213
+ valid_ids,
214
+ valid_where,
215
+ None,
216
+ limit,
217
+ offset,
218
+ where_document=valid_where_document,
219
+ include=valid_include,
220
+ )
221
+
222
+ if (
223
+ "data" in include
224
+ and self._data_loader is not None
225
+ and get_results["uris"] is not None
226
+ ):
227
+ get_results["data"] = self._data_loader(get_results["uris"])
228
+
229
+ # Remove URIs from the result if they weren't requested
230
+ if "uris" not in include:
231
+ get_results["uris"] = None
232
+
233
+ return get_results
234
+
235
+ def peek(self, limit: int = 10) -> GetResult:
236
+ """Get the first few results in the database up to limit
237
+
238
+ Args:
239
+ limit: The number of results to return.
240
+
241
+ Returns:
242
+ GetResult: A GetResult object containing the results.
243
+ """
244
+ return self._client._peek(self.id, limit)
245
+
246
+ def query(
247
+ self,
248
+ query_embeddings: Optional[
249
+ Union[
250
+ OneOrMany[Embedding],
251
+ OneOrMany[np.ndarray],
252
+ ]
253
+ ] = None,
254
+ query_texts: Optional[OneOrMany[Document]] = None,
255
+ query_images: Optional[OneOrMany[Image]] = None,
256
+ query_uris: Optional[OneOrMany[URI]] = None,
257
+ n_results: int = 10,
258
+ where: Optional[Where] = None,
259
+ where_document: Optional[WhereDocument] = None,
260
+ include: Include = ["metadatas", "documents", "distances"],
261
+ ) -> QueryResult:
262
+ """Get the n_results nearest neighbor embeddings for provided query_embeddings or query_texts.
263
+
264
+ Args:
265
+ query_embeddings: The embeddings to get the closest neighbors of. Optional.
267
+ query_texts: The document texts to get the closest neighbors of. Optional.
268
+ query_images: The images to get the closest neighbors of. Optional.
+ query_uris: The URIs to load and embed via the data loader. Optional.
268
+ n_results: The number of neighbors to return for each query_embedding or query_texts. Optional.
269
+ where: A Where type dict used to filter results by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
270
+ where_document: A WhereDocument type dict used to filter by the documents. E.g. `{"$contains": "hello"}`. Optional.
271
+ include: A list of what to include in the results. Can contain `"embeddings"`, `"metadatas"`, `"documents"`, `"distances"`. Ids are always included. Defaults to `["metadatas", "documents", "distances"]`. Optional.
272
+
273
+ Returns:
274
+ QueryResult: A QueryResult object containing the results.
275
+
276
+ Raises:
277
+ ValueError: If you don't provide either query_embeddings, query_texts, or query_images
278
+ ValueError: If you provide both query_embeddings and query_texts
279
+ ValueError: If you provide both query_embeddings and query_images
280
+ ValueError: If you provide both query_texts and query_images
281
+
282
+ """
283
+
284
+ # Users must provide only one of query_embeddings, query_texts, query_images, or query_uris
285
+ # An XOR chain is true for any odd number of inputs, so count the
286
+ # non-None inputs explicitly to enforce exactly one of them
287
+ provided = (query_embeddings, query_texts, query_images, query_uris)
288
+ if sum(
289
+ arg is not None for arg in provided
290
+ ) != 1:
291
+ raise ValueError(
292
+ "You must provide one of query_embeddings, query_texts, query_images, or query_uris."
293
+ )
294
+
295
+ valid_where = validate_where(where) if where else {}
296
+ valid_where_document = (
297
+ validate_where_document(where_document) if where_document else {}
298
+ )
299
+ valid_query_embeddings = (
300
+ validate_embeddings(
301
+ self._normalize_embeddings(
302
+ maybe_cast_one_to_many_embedding(query_embeddings)
303
+ )
304
+ )
305
+ if query_embeddings is not None
306
+ else None
307
+ )
308
+ valid_query_texts = (
309
+ maybe_cast_one_to_many_document(query_texts)
310
+ if query_texts is not None
311
+ else None
312
+ )
313
+ valid_query_images = (
314
+ maybe_cast_one_to_many_image(query_images)
315
+ if query_images is not None
316
+ else None
317
+ )
318
+ valid_query_uris = (
319
+ maybe_cast_one_to_many_uri(query_uris) if query_uris is not None else None
320
+ )
321
+ valid_include = validate_include(include, allow_distances=True)
322
+ valid_n_results = validate_n_results(n_results)
323
+
324
+ # If query_embeddings are not provided, we need to compute them from the inputs
325
+ if valid_query_embeddings is None:
326
+ if query_texts is not None:
327
+ valid_query_embeddings = self._embed(input=valid_query_texts)
328
+ elif query_images is not None:
329
+ valid_query_embeddings = self._embed(input=valid_query_images)
330
+ else:
331
+ if valid_query_uris is None:
332
+ raise ValueError(
333
+ "You must provide either query_embeddings, query_texts, query_images, or query_uris."
334
+ )
335
+ if self._data_loader is None:
336
+ raise ValueError(
337
+ "You must set a data loader on the collection if loading from URIs."
338
+ )
339
+ valid_query_embeddings = self._embed(
340
+ self._data_loader(valid_query_uris)
341
+ )
342
+
343
+ if "data" in include and "uris" not in include:
344
+ valid_include.append("uris")
345
+ query_results = self._client._query(
346
+ collection_id=self.id,
347
+ query_embeddings=valid_query_embeddings,
348
+ n_results=valid_n_results,
349
+ where=valid_where,
350
+ where_document=valid_where_document,
351
+ include=valid_include,
352
+ )
353
+
354
+ if (
355
+ "data" in include
356
+ and self._data_loader is not None
357
+ and query_results["uris"] is not None
358
+ ):
359
+ query_results["data"] = [
360
+ self._data_loader(uris) for uris in query_results["uris"]
361
+ ]
362
+
363
+ # Remove URIs from the result if they weren't requested
364
+ if "uris" not in include:
365
+ query_results["uris"] = None
366
+
367
+ return query_results
368
+
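+ # A minimal usage sketch of query() (hypothetical data):
+ #   collection.query(
+ #       query_texts=["hello world"],
+ #       n_results=5,
+ #       where={"source": "a"},
+ #   )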
369
+ def modify(
370
+ self, name: Optional[str] = None, metadata: Optional[CollectionMetadata] = None
371
+ ) -> None:
372
+ """Modify the collection name or metadata
373
+
374
+ Args:
375
+ name: The updated name for the collection. Optional.
376
+ metadata: The updated metadata for the collection. Optional.
377
+
378
+ Returns:
379
+ None
380
+ """
381
+ if metadata is not None:
382
+ validate_metadata(metadata)
383
+ if "hnsw:space" in metadata:
384
+ raise ValueError(
385
+ "Changing the distance function of a collection once it is created is not currently supported."
+ )
386
+
387
+ self._client._modify(id=self.id, new_name=name, new_metadata=metadata)
388
+ if name:
389
+ self.name = name
390
+ if metadata:
391
+ self.metadata = metadata
392
+
393
+ def update(
394
+ self,
395
+ ids: OneOrMany[ID],
396
+ embeddings: Optional[
397
+ Union[
398
+ OneOrMany[Embedding],
399
+ OneOrMany[np.ndarray],
400
+ ]
401
+ ] = None,
402
+ metadatas: Optional[OneOrMany[Metadata]] = None,
403
+ documents: Optional[OneOrMany[Document]] = None,
404
+ images: Optional[OneOrMany[Image]] = None,
405
+ uris: Optional[OneOrMany[URI]] = None,
406
+ ) -> None:
407
+ """Update the embeddings, metadatas or documents for provided ids.
408
+
409
+ Args:
410
+ ids: The ids of the embeddings to update
411
+ embeddings: The embeddings to update. If None, embeddings will be computed based on the documents or images using the embedding_function set for the Collection. Optional.
412
+ metadatas: The metadata to associate with the embeddings. When querying, you can filter on this metadata. Optional.
413
+ documents: The documents to associate with the embeddings. Optional.
414
+ images: The images to associate with the embeddings. Optional.
415
+ Returns:
416
+ None
417
+ """
418
+
419
+ (
420
+ ids,
421
+ embeddings,
422
+ metadatas,
423
+ documents,
424
+ images,
425
+ uris,
426
+ ) = self._validate_embedding_set(
427
+ ids,
428
+ embeddings,
429
+ metadatas,
430
+ documents,
431
+ images,
432
+ uris,
433
+ require_embeddings_or_data=False,
434
+ )
435
+
436
+ if embeddings is None:
437
+ if documents is not None:
438
+ embeddings = self._embed(input=documents)
439
+ elif images is not None:
440
+ embeddings = self._embed(input=images)
441
+
442
+ self._client._update(self.id, ids, embeddings, metadatas, documents, uris)
443
+
444
+ def upsert(
445
+ self,
446
+ ids: OneOrMany[ID],
447
+ embeddings: Optional[
448
+ Union[
449
+ OneOrMany[Embedding],
450
+ OneOrMany[np.ndarray],
451
+ ]
452
+ ] = None,
453
+ metadatas: Optional[OneOrMany[Metadata]] = None,
454
+ documents: Optional[OneOrMany[Document]] = None,
455
+ images: Optional[OneOrMany[Image]] = None,
456
+ uris: Optional[OneOrMany[URI]] = None,
457
+ ) -> None:
458
+ """Update the embeddings, metadatas or documents for provided ids, or create them if they don't exist.
459
+
460
+ Args:
461
+ ids: The ids of the embeddings to update
462
+ embeddings: The embeddings to add. If None, embeddings will be computed based on the documents using the embedding_function set for the Collection. Optional.
463
+ metadatas: The metadata to associate with the embeddings. When querying, you can filter on this metadata. Optional.
464
+ documents: The documents to associate with the embeddings. Optional.
465
+
466
+ Returns:
467
+ None
468
+ """
469
+
470
+ (
471
+ ids,
472
+ embeddings,
473
+ metadatas,
474
+ documents,
475
+ images,
476
+ uris,
477
+ ) = self._validate_embedding_set(
478
+ ids, embeddings, metadatas, documents, images, uris
479
+ )
480
+
481
+ if embeddings is None:
482
+ if documents is not None:
483
+ embeddings = self._embed(input=documents)
484
+ else:
485
+ embeddings = self._embed(input=images)
486
+
487
+ self._client._upsert(
488
+ collection_id=self.id,
489
+ ids=ids,
490
+ embeddings=embeddings,
491
+ metadatas=metadatas,
492
+ documents=documents,
493
+ uris=uris,
494
+ )
495
+
496
+ def delete(
497
+ self,
498
+ ids: Optional[IDs] = None,
499
+ where: Optional[Where] = None,
500
+ where_document: Optional[WhereDocument] = None,
501
+ ) -> None:
502
+ """Delete the embeddings based on ids and/or a where filter
503
+
504
+ Args:
505
+ ids: The ids of the embeddings to delete
506
+ where: A Where type dict used to filter the deletion by. E.g. `{"$and": [{"color": "red"}, {"price": {"$gte": 4.20}}]}`. Optional.
507
+ where_document: A WhereDocument type dict used to filter the deletion by the document content. E.g. `{"$contains": "hello"}`. Optional.
508
+
509
+ Returns:
510
+ None
511
+
512
+ Raises:
513
+ ValueError: If you don't provide either ids, where, or where_document
514
+ """
515
+ ids = validate_ids(maybe_cast_one_to_many_ids(ids)) if ids else None
516
+ where = validate_where(where) if where else None
517
+ where_document = (
518
+ validate_where_document(where_document) if where_document else None
519
+ )
520
+
521
+ self._client._delete(self.id, ids, where, where_document)
522
+
523
+ def _validate_embedding_set(
524
+ self,
525
+ ids: OneOrMany[ID],
526
+ embeddings: Optional[
527
+ Union[
528
+ OneOrMany[Embedding],
529
+ OneOrMany[np.ndarray],
530
+ ]
531
+ ],
532
+ metadatas: Optional[OneOrMany[Metadata]],
533
+ documents: Optional[OneOrMany[Document]],
534
+ images: Optional[OneOrMany[Image]] = None,
535
+ uris: Optional[OneOrMany[URI]] = None,
536
+ require_embeddings_or_data: bool = True,
537
+ ) -> Tuple[
538
+ IDs,
539
+ Optional[Embeddings],
540
+ Optional[Metadatas],
541
+ Optional[Documents],
542
+ Optional[Images],
543
+ Optional[URIs],
544
+ ]:
545
+ valid_ids = validate_ids(maybe_cast_one_to_many_ids(ids))
546
+ valid_embeddings = (
547
+ validate_embeddings(
548
+ self._normalize_embeddings(maybe_cast_one_to_many_embedding(embeddings))
549
+ )
550
+ if embeddings is not None
551
+ else None
552
+ )
553
+ valid_metadatas = (
554
+ validate_metadatas(maybe_cast_one_to_many_metadata(metadatas))
555
+ if metadatas is not None
556
+ else None
557
+ )
558
+ valid_documents = (
559
+ maybe_cast_one_to_many_document(documents)
560
+ if documents is not None
561
+ else None
562
+ )
563
+ valid_images = (
564
+ maybe_cast_one_to_many_image(images) if images is not None else None
565
+ )
566
+
567
+ valid_uris = maybe_cast_one_to_many_uri(uris) if uris is not None else None
568
+
569
+ # Check that at least one of embeddings, documents, images, or uris is provided
570
+ if require_embeddings_or_data:
571
+ if (
572
+ valid_embeddings is None
573
+ and valid_documents is None
574
+ and valid_images is None
575
+ and valid_uris is None
576
+ ):
577
+ raise ValueError(
578
+ "You must provide embeddings, documents, images, or uris."
579
+ )
580
+
581
+ # Only one of documents or images can be provided
582
+ if valid_documents is not None and valid_images is not None:
583
+ raise ValueError("You can only provide documents or images, not both.")
584
+
585
+ # Check that, if they're provided, the lengths of the arrays match the length of ids
586
+ if valid_embeddings is not None and len(valid_embeddings) != len(valid_ids):
587
+ raise ValueError(
588
+ f"Number of embeddings {len(valid_embeddings)} must match number of ids {len(valid_ids)}"
589
+ )
590
+ if valid_metadatas is not None and len(valid_metadatas) != len(valid_ids):
591
+ raise ValueError(
592
+ f"Number of metadatas {len(valid_metadatas)} must match number of ids {len(valid_ids)}"
593
+ )
594
+ if valid_documents is not None and len(valid_documents) != len(valid_ids):
595
+ raise ValueError(
596
+ f"Number of documents {len(valid_documents)} must match number of ids {len(valid_ids)}"
597
+ )
598
+ if valid_images is not None and len(valid_images) != len(valid_ids):
599
+ raise ValueError(
600
+ f"Number of images {len(valid_images)} must match number of ids {len(valid_ids)}"
601
+ )
602
+ if valid_uris is not None and len(valid_uris) != len(valid_ids):
603
+ raise ValueError(
604
+ f"Number of uris {len(valid_uris)} must match number of ids {len(valid_ids)}"
605
+ )
606
+
607
+ return (
608
+ valid_ids,
609
+ valid_embeddings,
610
+ valid_metadatas,
611
+ valid_documents,
612
+ valid_images,
613
+ valid_uris,
614
+ )
615
+
616
+ @staticmethod
617
+ def _normalize_embeddings(
618
+ embeddings: Union[
619
+ OneOrMany[Embedding],
620
+ OneOrMany[np.ndarray],
621
+ ]
622
+ ) -> Embeddings:
623
+ if isinstance(embeddings, np.ndarray):
624
+ return embeddings.tolist()
625
+ return embeddings
626
+
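+ # Note: only a top-level ndarray is converted above; e.g.
+ #   _normalize_embeddings(np.array([[1.0, 2.0]])) -> [[1.0, 2.0]]
+ # A plain list of lists passes through unchanged.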
627
+ def _embed(self, input: Any) -> Embeddings:
628
+ if self._embedding_function is None:
629
+ raise ValueError(
630
+ "You must provide an embedding function to compute embeddings."
631
+ "https://docs.trychroma.com/embeddings"
632
+ )
633
+ return self._embedding_function(input=input)
chromadb/api/segment.py ADDED
@@ -0,0 +1,914 @@
+ from chromadb.api import ServerAPI
+ from chromadb.config import DEFAULT_DATABASE, DEFAULT_TENANT, Settings, System
+ from chromadb.db.system import SysDB
+ from chromadb.segment import SegmentManager, MetadataReader, VectorReader
+ from chromadb.telemetry.opentelemetry import (
+     add_attributes_to_current_span,
+     OpenTelemetryClient,
+     OpenTelemetryGranularity,
+     trace_method,
+ )
+ from chromadb.telemetry.product import ProductTelemetryClient
+ from chromadb.ingest import Producer
+ from chromadb.api.models.Collection import Collection
+ from chromadb import __version__
+ from chromadb.errors import InvalidDimensionException, InvalidCollectionException
+ import chromadb.utils.embedding_functions as ef
+
+ from chromadb.api.types import (
+     URI,
+     CollectionMetadata,
+     Embeddable,
+     Document,
+     EmbeddingFunction,
+     DataLoader,
+     IDs,
+     Embeddings,
+     Embedding,
+     Loadable,
+     Metadatas,
+     Documents,
+     URIs,
+     Where,
+     WhereDocument,
+     Include,
+     GetResult,
+     QueryResult,
+     validate_metadata,
+     validate_update_metadata,
+     validate_where,
+     validate_where_document,
+     validate_batch,
+ )
+ from chromadb.telemetry.product.events import (
+     CollectionAddEvent,
+     CollectionDeleteEvent,
+     CollectionGetEvent,
+     CollectionUpdateEvent,
+     CollectionQueryEvent,
+     ClientCreateCollectionEvent,
+ )
+
+ import chromadb.types as t
+
+ from typing import Any, Optional, Sequence, Generator, List, cast, Set, Dict
+ from overrides import override
+ from uuid import UUID, uuid4
+ import time
+ import logging
+ import re
+
+
+ logger = logging.getLogger(__name__)
+
+
+ # mimics s3 bucket requirements for naming
+ def check_index_name(index_name: str) -> None:
+     msg = (
+         "Expected collection name that "
+         "(1) contains 3-63 characters, "
+         "(2) starts and ends with an alphanumeric character, "
+         "(3) otherwise contains only alphanumeric characters, underscores or hyphens (-), "
+         "(4) contains no two consecutive periods (..) and "
+         "(5) is not a valid IPv4 address, "
+         f"got {index_name}"
+     )
+     if len(index_name) < 3 or len(index_name) > 63:
+         raise ValueError(msg)
+     if not re.match("^[a-zA-Z0-9][a-zA-Z0-9._-]*[a-zA-Z0-9]$", index_name):
+         raise ValueError(msg)
+     if ".." in index_name:
+         raise ValueError(msg)
+     if re.match("^[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}$", index_name):
+         raise ValueError(msg)
+
+
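A quick illustration (assuming only the rules encoded above) of which collection names `check_index_name` accepts and rejects:

```python
check_index_name("my-collection_01")  # passes: 3-63 chars, alphanumeric at both ends
# Each of the following raises ValueError with the message built above:
# check_index_name("ab")            # fewer than 3 characters
# check_index_name("-starts-bad")   # must start with an alphanumeric character
# check_index_name("a..b")          # contains consecutive periods
# check_index_name("192.168.0.1")   # parses as an IPv4 address
```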
+ class SegmentAPI(ServerAPI):
+     """API implementation utilizing the new segment-based internal architecture"""
+
+     _settings: Settings
+     _sysdb: SysDB
+     _manager: SegmentManager
+     _producer: Producer
+     _product_telemetry_client: ProductTelemetryClient
+     _opentelemetry_client: OpenTelemetryClient
+     _tenant_id: str
+     _topic_ns: str
+     _collection_cache: Dict[UUID, t.Collection]
+
+     def __init__(self, system: System):
+         super().__init__(system)
+         self._settings = system.settings
+         self._sysdb = self.require(SysDB)
+         self._manager = self.require(SegmentManager)
+         self._product_telemetry_client = self.require(ProductTelemetryClient)
+         self._opentelemetry_client = self.require(OpenTelemetryClient)
+         self._producer = self.require(Producer)
+         self._collection_cache = {}
+
+     @override
+     def heartbeat(self) -> int:
+         return int(time.time_ns())
+
+     @override
+     def create_database(self, name: str, tenant: str = DEFAULT_TENANT) -> None:
+         if len(name) < 3:
+             raise ValueError("Database name must be at least 3 characters long")
+
+         self._sysdb.create_database(
+             id=uuid4(),
+             name=name,
+             tenant=tenant,
+         )
+
+     @override
+     def get_database(self, name: str, tenant: str = DEFAULT_TENANT) -> t.Database:
+         return self._sysdb.get_database(name=name, tenant=tenant)
+
+     @override
+     def create_tenant(self, name: str) -> None:
+         if len(name) < 3:
+             raise ValueError("Tenant name must be at least 3 characters long")
+
+         self._sysdb.create_tenant(
+             name=name,
+         )
+
+     @override
+     def get_tenant(self, name: str) -> t.Tenant:
+         return self._sysdb.get_tenant(name=name)
+
+     # TODO: Actually fix CollectionMetadata type to remove type: ignore flags. This is
+     # necessary because changing the value type from `Any` to `Union[str, int, float]`
+     # causes the system to somehow convert all values to strings.
+     @trace_method("SegmentAPI.create_collection", OpenTelemetryGranularity.OPERATION)
+     @override
+     def create_collection(
+         self,
+         name: str,
+         metadata: Optional[CollectionMetadata] = None,
+         embedding_function: Optional[
+             EmbeddingFunction[Any]
+         ] = ef.DefaultEmbeddingFunction(),
+         data_loader: Optional[DataLoader[Loadable]] = None,
+         get_or_create: bool = False,
+         tenant: str = DEFAULT_TENANT,
+         database: str = DEFAULT_DATABASE,
+     ) -> Collection:
+         if metadata is not None:
+             validate_metadata(metadata)
+
+         # TODO: remove backwards compatibility in naming requirements
+         check_index_name(name)
+
+         id = uuid4()
+
+         coll, created = self._sysdb.create_collection(
+             id=id,
+             name=name,
+             metadata=metadata,
+             dimension=None,
+             get_or_create=get_or_create,
+             tenant=tenant,
+             database=database,
+         )
+
+         if created:
+             segments = self._manager.create_segments(coll)
+             for segment in segments:
+                 self._sysdb.create_segment(segment)
+
+         # TODO: This event doesn't capture the get_or_create case appropriately
+         self._product_telemetry_client.capture(
+             ClientCreateCollectionEvent(
+                 collection_uuid=str(id),
+                 embedding_function=embedding_function.__class__.__name__,
+             )
+         )
+         add_attributes_to_current_span({"collection_uuid": str(id)})
+
+         return Collection(
+             client=self,
+             id=coll["id"],
+             name=name,
+             metadata=coll["metadata"],  # type: ignore
+             embedding_function=embedding_function,
+             data_loader=data_loader,
+             tenant=tenant,
+             database=database,
+         )
+
+     @trace_method(
+         "SegmentAPI.get_or_create_collection", OpenTelemetryGranularity.OPERATION
+     )
+     @override
+     def get_or_create_collection(
+         self,
+         name: str,
+         metadata: Optional[CollectionMetadata] = None,
+         embedding_function: Optional[
+             EmbeddingFunction[Embeddable]
+         ] = ef.DefaultEmbeddingFunction(),  # type: ignore
+         data_loader: Optional[DataLoader[Loadable]] = None,
+         tenant: str = DEFAULT_TENANT,
+         database: str = DEFAULT_DATABASE,
+     ) -> Collection:
+         return self.create_collection(  # type: ignore
+             name=name,
+             metadata=metadata,
+             embedding_function=embedding_function,
+             data_loader=data_loader,
+             get_or_create=True,
+             tenant=tenant,
+             database=database,
+         )
+
+     # TODO: Actually fix CollectionMetadata type to remove type: ignore flags. This is
+     # necessary because changing the value type from `Any` to `Union[str, int, float]`
+     # causes the system to somehow convert all values to strings
+     @trace_method("SegmentAPI.get_collection", OpenTelemetryGranularity.OPERATION)
+     @override
+     def get_collection(
+         self,
+         name: Optional[str] = None,
+         id: Optional[UUID] = None,
+         embedding_function: Optional[
+             EmbeddingFunction[Embeddable]
+         ] = ef.DefaultEmbeddingFunction(),  # type: ignore
+         data_loader: Optional[DataLoader[Loadable]] = None,
+         tenant: str = DEFAULT_TENANT,
+         database: str = DEFAULT_DATABASE,
+     ) -> Collection:
+         if id is None and name is None or (id is not None and name is not None):
+             raise ValueError("Name or id must be specified, but not both")
+         existing = self._sysdb.get_collections(
+             id=id, name=name, tenant=tenant, database=database
+         )
+
+         if existing:
+             return Collection(
+                 client=self,
+                 id=existing[0]["id"],
+                 name=existing[0]["name"],
+                 metadata=existing[0]["metadata"],  # type: ignore
+                 embedding_function=embedding_function,
+                 data_loader=data_loader,
+                 tenant=existing[0]["tenant"],
+                 database=existing[0]["database"],
+             )
+         else:
+             raise ValueError(f"Collection {name} does not exist.")
+
+     @trace_method("SegmentAPI.list_collection", OpenTelemetryGranularity.OPERATION)
+     @override
+     def list_collections(
+         self,
+         limit: Optional[int] = None,
+         offset: Optional[int] = None,
+         tenant: str = DEFAULT_TENANT,
+         database: str = DEFAULT_DATABASE,
+     ) -> Sequence[Collection]:
+         collections = []
+         db_collections = self._sysdb.get_collections(
+             limit=limit, offset=offset, tenant=tenant, database=database
+         )
+         for db_collection in db_collections:
+             collections.append(
+                 Collection(
+                     client=self,
+                     id=db_collection["id"],
+                     name=db_collection["name"],
+                     metadata=db_collection["metadata"],  # type: ignore
+                     tenant=db_collection["tenant"],
+                     database=db_collection["database"],
+                 )
+             )
+         return collections
+
+     @trace_method("SegmentAPI.count_collections", OpenTelemetryGranularity.OPERATION)
+     @override
+     def count_collections(
+         self,
+         tenant: str = DEFAULT_TENANT,
+         database: str = DEFAULT_DATABASE,
+     ) -> int:
+         collection_count = len(
+             self._sysdb.get_collections(tenant=tenant, database=database)
+         )
+
+         return collection_count
+
+     @trace_method("SegmentAPI._modify", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _modify(
+         self,
+         id: UUID,
+         new_name: Optional[str] = None,
+         new_metadata: Optional[CollectionMetadata] = None,
+     ) -> None:
+         if new_name:
+             # backwards compatibility in naming requirements (for now)
+             check_index_name(new_name)
+
+         if new_metadata:
+             validate_update_metadata(new_metadata)
+
+         # TODO eventually we'll want to use OptionalArgument and Unspecified in the
+         # signature of `_modify` but not changing the API right now.
+         if new_name and new_metadata:
+             self._sysdb.update_collection(id, name=new_name, metadata=new_metadata)
+         elif new_name:
+             self._sysdb.update_collection(id, name=new_name)
+         elif new_metadata:
+             self._sysdb.update_collection(id, metadata=new_metadata)
+
+     @trace_method("SegmentAPI.delete_collection", OpenTelemetryGranularity.OPERATION)
+     @override
+     def delete_collection(
+         self,
+         name: str,
+         tenant: str = DEFAULT_TENANT,
+         database: str = DEFAULT_DATABASE,
+     ) -> None:
+         existing = self._sysdb.get_collections(
+             name=name, tenant=tenant, database=database
+         )
+
+         if existing:
+             self._sysdb.delete_collection(
+                 existing[0]["id"], tenant=tenant, database=database
+             )
+             for s in self._manager.delete_segments(existing[0]["id"]):
+                 self._sysdb.delete_segment(s)
+             if existing and existing[0]["id"] in self._collection_cache:
+                 del self._collection_cache[existing[0]["id"]]
+         else:
+             raise ValueError(f"Collection {name} does not exist.")
+
+     @trace_method("SegmentAPI._add", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _add(
+         self,
+         ids: IDs,
+         collection_id: UUID,
+         embeddings: Embeddings,
+         metadatas: Optional[Metadatas] = None,
+         documents: Optional[Documents] = None,
+         uris: Optional[URIs] = None,
+     ) -> bool:
+         coll = self._get_collection(collection_id)
+         self._manager.hint_use_collection(collection_id, t.Operation.ADD)
+         validate_batch(
+             (ids, embeddings, metadatas, documents, uris),
+             {"max_batch_size": self.max_batch_size},
+         )
+         records_to_submit = []
+         for r in _records(
+             t.Operation.ADD,
+             ids=ids,
+             collection_id=collection_id,
+             embeddings=embeddings,
+             metadatas=metadatas,
+             documents=documents,
+             uris=uris,
+         ):
+             self._validate_embedding_record(coll, r)
+             records_to_submit.append(r)
+         self._producer.submit_embeddings(coll["topic"], records_to_submit)
+
+         self._product_telemetry_client.capture(
+             CollectionAddEvent(
+                 collection_uuid=str(collection_id),
+                 add_amount=len(ids),
+                 with_metadata=len(ids) if metadatas is not None else 0,
+                 with_documents=len(ids) if documents is not None else 0,
+                 with_uris=len(ids) if uris is not None else 0,
+             )
+         )
+         return True
+
+     @trace_method("SegmentAPI._update", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _update(
+         self,
+         collection_id: UUID,
+         ids: IDs,
+         embeddings: Optional[Embeddings] = None,
+         metadatas: Optional[Metadatas] = None,
+         documents: Optional[Documents] = None,
+         uris: Optional[URIs] = None,
+     ) -> bool:
+         coll = self._get_collection(collection_id)
+         self._manager.hint_use_collection(collection_id, t.Operation.UPDATE)
+         validate_batch(
+             (ids, embeddings, metadatas, documents, uris),
+             {"max_batch_size": self.max_batch_size},
+         )
+         records_to_submit = []
+         for r in _records(
+             t.Operation.UPDATE,
+             ids=ids,
+             collection_id=collection_id,
+             embeddings=embeddings,
+             metadatas=metadatas,
+             documents=documents,
+             uris=uris,
+         ):
+             self._validate_embedding_record(coll, r)
+             records_to_submit.append(r)
+         self._producer.submit_embeddings(coll["topic"], records_to_submit)
+
+         self._product_telemetry_client.capture(
+             CollectionUpdateEvent(
+                 collection_uuid=str(collection_id),
+                 update_amount=len(ids),
+                 with_embeddings=len(embeddings) if embeddings else 0,
+                 with_metadata=len(metadatas) if metadatas else 0,
+                 with_documents=len(documents) if documents else 0,
+                 with_uris=len(uris) if uris else 0,
+             )
+         )
+
+         return True
+
+     @trace_method("SegmentAPI._upsert", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _upsert(
+         self,
+         collection_id: UUID,
+         ids: IDs,
+         embeddings: Embeddings,
+         metadatas: Optional[Metadatas] = None,
+         documents: Optional[Documents] = None,
+         uris: Optional[URIs] = None,
+     ) -> bool:
+         coll = self._get_collection(collection_id)
+         self._manager.hint_use_collection(collection_id, t.Operation.UPSERT)
+         validate_batch(
+             (ids, embeddings, metadatas, documents, uris),
+             {"max_batch_size": self.max_batch_size},
+         )
+         records_to_submit = []
+         for r in _records(
+             t.Operation.UPSERT,
+             ids=ids,
+             collection_id=collection_id,
+             embeddings=embeddings,
+             metadatas=metadatas,
+             documents=documents,
+             uris=uris,
+         ):
+             self._validate_embedding_record(coll, r)
+             records_to_submit.append(r)
+         self._producer.submit_embeddings(coll["topic"], records_to_submit)
+
+         return True
+
+     @trace_method("SegmentAPI._get", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _get(
+         self,
+         collection_id: UUID,
+         ids: Optional[IDs] = None,
+         where: Optional[Where] = {},
+         sort: Optional[str] = None,
+         limit: Optional[int] = None,
+         offset: Optional[int] = None,
+         page: Optional[int] = None,
+         page_size: Optional[int] = None,
+         where_document: Optional[WhereDocument] = {},
+         include: Include = ["embeddings", "metadatas", "documents"],
+     ) -> GetResult:
+         add_attributes_to_current_span(
+             {
+                 "collection_id": str(collection_id),
+                 "ids_count": len(ids) if ids else 0,
+             }
+         )
+
+         where = validate_where(where) if where is not None and len(where) > 0 else None
+         where_document = (
+             validate_where_document(where_document)
+             if where_document is not None and len(where_document) > 0
+             else None
+         )
+
+         metadata_segment = self._manager.get_segment(collection_id, MetadataReader)
+
+         if sort is not None:
+             raise NotImplementedError("Sorting is not yet supported")
+
+         if page and page_size:
+             offset = (page - 1) * page_size
+             limit = page_size
+
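The page/page_size branch above is a thin convenience layer over offset/limit: page 3 with a page size of 20 reads records 40-59. A hedged sketch of the same arithmetic (the helper name is hypothetical):

```python
def page_to_offset_limit(page, page_size):
    # page is 1-indexed, mirroring the branch in SegmentAPI._get above
    return (page - 1) * page_size, page_size

assert page_to_offset_limit(1, 20) == (0, 20)   # first page -> records 0-19
assert page_to_offset_limit(3, 20) == (40, 20)  # third page -> records 40-59
```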
+         records = metadata_segment.get_metadata(
+             where=where,
+             where_document=where_document,
+             ids=ids,
+             limit=limit,
+             offset=offset,
+         )
+
+         if len(records) == 0:
+             # Nothing to return if there are no records
+             return GetResult(
+                 ids=[],
+                 embeddings=[] if "embeddings" in include else None,
+                 metadatas=[] if "metadatas" in include else None,
+                 documents=[] if "documents" in include else None,
+                 uris=[] if "uris" in include else None,
+                 data=[] if "data" in include else None,
+             )
+
+         vectors: Sequence[t.VectorEmbeddingRecord] = []
+         if "embeddings" in include:
+             vector_ids = [r["id"] for r in records]
+             vector_segment = self._manager.get_segment(collection_id, VectorReader)
+             vectors = vector_segment.get_vectors(ids=vector_ids)
+
+         # TODO: Fix type so we don't need to ignore
+         # It is possible to have a set of records, some with metadata and some without
+         # Same with documents
+
+         metadatas = [r["metadata"] for r in records]
+
+         if "documents" in include:
+             documents = [_doc(m) for m in metadatas]
+
+         if "uris" in include:
+             uris = [_uri(m) for m in metadatas]
+
+         ids_amount = len(ids) if ids else 0
+         self._product_telemetry_client.capture(
+             CollectionGetEvent(
+                 collection_uuid=str(collection_id),
+                 ids_count=ids_amount,
+                 limit=limit if limit else 0,
+                 include_metadata=ids_amount if "metadatas" in include else 0,
+                 include_documents=ids_amount if "documents" in include else 0,
+                 include_uris=ids_amount if "uris" in include else 0,
+             )
+         )
+
+         return GetResult(
+             ids=[r["id"] for r in records],
+             embeddings=[r["embedding"] for r in vectors]
+             if "embeddings" in include
+             else None,
+             metadatas=_clean_metadatas(metadatas)
+             if "metadatas" in include
+             else None,  # type: ignore
+             documents=documents if "documents" in include else None,  # type: ignore
+             uris=uris if "uris" in include else None,  # type: ignore
+             data=None,
+         )
+
+     @trace_method("SegmentAPI._delete", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _delete(
+         self,
+         collection_id: UUID,
+         ids: Optional[IDs] = None,
+         where: Optional[Where] = None,
+         where_document: Optional[WhereDocument] = None,
+     ) -> IDs:
+         add_attributes_to_current_span(
+             {
+                 "collection_id": str(collection_id),
+                 "ids_count": len(ids) if ids else 0,
+             }
+         )
+
+         where = validate_where(where) if where is not None and len(where) > 0 else None
+         where_document = (
+             validate_where_document(where_document)
+             if where_document is not None and len(where_document) > 0
+             else None
+         )
+
+         # You must have at least one of non-empty ids, where, or where_document.
+         if (
+             (ids is None or (ids is not None and len(ids) == 0))
+             and (where is None or (where is not None and len(where) == 0))
+             and (
+                 where_document is None
+                 or (where_document is not None and len(where_document) == 0)
+             )
+         ):
+             raise ValueError(
+                 """
+                 You must provide either ids, where, or where_document to delete. If
+                 you want to delete all data in a collection you can delete the
+                 collection itself using the delete_collection method. Or alternatively,
+                 you can get() all the relevant ids and then delete them.
+                 """
+             )
+
+         coll = self._get_collection(collection_id)
+         self._manager.hint_use_collection(collection_id, t.Operation.DELETE)
+
+         if (where or where_document) or not ids:
+             metadata_segment = self._manager.get_segment(collection_id, MetadataReader)
+             records = metadata_segment.get_metadata(
+                 where=where, where_document=where_document, ids=ids
+             )
+             ids_to_delete = [r["id"] for r in records]
+         else:
+             ids_to_delete = ids
+
+         if len(ids_to_delete) == 0:
+             return []
+
+         records_to_submit = []
+         for r in _records(
+             operation=t.Operation.DELETE, ids=ids_to_delete, collection_id=collection_id
+         ):
+             self._validate_embedding_record(coll, r)
+             records_to_submit.append(r)
+         self._producer.submit_embeddings(coll["topic"], records_to_submit)
+
+         self._product_telemetry_client.capture(
+             CollectionDeleteEvent(
+                 collection_uuid=str(collection_id), delete_amount=len(ids_to_delete)
+             )
+         )
+         return ids_to_delete
+
+     @trace_method("SegmentAPI._count", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _count(self, collection_id: UUID) -> int:
+         add_attributes_to_current_span({"collection_id": str(collection_id)})
+         metadata_segment = self._manager.get_segment(collection_id, MetadataReader)
+         return metadata_segment.count()
+
+     @trace_method("SegmentAPI._query", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _query(
+         self,
+         collection_id: UUID,
+         query_embeddings: Embeddings,
+         n_results: int = 10,
+         where: Where = {},
+         where_document: WhereDocument = {},
+         include: Include = ["documents", "metadatas", "distances"],
+     ) -> QueryResult:
+         add_attributes_to_current_span(
+             {
+                 "collection_id": str(collection_id),
+                 "n_results": n_results,
+                 "where": str(where),
+             }
+         )
+         where = validate_where(where) if where is not None and len(where) > 0 else where
+         where_document = (
+             validate_where_document(where_document)
+             if where_document is not None and len(where_document) > 0
+             else where_document
+         )
+
+         allowed_ids = None
+
+         coll = self._get_collection(collection_id)
+         for embedding in query_embeddings:
+             self._validate_dimension(coll, len(embedding), update=False)
+
+         metadata_reader = self._manager.get_segment(collection_id, MetadataReader)
+
+         if where or where_document:
+             records = metadata_reader.get_metadata(
+                 where=where, where_document=where_document
+             )
+             allowed_ids = [r["id"] for r in records]
+
+         query = t.VectorQuery(
+             vectors=query_embeddings,
+             k=n_results,
+             allowed_ids=allowed_ids,
+             include_embeddings="embeddings" in include,
+             options=None,
+         )
+
+         vector_reader = self._manager.get_segment(collection_id, VectorReader)
+         results = vector_reader.query_vectors(query)
+
+         ids: List[List[str]] = []
+         distances: List[List[float]] = []
+         embeddings: List[List[Embedding]] = []
+         documents: List[List[Document]] = []
+         uris: List[List[URI]] = []
+         metadatas: List[List[t.Metadata]] = []
+
+         for result in results:
+             ids.append([r["id"] for r in result])
+             if "distances" in include:
+                 distances.append([r["distance"] for r in result])
+             if "embeddings" in include:
+                 embeddings.append([cast(Embedding, r["embedding"]) for r in result])
+
+         if "documents" in include or "metadatas" in include or "uris" in include:
+             all_ids: Set[str] = set()
+             for id_list in ids:
+                 all_ids.update(id_list)
+             records = metadata_reader.get_metadata(ids=list(all_ids))
+             metadata_by_id = {r["id"]: r["metadata"] for r in records}
+             for id_list in ids:
+                 # In the segment-based architecture, it is possible for one segment
+                 # to have a record that another segment does not have. This results in
+                 # data inconsistency. For the case of the local segments and the
+                 # local segment manager, there is a case where a thread writes
+                 # a record to the vector segment but not the metadata segment.
+                 # Then a querying thread reads from the vector segment and
+                 # queries the metadata segment. The metadata segment does not have
+                 # the record. In this case we choose to return potentially
+                 # incorrect data in the form of None.
+                 metadata_list = [metadata_by_id.get(id, None) for id in id_list]
+                 if "metadatas" in include:
+                     metadatas.append(_clean_metadatas(metadata_list))  # type: ignore
+                 if "documents" in include:
+                     doc_list = [_doc(m) for m in metadata_list]
+                     documents.append(doc_list)  # type: ignore
+                 if "uris" in include:
+                     uri_list = [_uri(m) for m in metadata_list]
+                     uris.append(uri_list)  # type: ignore
+
+         query_amount = len(query_embeddings)
+         self._product_telemetry_client.capture(
+             CollectionQueryEvent(
+                 collection_uuid=str(collection_id),
+                 query_amount=query_amount,
+                 n_results=n_results,
+                 with_metadata_filter=query_amount if where is not None else 0,
+                 with_document_filter=query_amount if where_document is not None else 0,
+                 include_metadatas=query_amount if "metadatas" in include else 0,
+                 include_documents=query_amount if "documents" in include else 0,
+                 include_uris=query_amount if "uris" in include else 0,
+                 include_distances=query_amount if "distances" in include else 0,
+             )
+         )
+
+         return QueryResult(
+             ids=ids,
+             distances=distances if distances else None,
+             metadatas=metadatas if metadatas else None,
+             embeddings=embeddings if embeddings else None,
+             documents=documents if documents else None,
+             uris=uris if uris else None,
+             data=None,
+         )
+
+     @trace_method("SegmentAPI._peek", OpenTelemetryGranularity.OPERATION)
+     @override
+     def _peek(self, collection_id: UUID, n: int = 10) -> GetResult:
+         add_attributes_to_current_span({"collection_id": str(collection_id)})
+         return self._get(collection_id, limit=n)  # type: ignore
+
+     @override
+     def get_version(self) -> str:
+         return __version__
+
+     @override
+     def reset_state(self) -> None:
+         self._collection_cache = {}
+
+     @override
+     def reset(self) -> bool:
+         self._system.reset_state()
+         return True
+
+     @override
+     def get_settings(self) -> Settings:
+         return self._settings
+
+     @property
+     @override
+     def max_batch_size(self) -> int:
+         return self._producer.max_batch_size
+
+     # TODO: This could potentially cause race conditions in a distributed version of the
+     # system, since the cache is only local.
+     # TODO: promote collection -> topic to a base class method so that it can be
+     # used for channel assignment in the distributed version of the system.
+     @trace_method("SegmentAPI._validate_embedding_record", OpenTelemetryGranularity.ALL)
+     def _validate_embedding_record(
+         self, collection: t.Collection, record: t.SubmitEmbeddingRecord
+     ) -> None:
+         """Validate the dimension of an embedding record before submitting it to the system."""
+         add_attributes_to_current_span({"collection_id": str(collection["id"])})
+         if record["embedding"]:
+             self._validate_dimension(collection, len(record["embedding"]), update=True)
+
+     @trace_method("SegmentAPI._validate_dimension", OpenTelemetryGranularity.ALL)
+     def _validate_dimension(
+         self, collection: t.Collection, dim: int, update: bool
+     ) -> None:
+         """Validate that a collection supports records of the given dimension. If update
+         is true, update the collection if the collection doesn't already have a
+         dimension."""
+         if collection["dimension"] is None:
+             if update:
+                 id = collection["id"]
+                 self._sysdb.update_collection(id=id, dimension=dim)
+                 self._collection_cache[id]["dimension"] = dim
+         elif collection["dimension"] != dim:
+             raise InvalidDimensionException(
+                 f"Embedding dimension {dim} does not match collection dimensionality {collection['dimension']}"
+             )
+         else:
+             return  # all is well
+
+     @trace_method("SegmentAPI._get_collection", OpenTelemetryGranularity.ALL)
+     def _get_collection(self, collection_id: UUID) -> t.Collection:
+         """Read-through cache for collection data"""
+         if collection_id not in self._collection_cache:
+             collections = self._sysdb.get_collections(id=collection_id)
+             if not collections:
+                 raise InvalidCollectionException(
+                     f"Collection {collection_id} does not exist."
+                 )
+             self._collection_cache[collection_id] = collections[0]
+         return self._collection_cache[collection_id]
+
+
+ def _records(
+     operation: t.Operation,
+     ids: IDs,
+     collection_id: UUID,
+     embeddings: Optional[Embeddings] = None,
+     metadatas: Optional[Metadatas] = None,
+     documents: Optional[Documents] = None,
+     uris: Optional[URIs] = None,
+ ) -> Generator[t.SubmitEmbeddingRecord, None, None]:
+     """Convert parallel lists of embeddings, metadatas and documents to a sequence of
+     SubmitEmbeddingRecords"""
+
+     # Presumes that callers were invoked via Collection model, which means
+     # that we know that the embeddings, metadatas and documents have already been
+     # normalized and are guaranteed to be consistently named lists.
+
+     for i, id in enumerate(ids):
+         metadata = None
+         if metadatas:
+             metadata = metadatas[i]
+
+         if documents:
+             document = documents[i]
+             if metadata:
+                 metadata = {**metadata, "chroma:document": document}
+             else:
+                 metadata = {"chroma:document": document}
+
+         if uris:
+             uri = uris[i]
+             if metadata:
+                 metadata = {**metadata, "chroma:uri": uri}
+             else:
+                 metadata = {"chroma:uri": uri}
+
+         record = t.SubmitEmbeddingRecord(
+             id=id,
+             embedding=embeddings[i] if embeddings else None,
+             encoding=t.ScalarEncoding.FLOAT32,  # Hardcode for now
+             metadata=metadata,
+             operation=operation,
+             collection_id=collection_id,
+         )
+         yield record
+
+
+ def _doc(metadata: Optional[t.Metadata]) -> Optional[str]:
+     """Retrieve the document (if any) from a Metadata map"""
+
+     if metadata and "chroma:document" in metadata:
+         return str(metadata["chroma:document"])
+     return None
+
+
+ def _uri(metadata: Optional[t.Metadata]) -> Optional[str]:
+     """Retrieve the uri (if any) from a Metadata map"""
+
+     if metadata and "chroma:uri" in metadata:
+         return str(metadata["chroma:uri"])
+     return None
+
+
+ def _clean_metadatas(
+     metadata: List[Optional[t.Metadata]],
+ ) -> List[Optional[t.Metadata]]:
+     """Remove any chroma-specific metadata keys that the client shouldn't see from a
+     list of metadata maps."""
+     return [_clean_metadata(m) for m in metadata]
+
+
+ def _clean_metadata(metadata: Optional[t.Metadata]) -> Optional[t.Metadata]:
+     """Remove any chroma-specific metadata keys that the client shouldn't see from a
+     metadata map."""
+     if not metadata:
+         return None
+     result = {}
+     for k, v in metadata.items():
+         if not k.startswith("chroma:"):
+             result[k] = v
+     if len(result) == 0:
+         return None
+     return result
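Documents and URIs ride along inside the metadata map under reserved `chroma:` keys: `_records` packs them in, `_doc`/`_uri` pull them out, and `_clean_metadata` strips them before results reach the client. A small sketch of that round trip, reusing the helpers above (the sample values are illustrative):

```python
packed = {"author": "alice", "chroma:document": "hello", "chroma:uri": "s3://bucket/doc"}

_doc(packed)             # -> "hello"
_uri(packed)             # -> "s3://bucket/doc"
_clean_metadata(packed)  # -> {"author": "alice"}  (reserved keys removed)
_clean_metadata({"chroma:document": "x"})  # -> None (nothing client-visible remains)
```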
chromadb/api/types.py ADDED
@@ -0,0 +1,509 @@
+ from typing import Optional, Union, TypeVar, List, Dict, Any, Tuple, cast
+ from numpy.typing import NDArray
+ import numpy as np
+ from typing_extensions import Literal, TypedDict, Protocol
+ import chromadb.errors as errors
+ from chromadb.types import (
+     Metadata,
+     UpdateMetadata,
+     Vector,
+     LiteralValue,
+     LogicalOperator,
+     WhereOperator,
+     OperatorExpression,
+     Where,
+     WhereDocumentOperator,
+     WhereDocument,
+ )
+ from inspect import signature
+ from tenacity import retry
+
+ # Re-export types from chromadb.types
+ __all__ = ["Metadata", "Where", "WhereDocument", "UpdateCollectionMetadata"]
+
+ T = TypeVar("T")
+ OneOrMany = Union[T, List[T]]
+
+ # URIs
+ URI = str
+ URIs = List[URI]
+
+
+ def maybe_cast_one_to_many_uri(target: OneOrMany[URI]) -> URIs:
+     if isinstance(target, str):
+         # One URI
+         return cast(URIs, [target])
+     # Already a sequence
+     return cast(URIs, target)
+
+
+ # IDs
+ ID = str
+ IDs = List[ID]
+
+
+ def maybe_cast_one_to_many_ids(target: OneOrMany[ID]) -> IDs:
+     if isinstance(target, str):
+         # One ID
+         return cast(IDs, [target])
+     # Already a sequence
+     return cast(IDs, target)
+
+
+ # Embeddings
+ Embedding = Vector
+ Embeddings = List[Embedding]
+
+
+ def maybe_cast_one_to_many_embedding(target: OneOrMany[Embedding]) -> Embeddings:
+     if isinstance(target, List):
+         # One Embedding
+         if isinstance(target[0], (int, float)):
+             return cast(Embeddings, [target])
+     # Already a sequence
+     return cast(Embeddings, target)
+
+
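The `maybe_cast_one_to_many_*` helpers let the public API accept either a single item or a list of items; everything downstream then works with lists only. For example (values illustrative):

```python
maybe_cast_one_to_many_ids("id1")                 # -> ["id1"]
maybe_cast_one_to_many_ids(["id1", "id2"])        # -> returned unchanged
maybe_cast_one_to_many_embedding([0.1, 0.2])      # -> [[0.1, 0.2]]  (one embedding)
maybe_cast_one_to_many_embedding([[0.1], [0.2]])  # -> returned unchanged
```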
+ # Metadatas
+ Metadatas = List[Metadata]
+
+
+ def maybe_cast_one_to_many_metadata(target: OneOrMany[Metadata]) -> Metadatas:
+     # One Metadata dict
+     if isinstance(target, dict):
+         return cast(Metadatas, [target])
+     # Already a sequence
+     return cast(Metadatas, target)
+
+
+ CollectionMetadata = Dict[str, Any]
+ UpdateCollectionMetadata = UpdateMetadata
+
+ # Documents
+ Document = str
+ Documents = List[Document]
+
+
+ def is_document(target: Any) -> bool:
+     if not isinstance(target, str):
+         return False
+     return True
+
+
+ def maybe_cast_one_to_many_document(target: OneOrMany[Document]) -> Documents:
+     # One Document
+     if is_document(target):
+         return cast(Documents, [target])
+     # Already a sequence
+     return cast(Documents, target)
+
+
+ # Images
+ ImageDType = Union[np.uint, np.int_, np.float_]
+ Image = NDArray[ImageDType]
+ Images = List[Image]
+
+
+ def is_image(target: Any) -> bool:
+     if not isinstance(target, np.ndarray):
+         return False
+     if len(target.shape) < 2:
+         return False
+     return True
+
+
+ def maybe_cast_one_to_many_image(target: OneOrMany[Image]) -> Images:
+     if is_image(target):
+         return cast(Images, [target])
+     # Already a sequence
+     return cast(Images, target)
+
+
+ Parameter = TypeVar("Parameter", Document, Image, Embedding, Metadata, ID)
+
+ # This should just be List[Literal["documents", "embeddings", "metadatas", "distances"]]
+ # However, this provokes an incompatibility with the Overrides library and Python 3.7
+ Include = List[
+     Union[
+         Literal["documents"],
+         Literal["embeddings"],
+         Literal["metadatas"],
+         Literal["distances"],
+         Literal["uris"],
+         Literal["data"],
+     ]
+ ]
+
+ # Re-export types from chromadb.types
+ LiteralValue = LiteralValue
+ LogicalOperator = LogicalOperator
+ WhereOperator = WhereOperator
+ OperatorExpression = OperatorExpression
+ Where = Where
+ WhereDocumentOperator = WhereDocumentOperator
+
+ Embeddable = Union[Documents, Images]
+ D = TypeVar("D", bound=Embeddable, contravariant=True)
+
+
+ Loadable = List[Optional[Image]]
+ L = TypeVar("L", covariant=True, bound=Loadable)
+
+
+ class GetResult(TypedDict):
+     ids: List[ID]
+     embeddings: Optional[List[Embedding]]
+     documents: Optional[List[Document]]
+     uris: Optional[URIs]
+     data: Optional[Loadable]
+     metadatas: Optional[List[Metadata]]
+
+
+ class QueryResult(TypedDict):
+     ids: List[IDs]
+     embeddings: Optional[List[List[Embedding]]]
+     documents: Optional[List[List[Document]]]
+     uris: Optional[List[List[URI]]]
+     data: Optional[List[Loadable]]
+     metadatas: Optional[List[List[Metadata]]]
+     distances: Optional[List[List[float]]]
+
+
+ class IndexMetadata(TypedDict):
+     dimensionality: int
+     # The current number of elements in the index (total = additions - deletes)
+     curr_elements: int
+     # The auto-incrementing ID of the last inserted element, never decreases so
+     # can be used as a count of total historical size. Should increase by 1 every add.
+     # Assume it cannot overflow.
+     total_elements_added: int
+     time_created: float
+
+
+ class EmbeddingFunction(Protocol[D]):
+     def __call__(self, input: D) -> Embeddings:
+         ...
+
+     def __init_subclass__(cls) -> None:
+         super().__init_subclass__()
+         # Raise an exception if __call__ is not defined since it is expected to be defined
+         call = getattr(cls, "__call__")
+
+         def __call__(self: EmbeddingFunction[D], input: D) -> Embeddings:
+             result = call(self, input)
+             return validate_embeddings(maybe_cast_one_to_many_embedding(result))
+
+         setattr(cls, "__call__", __call__)
+
+     def embed_with_retries(self, input: D, **retry_kwargs: Dict) -> Embeddings:
+         return retry(**retry_kwargs)(self.__call__)(input)
+
+
+ def validate_embedding_function(
+     embedding_function: EmbeddingFunction[Embeddable],
+ ) -> None:
+     function_signature = signature(
+         embedding_function.__class__.__call__
+     ).parameters.keys()
+     protocol_signature = signature(EmbeddingFunction.__call__).parameters.keys()
+
+     if not function_signature == protocol_signature:
+         raise ValueError(
+             f"Expected EmbeddingFunction.__call__ to have the following signature: {protocol_signature}, got {function_signature}\n"
+             "Please see https://docs.trychroma.com/embeddings for details of the EmbeddingFunction interface.\n"
+             "Please note the recent change to the EmbeddingFunction interface: https://docs.trychroma.com/migration#migration-to-0416---november-7-2023 \n"
+         )
+
+
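The `__init_subclass__` hook above means that any subclass's `__call__` output is automatically normalized and validated. A minimal sketch (the subclass is hypothetical, not part of this commit), assuming the types defined in this file:

```python
class MyEmbeddingFunction(EmbeddingFunction[Documents]):
    def __call__(self, input: Documents) -> Embeddings:
        # Return one fixed-size vector per input document; a real
        # implementation would call an embedding model here.
        return [[float(len(doc)), 0.0] for doc in input]

embed_fn = MyEmbeddingFunction()
embed_fn(["hello", "world"])      # -> [[5.0, 0.0], [5.0, 0.0]], validated on the way out
embed_fn.embed_with_retries(["hi"])  # same call, wrapped in a tenacity retry
```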
+ class DataLoader(Protocol[L]):
+     def __call__(self, uris: URIs) -> L:
+         ...
+
+
+ def validate_ids(ids: IDs) -> IDs:
+     """Validates ids to ensure it is a list of strings"""
+     if not isinstance(ids, list):
+         raise ValueError(f"Expected IDs to be a list, got {ids}")
+     if len(ids) == 0:
+         raise ValueError(f"Expected IDs to be a non-empty list, got {ids}")
+     seen = set()
+     dups = set()
+     for id_ in ids:
+         if not isinstance(id_, str):
+             raise ValueError(f"Expected ID to be a str, got {id_}")
+         if id_ in seen:
+             dups.add(id_)
+         else:
+             seen.add(id_)
+     if dups:
+         n_dups = len(dups)
+         if n_dups < 10:
+             example_string = ", ".join(dups)
+             message = (
+                 f"Expected IDs to be unique, found duplicates of: {example_string}"
+             )
+         else:
+             examples = []
+             for idx, dup in enumerate(dups):
+                 examples.append(dup)
+                 if idx == 10:
+                     break
+             example_string = (
+                 f"{', '.join(examples[:5])}, ..., {', '.join(examples[-5:])}"
+             )
+             message = f"Expected IDs to be unique, found {n_dups} duplicated IDs: {example_string}"
+         raise errors.DuplicateIDError(message)
+     return ids
+
+
+ def validate_metadata(metadata: Metadata) -> Metadata:
+     """Validates metadata to ensure it is a dictionary of strings to strings, ints, floats or bools"""
+     if not isinstance(metadata, dict) and metadata is not None:
+         raise ValueError(f"Expected metadata to be a dict or None, got {metadata}")
+     if metadata is None:
+         return metadata
+     if len(metadata) == 0:
+         raise ValueError(f"Expected metadata to be a non-empty dict, got {metadata}")
+     for key, value in metadata.items():
+         if not isinstance(key, str):
+             raise TypeError(
+                 f"Expected metadata key to be a str, got {key} which is a {type(key)}"
+             )
+         # isinstance(True, int) evaluates to True, so we need to check for bools separately
+         if not isinstance(value, bool) and not isinstance(value, (str, int, float)):
+             raise ValueError(
+                 f"Expected metadata value to be a str, int, float or bool, got {value} which is a {type(value)}"
+             )
+     return metadata
+
+
+ def validate_update_metadata(metadata: UpdateMetadata) -> UpdateMetadata:
+     """Validates metadata to ensure it is a dictionary of strings to strings, ints, floats or bools"""
+     if not isinstance(metadata, dict) and metadata is not None:
+         raise ValueError(f"Expected metadata to be a dict or None, got {metadata}")
+     if metadata is None:
+         return metadata
+     if len(metadata) == 0:
+         raise ValueError(f"Expected metadata to be a non-empty dict, got {metadata}")
+     for key, value in metadata.items():
+         if not isinstance(key, str):
+             raise ValueError(f"Expected metadata key to be a str, got {key}")
+         # isinstance(True, int) evaluates to True, so we need to check for bools separately
+         if not isinstance(value, bool) and not isinstance(
+             value, (str, int, float, type(None))
+         ):
+             raise ValueError(
+                 f"Expected metadata value to be a str, int, float, bool or None, got {value}"
+             )
+     return metadata
+
+
+ def validate_metadatas(metadatas: Metadatas) -> Metadatas:
+     """Validates metadatas to ensure it is a list of dictionaries of strings to strings, ints, floats or bools"""
+     if not isinstance(metadatas, list):
+         raise ValueError(f"Expected metadatas to be a list, got {metadatas}")
+     for metadata in metadatas:
+         validate_metadata(metadata)
+     return metadatas
+
+
+ def validate_where(where: Where) -> Where:
+     """
+     Validates where to ensure it is a dictionary of strings to strings, ints, floats or operator expressions,
+     or in the case of $and and $or, a list of where expressions
+     """
+     if not isinstance(where, dict):
+         raise ValueError(f"Expected where to be a dict, got {where}")
+     if len(where) != 1:
+         raise ValueError(f"Expected where to have exactly one operator, got {where}")
+     for key, value in where.items():
+         if not isinstance(key, str):
+             raise ValueError(f"Expected where key to be a str, got {key}")
+         if (
+             key != "$and"
+             and key != "$or"
+             and key != "$in"
+             and key != "$nin"
+             and not isinstance(value, (str, int, float, dict))
+         ):
+             raise ValueError(
+                 f"Expected where value to be a str, int, float, or operator expression, got {value}"
+             )
+         if key == "$and" or key == "$or":
+             if not isinstance(value, list):
+                 raise ValueError(
+                     f"Expected where value for $and or $or to be a list of where expressions, got {value}"
+                 )
+             if len(value) <= 1:
+                 raise ValueError(
+                     f"Expected where value for $and or $or to be a list with at least two where expressions, got {value}"
+                 )
+             for where_expression in value:
+                 validate_where(where_expression)
+         # Value is an operator expression
+         if isinstance(value, dict):
+             # Ensure there is only one operator
+             if len(value) != 1:
+                 raise ValueError(
+                     f"Expected operator expression to have exactly one operator, got {value}"
+                 )
+
+             for operator, operand in value.items():
+                 # Only numbers can be compared with gt, gte, lt, lte
+                 if operator in ["$gt", "$gte", "$lt", "$lte"]:
+                     if not isinstance(operand, (int, float)):
+                         raise ValueError(
+                             f"Expected operand value to be an int or a float for operator {operator}, got {operand}"
+                         )
+                 if operator in ["$in", "$nin"]:
+                     if not isinstance(operand, list):
+                         raise ValueError(
+                             f"Expected operand value to be a list for operator {operator}, got {operand}"
+                         )
+                 if operator not in [
+                     "$gt",
+                     "$gte",
+                     "$lt",
+                     "$lte",
+                     "$ne",
+                     "$eq",
+                     "$in",
+                     "$nin",
+                 ]:
+                     raise ValueError(
+                         f"Expected where operator to be one of $gt, $gte, $lt, $lte, $ne, $eq, $in, $nin, "
+                         f"got {operator}"
+                     )
+
+                 if not isinstance(operand, (str, int, float, list)):
+                     raise ValueError(
+                         f"Expected where operand value to be a str, int, float, or list of those types, got {operand}"
+                     )
+                 if isinstance(operand, list) and (
+                     len(operand) == 0
+                     or not all(isinstance(x, type(operand[0])) for x in operand)
+                 ):
+                     raise ValueError(
+                         f"Expected where operand value to be a non-empty list with all values of the same type, "
+                         f"got {operand}"
+                     )
+     return where
+
+
+ def validate_where_document(where_document: WhereDocument) -> WhereDocument:
+     """
+     Validates where_document to ensure it is a dictionary of WhereDocumentOperator to strings, or in the case of $and and $or,
+     a list of where_document expressions
+     """
+     if not isinstance(where_document, dict):
+         raise ValueError(
+             f"Expected where document to be a dictionary, got {where_document}"
+         )
+     if len(where_document) != 1:
+         raise ValueError(
+             f"Expected where document to have exactly one operator, got {where_document}"
+         )
+     for operator, operand in where_document.items():
+         if operator not in ["$contains", "$not_contains", "$and", "$or"]:
+             raise ValueError(
+                 f"Expected where document operator to be one of $contains, $not_contains, $and, $or, got {operator}"
+             )
+         if operator == "$and" or operator == "$or":
+             if not isinstance(operand, list):
+                 raise ValueError(
+                     f"Expected document value for $and or $or to be a list of where document expressions, got {operand}"
+                 )
+             if len(operand) <= 1:
+                 raise ValueError(
+                     f"Expected document value for $and or $or to be a list with at least two where document expressions, got {operand}"
+                 )
+             for where_document_expression in operand:
+                 validate_where_document(where_document_expression)
+         # Value is a $contains or $not_contains operator
+         elif not isinstance(operand, str):
+             raise ValueError(
+                 f"Expected where document operand value for operator $contains to be a str, got {operand}"
+             )
+         elif len(operand) == 0:
+             raise ValueError(
+                 "Expected where document operand value for operator $contains to be a non-empty str"
+             )
+     return where_document
+
+
+ def validate_include(include: Include, allow_distances: bool) -> Include:
+     """Validates include to ensure it is a list of strings. Since get does not allow distances, allow_distances is used
+     to control if distances is allowed"""
+
+     if not isinstance(include, list):
+         raise ValueError(f"Expected include to be a list, got {include}")
+     for item in include:
+         if not isinstance(item, str):
+             raise ValueError(f"Expected include item to be a str, got {item}")
+         allowed_values = ["embeddings", "documents", "metadatas", "uris", "data"]
+         if allow_distances:
+             allowed_values.append("distances")
+         if item not in allowed_values:
+             raise ValueError(
+                 f"Expected include item to be one of {', '.join(allowed_values)}, got {item}"
+             )
+     return include
+
+
+ def validate_n_results(n_results: int) -> int:
+     """Validates n_results to ensure it is a positive integer, since hnswlib does not allow n_results to be negative."""
+     # Check the number of requested results
+     if not isinstance(n_results, int):
+         raise ValueError(
+             f"Expected requested number of results to be an int, got {n_results}"
+         )
+     if n_results <= 0:
+         raise TypeError(
+             f"Number of requested results {n_results} cannot be negative or zero."
+         )
+     return n_results
+
+
+ def validate_embeddings(embeddings: Embeddings) -> Embeddings:
+     """Validates embeddings to ensure it is a list of lists of ints or floats"""
+     if not isinstance(embeddings, list):
+         raise ValueError(f"Expected embeddings to be a list, got {embeddings}")
+     if len(embeddings) == 0:
+         raise ValueError(
+             f"Expected embeddings to be a list with at least one item, got {embeddings}"
+         )
+     if not all([isinstance(e, list) for e in embeddings]):
+         raise ValueError(
+             f"Expected each embedding in the embeddings to be a list, got {embeddings}"
+         )
+     for i, embedding in enumerate(embeddings):
+         if len(embedding) == 0:
+             raise ValueError(
+                 f"Expected each embedding in the embeddings to be a non-empty list, got empty embedding at pos {i}"
+             )
+         if not all(
+             [
+                 isinstance(value, (int, float)) and not isinstance(value, bool)
+                 for value in embedding
+             ]
+         ):
+             raise ValueError(
+                 f"Expected each value in the embedding to be an int or float, got {embeddings}"
+             )
+     return embeddings
+
+
+ def validate_batch(
+     batch: Tuple[
+         IDs,
+         Optional[Embeddings],
+         Optional[Metadatas],
+         Optional[Documents],
+         Optional[URIs],
+     ],
+     limits: Dict[str, Any],
+ ) -> None:
+     if len(batch[0]) > limits["max_batch_size"]:
+         raise ValueError(
+             f"Batch size {len(batch[0])} exceeds maximum batch size {limits['max_batch_size']}"
+         )
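A hedged usage sketch of `validate_batch`: only the length of the ids entry (`batch[0]`) is checked against the caller-supplied `max_batch_size` limit, which `SegmentAPI` takes from the producer (the sample values are illustrative):

```python
ids = ["a", "b", "c"]
batch = (ids, None, None, None, None)  # (ids, embeddings, metadatas, documents, uris)

validate_batch(batch, {"max_batch_size": 10})  # passes silently
# validate_batch(batch, {"max_batch_size": 2})
#   -> ValueError: Batch size 3 exceeds maximum batch size 2
```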