kernel
Commit e94ff91 (verified) · Parent(s): 74b8263
Committed by danieldk and YangKai0616

Add support for XPU (sycl) (#3)

- Add support for XPU(sycl) (b87db80d558d76f8193f293889e6aee81e9d3743)
- Revert changes to the build folder (f425aa577e0fb59ed08c568985b007ab686da778)
- Delete pyc files (acc75a3bf1a67e06659873b98909b4e997b3f45f)


Co-authored-by: Kai Yang <[email protected]>

build.toml CHANGED
@@ -9,3 +9,11 @@ src = ["torch-ext/torch_binding.cpp"]
 backend = "cuda"
 depends = ["torch"]
 src = ["rotary/rotary_cuda.cu"]
+
+[kernel.rotary_xpu]
+backend = "xpu"
+depends = ["torch"]
+src = [
+    "rotary-xpu/rotary_xpu.cpp",
+    "rotary-xpu/rotary_xpu.hpp",
+]
flake.lock CHANGED
@@ -17,11 +17,11 @@
     },
     "flake-compat_2": {
       "locked": {
-        "lastModified": 1733328505,
-        "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
         "owner": "edolstra",
         "repo": "flake-compat",
-        "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
         "type": "github"
       },
       "original": {
@@ -73,11 +73,11 @@
         "nixpkgs": "nixpkgs"
       },
       "locked": {
-        "lastModified": 1753354560,
-        "narHash": "sha256-vmOfRmr0Qm/IbZTWB2sBn+UFrABSTTA/cTg+m27Yt/E=",
+        "lastModified": 1757493151,
+        "narHash": "sha256-eirWlcvs2rjZmU8JcF4CKN1IEnNfpQnGuf2qbK3IQh8=",
         "owner": "huggingface",
         "repo": "hf-nix",
-        "rev": "7f2aceda2a2e72cd573bdb25e5c0667fd75f89d3",
+        "rev": "503cd4eb9866103c983dbef93d9ad5db4fb6b415",
         "type": "github"
       },
       "original": {
@@ -98,33 +98,32 @@
         ]
       },
       "locked": {
-        "lastModified": 1753354632,
-        "narHash": "sha256-31SX3Raiyx0qCuY9JSlx9ZZgxljeUxvW+JdujjxbofQ=",
+        "lastModified": 1757570810,
+        "narHash": "sha256-YFWQwy2LKbhjdLW8wkyNkE/+Vbdn6qlJif2CKvBT9Qo=",
         "owner": "huggingface",
         "repo": "kernel-builder",
-        "rev": "524b628fd8e58525dbd28455bffb0628092c5265",
+        "rev": "1201847af3ff757b65015c6e06b5bd75896d2d4b",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
-        "ref": "torch-2.8",
         "repo": "kernel-builder",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1752785354,
-        "narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
+        "lastModified": 1755963616,
+        "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
+        "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
         "type": "github"
       },
       "original": {
         "owner": "nixos",
+        "ref": "nixos-unstable-small",
         "repo": "nixpkgs",
-        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
        "type": "github"
      }
    },
flake.nix CHANGED
@@ -1,15 +1,9 @@
 {
-  description = "Flake for rotary kernel";
-
+  description = "Flake for Torch kernel extension";
   inputs = {
-    kernel-builder.url = "github:huggingface/kernel-builder/torch-2.8";
+    kernel-builder.url = "github:huggingface/kernel-builder";
   };
-
-  outputs =
-    {
-      self,
-      kernel-builder,
-    }:
+  outputs = { self, kernel-builder, }:
     kernel-builder.lib.genFlakeOutputs {
       path = ./.;
       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
rotary-xpu/rotary_xpu.cpp ADDED
@@ -0,0 +1,40 @@
+#include <torch/all.h>
+#include "rotary_xpu.hpp"
+
+void _apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
+                   torch::Tensor const &cos, torch::Tensor const &sin,
+                   torch::Tensor &out1, torch::Tensor &out2,
+                   bool const conj) {
+  auto iter = at::TensorIteratorConfig()
+                  .add_output(out1)
+                  .add_output(out2)
+                  .add_input(x1)
+                  .add_input(x2)
+                  .add_input(cos)
+                  .add_input(sin)
+                  .check_all_same_dtype(false)
+                  .promote_inputs_to_common_dtype(false)
+                  .build();
+
+  if (!conj) {
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel_xpu", [&] {
+      gpu_kernel_multiple_outputs(
+          iter, [] (scalar_t x1, scalar_t x2, scalar_t cos,
+                    scalar_t sin) -> std::tuple<scalar_t, scalar_t> {
+            scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin);
+            scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos);
+            return {out1, out2};
+          });
+    });
+  } else {
+    AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel_xpu", [&] {
+      gpu_kernel_multiple_outputs(
+          iter, [] (scalar_t x1, scalar_t x2, scalar_t cos,
+                    scalar_t sin) -> std::tuple<scalar_t, scalar_t> {
+            scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin);
+            scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos);
+            return {out1, out2};
+          });
+    });
+  }
+}
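
Note: the lambdas above apply the standard 2D rotation used by rotary embeddings elementwise, computing in float and casting back to the input dtype; conj = true applies the inverse rotation:

```latex
\begin{aligned}
\texttt{conj}=\text{false}:&\quad
\begin{pmatrix}\text{out}_1\\ \text{out}_2\end{pmatrix}
=\begin{pmatrix}\cos\theta & -\sin\theta\\ \sin\theta & \cos\theta\end{pmatrix}
\begin{pmatrix}x_1\\ x_2\end{pmatrix}, \\
\texttt{conj}=\text{true}:&\quad
\begin{pmatrix}\text{out}_1\\ \text{out}_2\end{pmatrix}
=\begin{pmatrix}\cos\theta & \sin\theta\\ -\sin\theta & \cos\theta\end{pmatrix}
\begin{pmatrix}x_1\\ x_2\end{pmatrix}.
\end{aligned}
```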
rotary-xpu/rotary_xpu.hpp ADDED
@@ -0,0 +1,375 @@
+#include <ATen/core/TensorBody.h>
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/native/TensorIterator.h>
+#include <sycl/sycl.hpp>
+#include <ATen/core/Array.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/TypeCast.h>
+#include <cstdint>
+#include <type_traits>
+#include <array>
+#include <c10/core/ScalarType.h>
+#include <c10/xpu/XPUStream.h>
+#include <ATen/xpu/XPUContext.h>
+
+constexpr int MAX_DIMS = 12;
+
+struct LoadWithoutCast {
+  template <typename scalar_t>
+  C10_DEVICE scalar_t load(char* base_ptr, uint32_t offset, int arg) {
+    return c10::load(reinterpret_cast<scalar_t*>(base_ptr) + offset);
+  }
+};
+
+struct StoreWithoutCast {
+  template <typename scalar_t>
+  C10_DEVICE void store(scalar_t value, char* base_ptr, uint32_t offset, int arg = 0) {
+    *(reinterpret_cast<scalar_t*>(base_ptr) + offset) = value;
+  }
+};
+
+template <template <int i> typename func, int end, int current = 0>
+struct static_unroll {
+  template <typename... Args>
+  static inline C10_HOST_DEVICE void with_args(Args&&... args) {
+    func<current>::apply(std::forward<Args>(args)...);
+    static_unroll<func, end, current + 1>::with_args(args...);
+  }
+};
+
+template <template <int i> typename func, int end>
+struct static_unroll<func, end, end> {
+  template <typename... Args>
+  static inline C10_HOST_DEVICE void with_args(Args... args) {}
+};
+
+template <int current>
+struct multi_outputs_store_helper {
+  template <int ntensors, int num_outputs, typename... Args>
+  static C10_HOST_DEVICE void apply(
+      at::detail::Array<char*, ntensors> data,
+      at::detail::Array<uint32_t, num_outputs> offsets,
+      std::tuple<Args...> ret) {
+    using T = typename std::tuple_element<current, std::tuple<Args...>>::type;
+    T* to = reinterpret_cast<T*>(data[current]) + offsets[current];
+    *to = std::get<current>(ret);
+  }
+};
+
+template <int arg_index>
+struct unroll_load_helper {
+  template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
+  static C10_DEVICE void apply(
+      policy_t& self,
+      args_t* args,
+      offset_t offset,
+      loader_t loader,
+      int j,
+      int num_outputs) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    std::get<arg_index>(args[j]) = loader.template load<arg_t>(
+        self.data[arg_index + num_outputs], offset[arg_index], arg_index);
+  }
+};
+
+template <int item_work_size, typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
+struct multi_outputs_unroll {
+  data_t data;
+  int remaining;
+  inp_calc_t input_offset_calculator;
+  out_calc_t output_offset_calculator;
+  LoadWithoutCast loader;
+  StoreWithoutCast storer;
+  int item_idx;
+  int group_idx;
+  int num_items_per_group;
+  int group_work_size;
+
+  multi_outputs_unroll(
+      data_t data,
+      int remaining,
+      inp_calc_t ic,
+      out_calc_t oc,
+      int item_idx,
+      int group_idx,
+      int num_items_per_group)
+      : data(data),
+        remaining(remaining),
+        input_offset_calculator(ic),
+        output_offset_calculator(oc),
+        item_idx(item_idx),
+        group_idx(group_idx),
+        num_items_per_group(num_items_per_group),
+        group_work_size(item_work_size * num_items_per_group) {}
+
+  inline bool check_inbounds(int item_work_elem) const {
+    return (item_idx + item_work_elem * num_items_per_group < remaining);
+  }
+
+  template <typename args_t>
+  inline void load(args_t* args) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    int item_idx_ = item_idx;
+    #pragma unroll
+    for (int i = 0; i < item_work_size; i++) {
+      if (item_idx_ >= remaining) {
+        return;
+      }
+      int linear_idx = item_idx_ + group_work_size * group_idx;
+      auto offset = input_offset_calculator.get(linear_idx);
+      static_unroll<unroll_load_helper, arity>::with_args(
+          *this, args, offset, loader, i, num_outputs);
+      item_idx_ += num_items_per_group;
+    }
+  }
+
+  template <typename return_t>
+  inline void store(return_t* from) {
+    int item_idx_ = item_idx;
+    #pragma unroll
+    for (int i = 0; i < item_work_size; i++) {
+      if (item_idx_ >= this->remaining) {
+        return;
+      }
+      int linear_idx = item_idx_ + group_work_size * group_idx;
+      auto offsets = this->output_offset_calculator.get(linear_idx);
+      static_unroll<multi_outputs_store_helper, num_outputs>::with_args(this->data, offsets, from[i]);
+      item_idx_ += num_items_per_group;
+    }
+  }
+};
+
+template <int item_work_size, typename func_t, typename policy_t>
+inline void elementwise_kernel_helper(func_t f, policy_t policy) {
+  using traits = function_traits<func_t>;
+  using return_t = typename traits::result_type;
+  using args_t = typename traits::ArgsTuple;
+
+  return_t results[item_work_size];
+  args_t args[item_work_size];
+
+  policy.load(args);
+
+  #pragma unroll
+  for (int i = 0; i < item_work_size; i++) {
+    if (policy.check_inbounds(i)) {
+      results[i] = std::apply(f, args[i]);
+    }
+  }
+
+  policy.store(results);
+}
+
+template <int num_outputs, typename func_t, typename array_t, typename in_calc_t, typename out_calc_t>
+struct UnrolledElementwiseForMultiOutputsKernel {
+  static constexpr int item_work_size = 4;
+
+  void operator()(sycl::nd_item<1> item_id) const {
+    int grpsz = item_id.get_local_range(0);
+    int grpid = item_id.get_group(0);
+    int lid = item_id.get_local_id(0);
+    int remaining = numel_ - item_work_size * grpsz * grpid;
+    auto policy = multi_outputs_unroll<item_work_size, array_t, in_calc_t, out_calc_t, num_outputs>(
+        data_, remaining, ic_, oc_, lid, grpid, grpsz);
+    elementwise_kernel_helper<item_work_size>(f_, policy);
+  };
+
+  UnrolledElementwiseForMultiOutputsKernel(int numel, func_t f, array_t data, in_calc_t ic, out_calc_t oc)
+      : numel_(numel), f_(f), data_(data), ic_(ic), oc_(oc) {}
+
+ private:
+  int numel_;
+  func_t f_;
+  array_t data_;
+  in_calc_t ic_;
+  out_calc_t oc_;
+};
+
+template <typename Value>
+struct IntDivider {
+  IntDivider() = default;
+  IntDivider(Value d) : divisor(d) {}
+
+  C10_HOST_DEVICE inline Value div(Value n) const {
+    return n / divisor;
+  }
+  C10_HOST_DEVICE inline Value mod(Value n) const {
+    return n % divisor;
+  }
+  C10_HOST_DEVICE inline auto divmod(Value n) const {
+    return std::make_pair(n / divisor, n % divisor);
+  }
+
+  Value divisor;
+};
+
+template <int NARGS, typename index_t = uint32_t, bool signed_strides = false>
+struct OffsetCalculator {
+  using stride_t = std::conditional_t<signed_strides, std::make_signed_t<index_t>, index_t>;
+  using offset_type = at::detail::Array<stride_t, std::max<int>(NARGS, 1)>;
+
+  OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes = nullptr)
+      : dims(dims) {
+    TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims");
+    for (int i = 0; i < dims; i++) {
+      sizes_[i] = IntDivider<index_t>(sizes[i]);
+      for (int arg = 0; arg < NARGS; arg++) {
+        int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]);
+        strides_[i][arg] = strides[arg][i] / element_size;
+      }
+    }
+  }
+
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type offsets;
+    #pragma unroll
+    for (int arg = 0; arg < NARGS; arg++) {
+      offsets[arg] = 0;
+    }
+
+    #pragma unroll
+    for (int dim = 0; dim < MAX_DIMS; ++dim) {
+      if (dim == dims) {
+        break;
+      }
+      auto divmod = sizes_[dim].divmod(linear_idx);
+      linear_idx = divmod.first;
+
+      #pragma unroll
+      for (int arg = 0; arg < NARGS; arg++) {
+        offsets[arg] += divmod.second * strides_[dim][arg];
+      }
+    }
+    return offsets;
+  }
+
+  int dims;
+  IntDivider<index_t> sizes_[MAX_DIMS];
+  stride_t strides_[MAX_DIMS][std::max<int>(NARGS, 1)];
+};
+
+template <int N>
+static OffsetCalculator<N> make_input_offset_calculator(const at::TensorIteratorBase& iter) {
+  constexpr int array_size = std::max<int>(N, 1);
+  TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs());
+  std::array<const int64_t*, array_size> strides;
+  int64_t element_sizes[array_size];
+  for (int i = 0; i < N; i++) {
+    strides[i] = iter.strides(i + iter.noutputs()).data();
+    element_sizes[i] = iter.element_size(i + iter.noutputs());
+  }
+  return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+template <int num_outputs = 1>
+static OffsetCalculator<num_outputs> make_output_offset_calculator(const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs());
+  std::array<const int64_t*, num_outputs> strides;
+  int64_t element_sizes[num_outputs];
+  for (int i = 0; i < num_outputs; i++) {
+    strides[i] = iter.strides(i).data();
+    element_sizes[i] = iter.element_size(i);
+  }
+  return OffsetCalculator<num_outputs>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
+}
+
+static inline int64_t syclMaxWorkItemsPerSubSlice(at::DeviceIndex dev_id = c10::xpu::getCurrentXPUStream().device_index()) {
+  auto* dev_prop = at::xpu::getDeviceProperties(dev_id);
+  int64_t simd_width = dev_prop->sub_group_sizes[0];
+  int64_t eu_count = dev_prop->gpu_eu_count_per_subslice;
+  return simd_width * eu_count;
+}
+
+template<class T>
+T ceil_div(T dividend, T divisor) {
+  return (dividend + divisor - 1) / divisor;
+}
+
+template <typename ker_t>
+static inline void sycl_kernel_submit(int64_t global_range, int64_t local_range, ::sycl::queue q, ker_t ker) {
+  q.parallel_for(
+      sycl::nd_range<1>(sycl::range<1>(global_range), sycl::range<1>(local_range)),
+      ker
+  );
+}
+
+template <int num_outputs, typename func_t, typename array_t, typename in_calc_t, typename out_calc_t>
+static inline void launch_unrolled_kernel_for_multi_outputs(
+    int64_t N,
+    const func_t& f,
+    array_t data,
+    in_calc_t ic,
+    out_calc_t oc) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+
+  auto ker = UnrolledElementwiseForMultiOutputsKernel<num_outputs, func_t, array_t, in_calc_t, out_calc_t>(N, f, data, ic, oc);
+  using ker_t = decltype(ker);
+
+  int wg_sz = syclMaxWorkItemsPerSubSlice();
+  int num_wg = ceil_div<int>(N, ker_t::item_work_size * wg_sz);
+  sycl_kernel_submit(wg_sz * num_wg, wg_sz, c10::xpu::getCurrentXPUStream().queue(), ker);
+}
+
+template <int N>
+struct TrivialOffsetCalculator {
+  using offset_type = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
+
+  C10_HOST_DEVICE offset_type get(uint32_t linear_idx) const {
+    offset_type offsets;
+    #pragma unroll
+    for (int arg = 0; arg < N; arg++) {
+      offsets[arg] = linear_idx;
+    }
+    return offsets;
+  }
+};
+
+template <typename func_t>
+void gpu_kernel_multiple_outputs_impl(at::TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits<func_t>;
+  using output_t = typename traits::result_type;
+  constexpr int num_outputs = std::tuple_size<output_t>::value;
+  constexpr int num_inputs = traits::arity;
+  constexpr int ntensors = num_outputs + num_inputs;
+
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors);
+
+  at::detail::Array<char*, ntensors> data;
+  for (int i = 0; i < ntensors; i++) {
+    data[i] = (char*)iter.data_ptr(i);
+  }
+
+  int64_t numel = iter.numel();
+
+  if (iter.is_contiguous()) {
+    auto input_calc = TrivialOffsetCalculator<num_inputs>();
+    auto output_calc = TrivialOffsetCalculator<num_outputs>();
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
+  } else {
+    auto input_calc = make_input_offset_calculator<num_inputs>(iter);
+    auto output_calc = make_output_offset_calculator<num_outputs>(iter);
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
+  }
+}
+
+template <typename func_t>
+void gpu_kernel_multiple_outputs(at::TensorIteratorBase& iter, const func_t& f) {
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(iter.device(arg).is_xpu());
+  }
+
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto& sub_iter : iter.with_32bit_indexing()) {
+      gpu_kernel_multiple_outputs(sub_iter, f);
+    }
+    return;
+  }
+
+  gpu_kernel_multiple_outputs_impl(iter, f);
+}
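
Note: for the non-contiguous path, OffsetCalculator::get converts a flat TensorIterator index into per-tensor element offsets by repeated divmod over the iterator's shape (strides are pre-divided by the element size in the constructor). A rough Python sketch of that decomposition for a single tensor; the helper name and the example sizes/strides are illustrative, not part of the extension:

```python
# Illustrative sketch of the OffsetCalculator::get decomposition for one tensor.
# `sizes`/`strides` are per-dimension, fastest-varying dimension first, with
# strides expressed in elements (mirroring the C++ division by element_size).
def element_offset(linear_idx: int, sizes: list[int], strides: list[int]) -> int:
    offset = 0
    for size, stride in zip(sizes, strides):
        linear_idx, coord = divmod(linear_idx, size)  # peel off one dimension
        offset += coord * stride
    return offset

# Example: shape (4, 3) with element strides (2, 8): linear index 7 -> 3*2 + 1*8 = 14.
print(element_offset(7, sizes=[4, 3], strides=[2, 8]))  # 14
```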
tests/__init__.py ADDED
(empty file)
tests/test_rotary.py ADDED
@@ -0,0 +1,127 @@
+import pytest
+import torch
+
+from tests.utils import infer_device, supports_bfloat16
+from kernels import get_local_kernel
+from pathlib import Path
+
+# import rotary
+# from transformers.trainer_utils import set_seed
+# set_seed(42)
+
+# Set the local repo path, relative path
+repo_path = Path(__file__).parent.parent
+rotary = get_local_kernel(repo_path=repo_path, package_name="rotary")
+
+def apply_rotary_torch(x1: torch.Tensor, x2: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, conj: bool = False):
+    assert x1.shape == x2.shape, "x1 and x2 must have the same shape"
+
+    if not conj:
+        out1 = x1 * cos - x2 * sin
+        out2 = x1 * sin + x2 * cos
+    else:
+        out1 = x1 * cos + x2 * sin
+        out2 = -x1 * sin + x2 * cos
+    return out1, out2
+
+
+def apply_rotary_torch_wrapper(q, k, cos, sin, conj: bool = False):
+    """the wrapper for apply_rotary_torch"""
+    rotary_dim = cos.shape[-1]
+
+    # apply rotation encoding to Q
+    q1 = q[..., :rotary_dim]
+    q2 = q[..., rotary_dim : 2 * rotary_dim]
+    q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
+    q_out = torch.cat([q_out_1, q_out_2, q[..., 2 * rotary_dim:]], dim=-1)
+
+    # apply rotation encoding to K
+    k1 = k[..., :rotary_dim]
+    k2 = k[..., rotary_dim : 2 * rotary_dim]
+    k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
+    k_out = torch.cat([k_out_1, k_out_2, k[..., 2 * rotary_dim:]], dim=-1)
+
+    return q_out, k_out
+
+
+def apply_rotary_kernel_wrapper(q, k, cos, sin, conj: bool = False):
+    """the wrapper for apply_rotary_kernel"""
+    rotary_dim = cos.shape[-1]
+
+    # apply rotation encoding to Q
+    q1 = q[..., :rotary_dim]
+    q2 = q[..., rotary_dim : 2 * rotary_dim]
+    rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
+
+    # apply rotation encoding to K
+    k1 = k[..., :rotary_dim]
+    k2 = k[..., rotary_dim : 2 * rotary_dim]
+    rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
+
+
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("nheads", [8, 16])
+@pytest.mark.parametrize("seqlen", [128, 256])
+@pytest.mark.parametrize("headdim, rotary_dim", [(64, 32), (128, 64), (64, 30)])
+@pytest.mark.parametrize("qk_dim", [3, 4])
+@pytest.mark.parametrize(
+    "dtype, atol, rtol",
+    [
+        (torch.float32, 1e-5, 1e-5),
+        pytest.param(
+            torch.bfloat16,
+            1e-1,
+            1e-5,
+            marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+        ),
+    ],
+)
+@pytest.mark.parametrize("conj", [False, True])
+@pytest.mark.flaky(max_runs=2, min_passes=1)
+def test_rotary_equivalence(batch_size, nheads, seqlen, headdim, rotary_dim, qk_dim, dtype, atol, rtol, conj):
+    device = infer_device()
+    if device is None:
+        pytest.skip("No suitable device found for testing")
+
+    if qk_dim == 4:
+        q_shape = (batch_size, seqlen, nheads, headdim)
+        cos_sin_shape = (seqlen, 1, rotary_dim)
+    elif qk_dim == 3:
+        q_shape = (batch_size * seqlen, nheads, headdim)
+        cos_sin_shape = (batch_size * seqlen, 1, rotary_dim)
+
+    q_orig = torch.randn(q_shape, device=device, dtype=dtype)
+    k_orig = torch.randn(q_shape, device=device, dtype=dtype)
+    cos = torch.randn(cos_sin_shape, device=device, dtype=dtype)
+    sin = torch.randn(cos_sin_shape, device=device, dtype=dtype)
+
+    q_kernel, k_kernel = q_orig.clone(), k_orig.clone()
+    q_torch, k_torch = q_orig.clone(), k_orig.clone()
+
+    q_torch_out, k_torch_out = apply_rotary_torch_wrapper(q_torch, k_torch, cos, sin, conj)
+    apply_rotary_kernel_wrapper(q_kernel, k_kernel, cos, sin, conj)
+
+    # verify the rotation results of Q and K are consistent
+    try:
+        assert torch.allclose(q_torch_out, q_kernel, atol=atol, rtol=rtol), "Rotary transformation results for Q do not match"
+    except AssertionError:
+        diff_q = torch.abs(q_torch_out - q_kernel)
+        max_diff_q = torch.max(diff_q)
+        print(f"Max difference for Q: {max_diff_q}")
+        raise
+    try:
+        assert torch.allclose(k_torch_out, k_kernel, atol=atol, rtol=rtol), "Rotary transformation results for K do not match"
+    except AssertionError:
+        diff_k = torch.abs(k_torch_out - k_kernel)
+        max_diff_k = torch.max(diff_k)
+        print(f"Max difference for K: {max_diff_k}")
+        raise
+
+    # verify the non-rotated part of Q and K remains unchanged
+    if (2 * rotary_dim) < headdim:
+        assert torch.equal(
+            q_kernel[..., 2 * rotary_dim:], q_orig[..., 2 * rotary_dim:]
+        ), "Non-rotated part of Q should be unchanged"
+        assert torch.equal(
+            k_kernel[..., 2 * rotary_dim:], k_orig[..., 2 * rotary_dim:]
+        ), "Non-rotated part of K should be unchanged"
tests/utils.py ADDED
@@ -0,0 +1,23 @@
+import torch
+
+
+def infer_device():
+    """
+    Get current device name based on available devices
+    """
+    if torch.cuda.is_available():  # Works for both Nvidia and AMD
+        return "cuda"
+    elif torch.xpu.is_available():
+        return "xpu"
+    else:
+        return None
+
+
+def supports_bfloat16():
+    device = infer_device()
+    if device == "cuda":
+        return torch.cuda.get_device_capability() >= (8, 0)  # Ampere and newer
+    elif device == "xpu":
+        return True
+    else:
+        return False
torch-ext/torch_binding.cpp CHANGED
@@ -1,12 +1,17 @@
 #include <torch/all.h>
+
+#if defined(CUDA_KERNEL)
 #include <c10/cuda/CUDAGuard.h>
+#elif defined(XPU_KERNEL)
+#include <c10/core/DeviceGuard.h>
+#endif
 
 #include "registration.h"
 
-#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA, #x " must be on CUDA")
+#define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA || x.device().type() == torch::kXPU, #x " must be on CUDA or XPU")
 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
 
-void apply_rotary_cuda(torch::Tensor const &x1, torch::Tensor const &x2,
+void _apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
                    torch::Tensor const &cos, torch::Tensor const &sin,
                    torch::Tensor &out1, torch::Tensor &out2,
                    bool const conj);
@@ -27,16 +32,23 @@ void apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
   TORCH_CHECK(cos.sizes() == sin.sizes());
   TORCH_CHECK(out1.sizes() == out2.sizes());
 
+#if defined(CUDA_KERNEL)
   // Otherwise the kernel will be launched from cuda:0 device
   at::cuda::CUDAGuard device_guard{x1.device()};
-
-  apply_rotary_cuda(x1, x2, cos, sin, out1, out2, conj);
+#elif defined(XPU_KERNEL)
+  c10::DeviceGuard device_guard{x1.device()};
+#endif
+  _apply_rotary(x1, x2, cos, sin, out1, out2, conj);
 }
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.def("apply_rotary(Tensor x1, Tensor x2, Tensor cos, Tensor sin,"
           "Tensor! out1, Tensor! out2, bool conj) -> ()");
-  ops.impl("apply_rotary", torch::kCUDA, &apply_rotary);
+#if defined(CUDA_KERNEL)
+  ops.impl("apply_rotary", torch::kCUDA, &apply_rotary);
+#elif defined(XPU_KERNEL)
+  ops.impl("apply_rotary", torch::kXPU, &apply_rotary);
+#endif
 }
 
 REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
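
Note: since the op is registered for torch::kCUDA or torch::kXPU depending on the build, Python-side usage is identical on both backends. A minimal usage sketch mirroring the test wrappers (assumptions: the kernels package is installed, the extension was built for the active device, the working directory is the repository root, and cos/sin are random here purely to show shapes):

```python
# Minimal usage sketch; see tests/test_rotary.py for the full equivalence check.
from pathlib import Path

import torch
from kernels import get_local_kernel

rotary = get_local_kernel(repo_path=Path("."), package_name="rotary")  # repo root assumed

device = "xpu" if torch.xpu.is_available() else "cuda"
rotary_dim = 32
q = torch.randn(2, 128, 8, 64, device=device)         # (batch, seqlen, nheads, headdim)
cos = torch.randn(128, 1, rotary_dim, device=device)  # (seqlen, 1, rotary_dim)
sin = torch.randn(128, 1, rotary_dim, device=device)

q1, q2 = q[..., :rotary_dim], q[..., rotary_dim : 2 * rotary_dim]
rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False)  # rotates q1/q2 in place
```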