Akos Hadnagy committed
Commit 1e1ffe8 · 1 Parent(s): ff615fc
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. build.toml +35 -0
  2. csrc/bak.ops.cu +21 -0
  3. csrc/cuda_util.h +62 -0
  4. csrc/cumsum.h +163 -0
  5. csrc/grouped_gemm/fill_arguments.cuh +141 -0
  6. csrc/grouped_gemm/grouped_gemm.cu +567 -0
  7. csrc/grouped_gemm/grouped_gemm.h +20 -0
  8. csrc/grouped_gemm/ops.cu +11 -0
  9. csrc/histogram.h +86 -0
  10. csrc/indices.h +95 -0
  11. csrc/new_cumsum.cu +161 -0
  12. csrc/new_cumsum.h +11 -0
  13. csrc/new_histogram.cu +85 -0
  14. csrc/new_histogram.h +10 -0
  15. csrc/new_indices.cu +97 -0
  16. csrc/new_indices.h +14 -0
  17. csrc/new_replicate.cu +220 -0
  18. csrc/new_replicate.h +17 -0
  19. csrc/new_sort.cu +90 -0
  20. csrc/new_sort.h +13 -0
  21. csrc/replicate.h +211 -0
  22. csrc/sort.h +91 -0
  23. flake.lock +168 -0
  24. flake.nix +24 -0
  25. tests/__init__.py +0 -0
  26. tests/conftest.py +110 -0
  27. tests/fixtures/autouse.py +107 -0
  28. tests/fixtures/fixtures.py +13 -0
  29. tests/layer_test.py +53 -0
  30. tests/layers/architectures.py +53 -0
  31. tests/layers/moe_test.py +199 -0
  32. tests/ops/binned_gather_test.py +71 -0
  33. tests/ops/binned_scatter_test.py +87 -0
  34. tests/ops/cumsum_test.py +44 -0
  35. tests/ops/histogram_test.py +82 -0
  36. tests/ops/padded_gather_test.py +94 -0
  37. tests/ops/padded_scatter_test.py +155 -0
  38. tests/ops/replicate_test.py +108 -0
  39. tests/ops/sort_test.py +65 -0
  40. tests/ops/topology_test.py +81 -0
  41. tests/ops_test.py +171 -0
  42. tests/parallel_layer_test.py +94 -0
  43. tests/test_gg.py +57 -0
  44. tests/test_mb_moe.py +48 -0
  45. tests/test_mb_moe_shared_expert.py +139 -0
  46. tests/test_mb_moe_shared_expert_multi.py +200 -0
  47. torch-ext/megablocks/__init__.py +202 -0
  48. torch-ext/megablocks/_layers/__init__.py +10 -0
  49. torch-ext/megablocks/_layers/activation_fn.py +33 -0
  50. torch-ext/megablocks/_layers/all_to_all.py +54 -0
build.toml ADDED
@@ -0,0 +1,35 @@
1
+ [general]
2
+ name = "megablocks"
3
+ universal = false
4
+
5
+ [torch]
6
+ src = [
7
+ "torch-ext/torch_binding.cpp",
8
+ "torch-ext/torch_binding.h"
9
+ ]
10
+
11
+ [kernel.megablocks]
12
+ backend = "rocm"
13
+ rocm-archs = [
14
+ "gfx942",
15
+ "gfx1030",
16
+ "gfx1100",
17
+ "gfx1101",
18
+ ]
19
+ depends = ["torch"]
20
+ src = [
21
+ "csrc/new_cumsum.h",
22
+ "csrc/new_cumsum.cu",
23
+ "csrc/new_histogram.h",
24
+ "csrc/new_histogram.cu",
25
+ "csrc/new_indices.h",
26
+ "csrc/new_indices.cu",
27
+ "csrc/new_replicate.cu",
28
+ "csrc/new_replicate.h",
29
+ "csrc/new_sort.h",
30
+ "csrc/new_sort.cu",
31
+ # vendored grouped gemm
32
+ #"csrc/grouped_gemm/fill_arguments.cuh",
33
+ #"csrc/grouped_gemm/grouped_gemm.cu",
34
+ #"csrc/grouped_gemm/grouped_gemm.h",
35
+ ]
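
The [kernel.megablocks] table above drives the ROCm build. As a quick sanity check of the config structure, the file can be read with Python's standard tomllib (Python 3.11+); this is only an illustrative sketch:

import tomllib  # standard library, Python 3.11+

with open("build.toml", "rb") as f:
    cfg = tomllib.load(f)

kernel = cfg["kernel"]["megablocks"]
print(kernel["backend"])       # "rocm"
print(kernel["rocm-archs"])    # ["gfx942", "gfx1030", "gfx1100", "gfx1101"]
print(len(kernel["src"]))      # number of kernel sources listed above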
csrc/bak.ops.cu ADDED
@@ -0,0 +1,21 @@
1
+ #include "cumsum.h"
2
+ #include "histogram.h"
3
+ #include "indices.h"
4
+ #include "replicate.h"
5
+ #include "sort.h"
6
+
7
+ #include <torch/extension.h>
8
+
9
+ namespace megablocks {
10
+
11
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
12
+ m.def("exclusive_cumsum", &exclusive_cumsum, "batched exclusive cumsum.");
13
+ m.def("histogram", &histogram, "even width histogram.");
14
+ m.def("inclusive_cumsum", &inclusive_cumsum, "batched inclusive cumsum");
15
+ m.def("indices", &indices, "indices construction for sparse matrix.");
16
+ m.def("replicate_forward", &replicate_forward, "(fwd) replicate a vector dynamically.");
17
+ m.def("replicate_backward", &replicate_backward, "(bwd) replicate a vector dynamically.");
18
+ m.def("sort", &sort, "key/value sort.");
19
+ }
20
+
21
+ } // namespace megablocks
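
The module above exposes the ops through pybind11. A hypothetical usage sketch once the extension is built (the import name megablocks_ops is an assumption, since it depends on TORCH_EXTENSION_NAME at build time; only signatures shown in the headers are used):

import torch
import megablocks_ops as ops  # hypothetical module name

x = torch.randint(0, 8, (4, 16), dtype=torch.int32, device="cuda")

out = torch.empty_like(x)
ops.exclusive_cumsum(x, 1, out)   # batched exclusive cumsum along dim 1

counts = ops.histogram(x, 8)      # even-width histogram with 8 bins per row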
csrc/cuda_util.h ADDED
@@ -0,0 +1,62 @@
1
+ #ifndef BLOCKPARTY_CSRC_CUDA_UTIL_H_
2
+ #define BLOCKPARTY_CSRC_CUDA_UTIL_H_
3
+
4
+ #include <cuda_fp16.h>
5
+ #include <cuda_runtime.h>
6
+ // #include <torch/extension.h>
7
+
8
+ namespace megablocks {
9
+
10
+ typedef __half2 half2;
11
+
12
+ struct __align__(8) half4 {
13
+ half2 x, y;
14
+ };
15
+
16
+ struct __align__(16) half8 {
17
+ half2 x, y, z, w;
18
+ };
19
+
20
+ template <class To, class From>
21
+ __device__ __forceinline__ To BitCast(const From& src) noexcept {
22
+ To dst;
23
+ std::memcpy(&dst, &src, sizeof(To));
24
+ return dst;
25
+ }
26
+
27
+ template <typename T>
28
+ __device__ __forceinline__ void Store(const T& value, T* ptr) {
29
+ *ptr = value;
30
+ }
31
+
32
+ template <typename T>
33
+ __device__ __forceinline__ T Load(const T* address) {
34
+ return __ldg(address);
35
+ }
36
+
37
+ __device__ __forceinline__ half4 Load(const half4* address) {
38
+ float2 x = __ldg(reinterpret_cast<const float2*>(address));
39
+ return BitCast<half4>(x);
40
+ }
41
+
42
+ __device__ __forceinline__ half8 Load(const half8* address) {
43
+ float4 x = __ldg(reinterpret_cast<const float4*>(address));
44
+ return BitCast<half8>(x);
45
+ }
46
+
47
+ template <typename T>
48
+ __device__ __forceinline__ T Zero() { return 0; };
49
+
50
+ template <>
51
+ __device__ __forceinline__ half2 Zero<half2>() {
52
+ return {(c10::Half)0., (c10::Half)0.};
53
+ };
54
+
55
+ template <>
56
+ __device__ __forceinline__ half4 Zero<half4>() {
57
+ return {Zero<half2>(), Zero<half2>()};
58
+ };
59
+
60
+ } // namespace megablocks
61
+
62
+ #endif // BLOCKPARTY_CSRC_CUDA_UTIL_H_
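
The half4/half8 loads above fetch eight packed fp16 values through a single 128-bit float4 read, and BitCast is a byte-for-byte reinterpretation. The same idea, sketched host-side with PyTorch dtype views:

import torch

h = torch.randn(8, dtype=torch.float16)
f = h.view(torch.float32)                     # the same 16 bytes seen as four floats
assert f.numel() == 4
assert torch.equal(f.view(torch.float16), h)  # round-trips exactly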
csrc/cumsum.h ADDED
@@ -0,0 +1,163 @@
1
+ #define CUB_IGNORE_DEPRECATED_API
2
+
3
+ #undef CUB_WRAPPED_NAMESPACE
4
+ #define CUB_WRAPPED_NAMESPACE megablocks
5
+
6
+ #include <cstdint>
7
+
8
+ #include <cub/cub.cuh>
9
+ #include <c10/cuda/CUDAStream.h>
10
+ #include <torch/all.h>
11
+ // #include <torch/extension.h>
12
+
13
+ #define CUDA_CALL(code) \
14
+ do { \
15
+ cudaError_t status = code; \
16
+ std::string err = cudaGetErrorString(status); \
17
+ TORCH_CHECK(status == cudaSuccess, err); \
18
+ } while (0)
19
+
20
+ namespace megablocks {
21
+
22
+ struct Inclusive {};
23
+ struct Exclusive {};
24
+
25
+ template <typename Type> struct Cumsum {
26
+
27
+ template<
28
+ typename InputIteratorT,
29
+ typename OutputIteratorT>
30
+ static void Run(void * d_temp_storage,
31
+ size_t & temp_storage_bytes,
32
+ InputIteratorT d_in,
33
+ OutputIteratorT d_out,
34
+ int num_items,
35
+ cudaStream_t stream = 0,
36
+ bool debug_synchronous = false) {
37
+ CUDA_CALL(cub::DeviceScan::ExclusiveSum(d_temp_storage,
38
+ temp_storage_bytes,
39
+ d_in,
40
+ d_out,
41
+ num_items,
42
+ stream));//,
43
+ //debug_synchronous));
44
+ }
45
+ };
46
+
47
+ template <> struct Cumsum<Inclusive> {
48
+ template<
49
+ typename InputIteratorT,
50
+ typename OutputIteratorT>
51
+ static void Run(void * d_temp_storage,
52
+ size_t & temp_storage_bytes,
53
+ InputIteratorT d_in,
54
+ OutputIteratorT d_out,
55
+ int num_items,
56
+ cudaStream_t stream = 0,
57
+ bool debug_synchronous = false) {
58
+ CUDA_CALL(cub::DeviceScan::InclusiveSum(d_temp_storage,
59
+ temp_storage_bytes,
60
+ d_in,
61
+ d_out,
62
+ num_items,
63
+ stream));//,
64
+ //debug_synchronous));
65
+ }
66
+ };
67
+
68
+ template <typename SumType, typename T>
69
+ void cub_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
70
+ // Get temporary storage size.
71
+ size_t scratchpad_bytes = 0;
72
+ Cumsum<SumType>::Run(nullptr,
73
+ scratchpad_bytes,
74
+ x.data_ptr<T>(),
75
+ out.data_ptr<T>(),
76
+ x.size(1),
77
+ c10::cuda::getCurrentCUDAStream());
78
+
79
+ // Allocate scratchpad.
80
+ //
81
+ // NOTE: We scale for the batch dimension so we can run in parallel.
82
+ auto options = torch::TensorOptions()
83
+ .dtype(torch::kInt8)
84
+ .device(x.device());
85
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes * x.size(0),
86
+ options);
87
+
88
+ // Run the kernel.
89
+ //
90
+ // NOTE: Using different streams for each issue does not appear to
91
+ // yield performance gains for our problem set. The overhead of
92
+ // event/stream synchronization appears to outweigh the benefits.
93
+ // We could write a true batched cumsum, but this would require
94
+ // significant code duplication from cub and we might move away
95
+ // from this formulation anyways.
96
+ for (int i = 0; i < x.size(0); ++i) {
97
+ void* scratchpad_ptr = (int8_t*)scratchpad.data_ptr() + scratchpad_bytes * i;
98
+ Cumsum<SumType>::Run(scratchpad_ptr,
99
+ scratchpad_bytes,
100
+ x.data_ptr<T>() + x.size(1) * i,
101
+ out.data_ptr<T>() + x.size(1) * i,
102
+ x.size(1),
103
+ c10::cuda::getCurrentCUDAStream());
104
+ }
105
+ }
106
+
107
+ void exclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
108
+ // Validate the input matrix.
109
+ TORCH_CHECK(x.is_cuda());
110
+ TORCH_CHECK(x.ndimension() == 2);
111
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
112
+ x.scalar_type() == torch::kInt32 ||
113
+ x.scalar_type() == torch::kInt64);
114
+ TORCH_CHECK(out.is_cuda());
115
+ TORCH_CHECK(out.ndimension() == 2);
116
+ TORCH_CHECK(out.scalar_type() == x.scalar_type());
117
+
118
+ // NOTE: We currently only support contraction across the contiguous
119
+ // dimension in the matrix.
120
+ TORCH_CHECK(dim == 1);
121
+
122
+ switch (x.scalar_type()) {
123
+ case torch::kInt16:
124
+ cub_cumsum<Exclusive, short>(x, dim, out);
125
+ return;
126
+ case torch::kInt32:
127
+ cub_cumsum<Exclusive, int>(x, dim, out);
128
+ return;
129
+ }
130
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
131
+ cub_cumsum<Exclusive, long>(x, dim, out);
132
+ }
133
+
134
+ void inclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
135
+ // Validate the input matrix.
136
+ TORCH_CHECK(x.is_cuda());
137
+ TORCH_CHECK(x.ndimension() == 2);
138
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
139
+ x.scalar_type() == torch::kInt32 ||
140
+ x.scalar_type() == torch::kInt64);
141
+ TORCH_CHECK(out.is_cuda());
142
+ TORCH_CHECK(out.ndimension() == 2);
143
+ TORCH_CHECK(out.scalar_type() == x.scalar_type());
144
+
145
+ // NOTE: We currently only support contraction across the contiguous
146
+ // dimension in the matrix.
147
+ TORCH_CHECK(dim == 1);
148
+
149
+ switch (x.scalar_type()) {
150
+ case torch::kInt16:
151
+ cub_cumsum<Inclusive, short>(x, dim, out);
152
+ return;
153
+ case torch::kInt32:
154
+ cub_cumsum<Inclusive, int>(x, dim, out);
155
+ return;
156
+ }
157
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
158
+ cub_cumsum<Inclusive, long>(x, dim, out);
159
+ }
160
+
161
+ } // namespace megablocks
162
+
163
+ #undef CUB_WRAPPED_NAMESPACE
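
The scans above run row by row over dim 1 of a 2-D integer tensor. Reference semantics in plain PyTorch (illustrative only):

import torch

def inclusive_cumsum_ref(x: torch.Tensor) -> torch.Tensor:
    return torch.cumsum(x, dim=1)

def exclusive_cumsum_ref(x: torch.Tensor) -> torch.Tensor:
    # The exclusive scan is the inclusive scan shifted by one element.
    return torch.cumsum(x, dim=1) - x

x = torch.randint(0, 10, (3, 7), dtype=torch.int64)
assert torch.equal(exclusive_cumsum_ref(x) + x, inclusive_cumsum_ref(x))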
csrc/grouped_gemm/fill_arguments.cuh ADDED
@@ -0,0 +1,141 @@
1
+ #pragma once
2
+
3
+ #include <ATen/cuda/detail/KernelUtils.h>
4
+ #include <cub/cub.cuh>
5
+ #include <cutlass/bfloat16.h>
6
+ #include <cutlass/gemm_coord.h>
7
+
8
+ namespace grouped_gemm {
9
+
10
+ constexpr int kDynamicDim = -1;
11
+ constexpr int kMaxExperts = 512;
12
+
13
+ struct GemmProblem {
14
+ ::cutlass::gemm::GemmCoord dims;
15
+ int64_t lda, ldb, ldc;
16
+ // All offsets are in elements.
17
+ int64_t a_offset, b_offset, c_offset;
18
+ };
19
+
20
+ // TODO: revisit `ExtractGemmProblemK` struct
21
+ // struct ExtractGemmProblemK {
22
+ // __device__ ::cuda::std::tuple<int&> operator()(GemmProblem& problem) const {
23
+ // return {problem.dims.k()};
24
+ // }
25
+ // };
26
+
27
+ template <
28
+ // If `k` is dynamic, we sort the problems by `k` in descending order.
29
+ // Otherwise, `m` is dynamic, and no sorting happens.
30
+ bool kDynamicK,
31
+ typename ElementA, typename ElementB, typename ElementC,
32
+ typename LayoutA, typename LayoutB, typename LayoutC,
33
+ typename Args
34
+ >
35
+ __global__ void FillArguments(
36
+ int num_experts, const int64_t* batch_sizes,
37
+ ElementA* ptr_a, ElementB* ptr_b, ElementC* ptr_c,
38
+ Args args, ::cutlass::gemm::GemmCoord dims
39
+ ) {
40
+ const int expert_idx = threadIdx.x;
41
+ const int batch_size = expert_idx < num_experts ? batch_sizes[expert_idx] : -1;
42
+
43
+ if (kDynamicK) {
44
+ assert(dims.k() == kDynamicDim);
45
+ dims.k() = batch_size;
46
+ } else {
47
+ assert(dims.m() == kDynamicDim);
48
+ dims.m() = batch_size;
49
+ }
50
+
51
+ using BlockScan = cub::BlockScan<int, kMaxExperts>;
52
+ using BlockSort = cub::BlockRadixSort<int, kMaxExperts, 1, GemmProblem>;
53
+
54
+ union SharedMemory {
55
+ typename BlockScan::TempStorage scan_storage;
56
+ typename BlockSort::TempStorage sort_storage;
57
+ };
58
+ __shared__ SharedMemory shared_memory;
59
+
60
+ int dynamic_dim = kDynamicK ? dims.k() : dims.m();
61
+ int dynamic_dim_cumsum;
62
+ BlockScan(shared_memory.scan_storage).ExclusiveSum(dynamic_dim, dynamic_dim_cumsum);
63
+ __syncthreads();
64
+
65
+ // We have to use `GemmProblem[1]` here instead of just `GemmProblem` because `SortDescending()` expects
66
+ // `KeyT (&)[ITEMS_PER_THREAD]` for the `keys` argument (i.e., `GemmProblem (&keys)[1]` in our case).
67
+ GemmProblem problem[1] = {
68
+ GemmProblem {
69
+ .dims = dims,
70
+ .lda = LayoutA::packed({dims.m(), dims.k()}).stride(0),
71
+ .ldb = LayoutB::packed({dims.k(), dims.n()}).stride(0),
72
+ .ldc = LayoutC::packed({dims.m(), dims.n()}).stride(0),
73
+ .a_offset = kDynamicK
74
+ ? (dims.m() * dynamic_dim_cumsum)
75
+ : (dynamic_dim_cumsum * dims.k()),
76
+ .b_offset = (kDynamicK ? dynamic_dim_cumsum : expert_idx * dims.k()) * dims.n(),
77
+ .c_offset = (kDynamicK ? expert_idx * dims.m() : dynamic_dim_cumsum) * dims.n(),
78
+ },
79
+ };
80
+
81
+ if constexpr (kDynamicK) {
82
+ // Sort by k dimension in descending order
83
+ // We need to extract the key (k value) for sorting
84
+ int k_keys[1] = { problem[0].dims.k() };
85
+
86
+ BlockSort(shared_memory.sort_storage).SortDescending(k_keys, problem);
87
+
88
+ // TODO: revisit original impl without `__syncthreads()`
89
+ // BlockSort(shared_memory.sort_storage).SortDescending(problem, ExtractGemmProblemK{});
90
+ // Quoting the CUB documentation (https://nvidia.github.io/cccl/cub/api/classcub_1_1BlockRadixSort.html):
91
+ // > A subsequent __syncthreads() threadblock barrier should be invoked after calling this method if the collective’s temporary storage [...]
92
+ // > is **to be reused or repurposed**.
93
+ // We don't need `__syncthreads()` here, since we don't do either of these things.
94
+ }
95
+
96
+ if (expert_idx < num_experts) {
97
+ args.problem_sizes[expert_idx] = problem[0].dims;
98
+ args.lda[expert_idx] = problem[0].lda;
99
+ args.ldb[expert_idx] = problem[0].ldb;
100
+ args.ldc[expert_idx] = problem[0].ldc;
101
+
102
+ args.ptr_A[expert_idx] = ptr_a + problem[0].a_offset;
103
+ args.ptr_B[expert_idx] = ptr_b + problem[0].b_offset;
104
+ args.ptr_C[expert_idx] = ptr_c + problem[0].c_offset;
105
+ }
106
+ }
107
+
108
+ template <typename Args>
109
+ __global__ void ZeroOutK0Outputs(int num_experts, Args args) {
110
+ const int64_t start_idx = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
111
+ const int64_t delta = (int64_t)gridDim.x * blockDim.x;
112
+ for (int ei = 0; ei < num_experts; ++ei) {
113
+ auto& dims = args.problem_sizes[ei];
114
+ // CUTLASS doesn't handle problems with `k=0` correctly, see https://github.com/NVIDIA/cutlass/pull/1593.
115
+ // Until a fix is available on the CUTLASS side, handle these problems by ourselves:
116
+ // * (here) set the output to zero
117
+ // * (in `IgnoreK0Problems`) make this problem a no-op by setting `m=0` and `n=0` (CUTLASS can handle the outer dimensions being zero)
118
+ if (dims.k() == 0) {
119
+ // Assume packed layout, run a grid-strided loop over the output.
120
+ int64_t total_elems = (int64_t)dims.m() * dims.n();
121
+ auto* out = args.ptr_C[ei];
122
+ for (int64_t idx = start_idx; idx < total_elems; idx += delta) {
123
+ out[idx] = {};
124
+ }
125
+ }
126
+ }
127
+ }
128
+
129
+ template <typename Args>
130
+ __global__ void IgnoreK0Problems(int num_experts, Args args) {
131
+ const int expert_idx = threadIdx.x;
132
+ if (expert_idx < num_experts) {
133
+ auto& dims = args.problem_sizes[expert_idx];
134
+ if (dims.k() == 0) {
135
+ dims.m() = 0;
136
+ dims.n() = 0;
137
+ }
138
+ }
139
+ }
140
+
141
+ } // namespace grouped_gemm
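
FillArguments derives each expert's GemmCoord and element offsets from an exclusive prefix sum of the dynamic dimension, then (for dynamic k) sorts the problems by k in descending order. A host-side sketch of the same offset math, in plain Python; offsets are in elements, and leading dimensions are omitted since they depend on the chosen layouts:

def fill_arguments_ref(batch_sizes, m, n, k, dynamic_k):
    # Mirrors the offset arithmetic in FillArguments above.
    problems, cumsum = [], 0              # exclusive prefix sum of the dynamic dim
    for e, bs in enumerate(batch_sizes):
        M = m if dynamic_k else bs
        K = bs if dynamic_k else k
        N = n
        a_off = M * cumsum if dynamic_k else cumsum * K
        b_off = (cumsum if dynamic_k else e * K) * N
        c_off = (e * M if dynamic_k else cumsum) * N
        problems.append(((M, N, K), a_off, b_off, c_off))
        cumsum += bs
    if dynamic_k:                         # the kernel also sorts by K, descending
        problems.sort(key=lambda p: p[0][2], reverse=True)
    return problems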
csrc/grouped_gemm/grouped_gemm.cu ADDED
@@ -0,0 +1,567 @@
1
+ #include "grouped_gemm.h"
2
+ #include "fill_arguments.cuh"
3
+
4
+ #include <ATen/cuda/CUDAContext.h>
5
+ #include <ATen/cuda/detail/KernelUtils.h>
6
+ #include <c10/util/BFloat16.h>
7
+ #include <c10/cuda/CUDAStream.h>
8
+ #include <cub/cub.cuh>
9
+ #include <torch/torch.h>
10
+
11
+ #include "cutlass/bfloat16.h"
12
+ #include "cutlass/complex.h"
13
+ #include "cutlass/gemm/kernel/gemm_grouped.h"
14
+ #include "cutlass/gemm/kernel/default_gemm_grouped.h"
15
+ #include "cutlass/gemm/device/gemm_grouped.h"
16
+
17
+ #include <type_traits>
18
+
19
+ namespace grouped_gemm {
20
+
21
+ #define CUDA_CALL(code) \
22
+ do { \
23
+ cudaError_t status = code; \
24
+ std::string err = cudaGetErrorString(status); \
25
+ TORCH_CHECK(status == cudaSuccess, err); \
26
+ } while (0)
27
+
28
+ #define CUBLAS_CALL(code) \
29
+ do { \
30
+ cublasStatus_t status = code; \
31
+ TORCH_CHECK(status == CUBLAS_STATUS_SUCCESS, "CuBLAS Error"); \
32
+ } while (0)
33
+
34
+ #define GROUPED_GEMM_STRINGIFY_HELPER(x) #x
35
+ #define GROUPED_GEMM_STRINGIFY(x) \
36
+ GROUPED_GEMM_STRINGIFY_HELPER(x)
37
+
38
+ template <bool trans>
39
+ using GroupedGemmInputLayout = std::conditional_t<trans, ::cutlass::layout::ColumnMajor, ::cutlass::layout::RowMajor>;
40
+
41
+ using GroupedGemmConfig = ::cutlass::gemm::device::DefaultGemmConfiguration<
42
+ ::cutlass::arch::OpClassTensorOp,
43
+ ::cutlass::arch::Sm80,
44
+ ::cutlass::bfloat16_t,
45
+ ::cutlass::bfloat16_t,
46
+ ::cutlass::bfloat16_t,
47
+ float
48
+ >;
49
+
50
+ // TODO(tgale): Update this for SM90 when it's supported by CUTLASS.
51
+ template <bool trans_a, bool trans_b>
52
+ using GroupedGemmKernel = typename cutlass::gemm::kernel::DefaultGemmGrouped<
53
+ // A operand.
54
+ ::cutlass::bfloat16_t,
55
+ GroupedGemmInputLayout<trans_a>,
56
+ ::cutlass::ComplexTransform::kNone,
57
+ GroupedGemmConfig::kAlignmentA,
58
+ // B operand.
59
+ ::cutlass::bfloat16_t,
60
+ GroupedGemmInputLayout<trans_b>,
61
+ ::cutlass::ComplexTransform::kNone,
62
+ GroupedGemmConfig::kAlignmentB,
63
+ // C operand.
64
+ ::cutlass::bfloat16_t,
65
+ ::cutlass::layout::RowMajor,
66
+ float,
67
+ ::cutlass::arch::OpClassTensorOp,
68
+ ::cutlass::arch::Sm80,
69
+ GroupedGemmConfig::ThreadblockShape,
70
+ GroupedGemmConfig::WarpShape,
71
+ GroupedGemmConfig::InstructionShape,
72
+ GroupedGemmConfig::EpilogueOutputOp,
73
+ // NOTE: Threadblock swizzling is currently not supported by CUTLASS's grouped kernels.
74
+ // This parameter is passed in at present to match the APIs of other kernels. The parameter
75
+ // is unused within the kernel.
76
+ ::cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle,
77
+ // TODO(tgale): Tune this for SM90.
78
+ GroupedGemmConfig::kStages>::GemmKernel;
79
+
80
+ template <bool trans_a, bool trans_b>
81
+ using GemmGrouped = ::cutlass::gemm::device::GemmGrouped<GroupedGemmKernel<trans_a, trans_b>>;
82
+
83
+ template <typename T>
84
+ torch::Tensor CopyToDevice(const std::vector<T> &x, const torch::Device &device) {
85
+ size_t bytes = x.size() * sizeof(T);
86
+ auto options = torch::TensorOptions().dtype(torch::kInt8).device(device);
87
+ torch::Tensor out = torch::empty(bytes, options);
88
+
89
+ CUDA_CALL(cudaMemcpyAsync(out.data_ptr(),
90
+ x.data(), bytes,
91
+ cudaMemcpyHostToDevice,
92
+ c10::cuda::getCurrentCUDAStream()));
93
+ return out;
94
+ }
95
+
96
+ template <typename T>
97
+ static void ReorderArray(T* data, const std::vector<size_t>& indices) {
98
+ // For now, simply create a copy of the data and then copy over to the original.
99
+ std::vector<T> copy(data, data + indices.size());
100
+ for (size_t i = 0; i < indices.size(); ++i) {
101
+ data[i] = copy.at(indices[i]);
102
+ }
103
+ }
104
+
105
+ template <typename T>
106
+ torch::Tensor TypedEmpty(size_t numel, const torch::Device& device) {
107
+ return torch::empty(numel * sizeof(T), torch::dtype(torch::kInt8).device(device));
108
+ }
109
+
110
+ struct RawGemmArguments {
111
+ torch::Tensor lda, ldb, ldc, ptr_a, ptr_b, ptr_c, problem_sizes;
112
+ int threadblock_count{};
113
+ };
114
+
115
+ template <
116
+ typename Gemm,
117
+ typename ElementA, typename ElementB, typename ElementC
118
+ >
119
+ RawGemmArguments MakeArgumentsOnDevice(int num_experts, const torch::Device& device) {
120
+ TORCH_CHECK(
121
+ num_experts <= kMaxExperts,
122
+ "At most ", kMaxExperts,
123
+ " experts are supported when batch_sizes is a CUDA tensor, but got ", num_experts
124
+ );
125
+
126
+ return RawGemmArguments {
127
+ .lda = TypedEmpty<int64_t>(num_experts, device),
128
+ .ldb = TypedEmpty<int64_t>(num_experts, device),
129
+ .ldc = TypedEmpty<int64_t>(num_experts, device),
130
+ .ptr_a = TypedEmpty<ElementA*>(num_experts, device),
131
+ .ptr_b = TypedEmpty<ElementB*>(num_experts, device),
132
+ .ptr_c = TypedEmpty<ElementC*>(num_experts, device),
133
+ .problem_sizes = TypedEmpty<cutlass::gemm::GemmCoord>(num_experts, device),
134
+
135
+ // We don't know the problem dimensions on the host, so we just base the number of threadblocks on occupancy here.
136
+ .threadblock_count = Gemm::sufficient(),
137
+ };
138
+ }
139
+
140
+ template <
141
+ bool kDynamicK,
142
+ typename Gemm,
143
+ typename ElementA, typename ElementB, typename ElementC,
144
+ typename LayoutA, typename LayoutB, typename LayoutC
145
+ >
146
+ RawGemmArguments MakeArgumentsOnHost(torch::Tensor a,
147
+ torch::Tensor b,
148
+ torch::Tensor c,
149
+ torch::Tensor batch_sizes,
150
+ ::cutlass::gemm::GemmCoord coord_template,
151
+ int64_t num_experts) {
152
+ std::vector<::cutlass::gemm::GemmCoord> problem_sizes_host(num_experts);
153
+
154
+ // Create the host arrays of leading dimension data and pointer data.
155
+ std::vector<int64_t> lda_host(num_experts), ldb_host(num_experts), ldc_host(num_experts);
156
+ int64_t elements_a = 0, elements_b = 0, elements_c = 0;
157
+
158
+ std::vector<ElementA *> ptr_a_host(num_experts), ptr_b_host(num_experts), ptr_c_host(num_experts);
159
+
160
+ for (int i = 0; i < num_experts; ++i) {
161
+ auto& problem = problem_sizes_host[i];
162
+ problem = coord_template;
163
+ (kDynamicK ? problem.k() : problem.m()) = batch_sizes.data_ptr<int64_t>()[i];
164
+
165
+ lda_host[i] = LayoutA::packed({problem.m(), problem.k()}).stride(0);
166
+ ldb_host[i] = LayoutB::packed({problem.k(), problem.n()}).stride(0);
167
+ ldc_host[i] = LayoutC::packed({problem.m(), problem.n()}).stride(0);
168
+
169
+ ptr_a_host[i] = (ElementA*)a.data_ptr() + elements_a;
170
+ ptr_b_host[i] = (ElementB*)b.data_ptr() + elements_b;
171
+ ptr_c_host[i] = (ElementC*)c.data_ptr() + elements_c;
172
+
173
+ elements_a += problem.m() * problem.k();
174
+ elements_b += problem.k() * problem.n();
175
+ elements_c += problem.m() * problem.n();
176
+
177
+ if (problem.k() == 0) {
178
+ // CUTLASS doesn't handle problems with `k=0` correctly, see https://github.com/NVIDIA/cutlass/pull/1593.
179
+ // Until a fix is available on the CUTLASS side, handle these problems by ourselves:
180
+ // * set the output to zero with `cudaMemsetAsync()`
181
+ // * make this problem a no-op by setting `m=0` and `n=0` (CUTLASS can handle the outer dimensions being zero)
182
+ CUDA_CALL(cudaMemsetAsync(ptr_c_host[i],
183
+ 0,
184
+ problem.m() * problem.n() * sizeof(ElementC),
185
+ c10::cuda::getCurrentCUDAStream()));
186
+
187
+ problem.m() = 0;
188
+ problem.n() = 0;
189
+ }
190
+ }
191
+
192
+ // Only sort problems when the K dimensions differ
193
+ if (kDynamicK) {
194
+ std::vector<size_t> indices(num_experts);
195
+ std::iota(indices.begin(), indices.end(), 0);
196
+ std::stable_sort(indices.begin(), indices.end(), [&problem_sizes_host](size_t i, size_t j) {
197
+ return problem_sizes_host[i].k() > problem_sizes_host[j].k();
198
+ });
199
+
200
+ ReorderArray(problem_sizes_host.data(), indices);
201
+ ReorderArray(lda_host.data(), indices);
202
+ ReorderArray(ldb_host.data(), indices);
203
+ ReorderArray(ldc_host.data(), indices);
204
+ ReorderArray(ptr_a_host.data(), indices);
205
+ ReorderArray(ptr_b_host.data(), indices);
206
+ ReorderArray(ptr_c_host.data(), indices);
207
+ }
208
+
209
+ // Copy the problem sizes, pointers and leading dimension data to the device.
210
+ return RawGemmArguments {
211
+ .lda = CopyToDevice(lda_host, a.device()),
212
+ .ldb = CopyToDevice(ldb_host, a.device()),
213
+ .ldc = CopyToDevice(ldc_host, a.device()),
214
+ .ptr_a = CopyToDevice(ptr_a_host, a.device()),
215
+ .ptr_b = CopyToDevice(ptr_b_host, a.device()),
216
+ .ptr_c = CopyToDevice(ptr_c_host, a.device()),
217
+ .problem_sizes = CopyToDevice(problem_sizes_host, a.device()),
218
+
219
+ // We know the problem dimensions on the host, so we can calculate the number of threadblocks based on that.
220
+ .threadblock_count = Gemm::sufficient(problem_sizes_host.data(), num_experts),
221
+ };
222
+ }
223
+
224
+ template <
225
+ bool kDynamicK,
226
+ typename Gemm,
227
+ typename ElementA, typename ElementB, typename ElementC,
228
+ typename LayoutA, typename LayoutB, typename LayoutC
229
+ >
230
+ typename Gemm::Arguments MakeArguments(torch::Tensor a,
231
+ torch::Tensor b,
232
+ torch::Tensor c,
233
+ torch::Tensor batch_sizes,
234
+ ::cutlass::gemm::GemmCoord coord_template,
235
+ int64_t num_experts) {
236
+ RawGemmArguments raw_args;
237
+ if (batch_sizes.is_cuda()) {
238
+ raw_args = MakeArgumentsOnDevice<
239
+ Gemm, ElementA, ElementB, ElementC
240
+ >(num_experts, a.device());
241
+ } else {
242
+ raw_args = MakeArgumentsOnHost<
243
+ kDynamicK,
244
+ Gemm,
245
+ ElementA, ElementB, ElementC,
246
+ LayoutA, LayoutB, LayoutC
247
+ >(a, b, c, batch_sizes, coord_template, num_experts);
248
+ }
249
+
250
+ printf("Using %d threadblocks for grouped GEMM.\n", raw_args.threadblock_count);
251
+ // Validate the result.
252
+ if (!raw_args.threadblock_count) {
253
+ TORCH_CHECK(false, "Grouped GEMM execution not possible with HW");
254
+ }
255
+
256
+ typename Gemm::EpilogueOutputOp::Params epilogue_op(/*alpha=*/1.0f, /*beta=*/0.0f);
257
+ // We currently always use `GroupScheduleMode::kDeviceOnly`, which doesn't use `host_problem_sizes` at all,
258
+ // so we can safely pass `nullptr` for `host_problem_sizes`.
259
+ // TODO(tgale): Experiment with `GroupScheduleMode::kHostPrecompute` for `batch_sizes.is_cpu()`, where we
260
+ // know the problem dimensions on the host.
261
+ typename Gemm::Arguments arguments((cutlass::gemm::GemmCoord*)raw_args.problem_sizes.data_ptr(),
262
+ (int)num_experts,
263
+ (int)raw_args.threadblock_count,
264
+ epilogue_op,
265
+ (ElementA**)raw_args.ptr_a.data_ptr(),
266
+ (ElementB**)raw_args.ptr_b.data_ptr(),
267
+ (ElementC**)raw_args.ptr_c.data_ptr(),
268
+ (ElementC**)raw_args.ptr_c.data_ptr(),
269
+ /*lda=*/(int64_t*)raw_args.lda.data_ptr(),
270
+ /*ldb=*/(int64_t*)raw_args.ldb.data_ptr(),
271
+ /*ldc=*/(int64_t*)raw_args.ldc.data_ptr(),
272
+ /*ldd=*/(int64_t*)raw_args.ldc.data_ptr(),
273
+ /*host_problem_sizes=*/nullptr);
274
+ return arguments;
275
+ }
276
+
277
+ template <
278
+ bool trans_a,
279
+ typename ElementA, typename ElementB, typename ElementC,
280
+ typename LayoutA, typename LayoutB, typename LayoutC,
281
+ typename Arguments
282
+ >
283
+ void FillCutlassArguments(int num_experts,
284
+ torch::Tensor batch_sizes,
285
+ torch::Tensor a,
286
+ torch::Tensor b,
287
+ torch::Tensor c,
288
+ const Arguments& arguments,
289
+ ::cutlass::gemm::GemmCoord coord_template) {
290
+ // Convert the batch sizes to the format CUTLASS understands on the device.
291
+ // Use a single block here because:
292
+ // * the number of elements to process is microscopically small
293
+ // * we don't need any additional global memory
294
+ FillArguments<
295
+ /*kDynamicK*/trans_a,
296
+ ElementA, ElementB, ElementC,
297
+ LayoutA, LayoutB, LayoutC
298
+ ><<<1, kMaxExperts, 0, c10::cuda::getCurrentCUDAStream()>>>(
299
+ num_experts, batch_sizes.data_ptr<int64_t>(),
300
+ (ElementA*)a.data_ptr(), (ElementB*)b.data_ptr(), (ElementC*)c.data_ptr(),
301
+ arguments, coord_template
302
+ );
303
+ C10_CUDA_KERNEL_LAUNCH_CHECK();
304
+ }
305
+
306
+ template <typename Args>
307
+ void RemoveK0Problems(int num_experts, const Args& arguments) {
308
+ // For zeroing out the outputs (which might be arbitrarily large), we want to use
309
+ // as many threadblocks as possible in order to hit the maximum possible global memory bandwidth.
310
+ // `arguments.threadblock_count`, which we will use for the grouped GEMM proper,
311
+ // should be a good approximation for this.
312
+ // When the `k=0` case is fixed in CUTLASS, we can completely remove this function.
313
+ ZeroOutK0Outputs<><<<
314
+ arguments.threadblock_count, at::cuda::detail::CUDA_NUM_THREADS, 0, c10::cuda::getCurrentCUDAStream()
315
+ >>>(
316
+ num_experts, arguments
317
+ );
318
+ IgnoreK0Problems<><<<
319
+ 1, kMaxExperts, 0, c10::cuda::getCurrentCUDAStream()
320
+ >>>(
321
+ num_experts, arguments
322
+ );
323
+ }
324
+
325
+ template <bool trans_a, bool trans_b>
326
+ torch::Tensor CutlassGroupedGemm(torch::Tensor a,
327
+ torch::Tensor b,
328
+ torch::Tensor c,
329
+ torch::Tensor batch_sizes,
330
+ ::cutlass::gemm::GemmCoord coord_template) {
331
+ using Gemm = GemmGrouped<trans_a, trans_b>;
332
+ using LayoutA = typename Gemm::LayoutA;
333
+ using LayoutB = typename Gemm::LayoutB;
334
+ using LayoutC = typename Gemm::LayoutC;
335
+
336
+ using ElementA = typename Gemm::ElementA;
337
+ using ElementB = typename Gemm::ElementB;
338
+ using ElementC = typename Gemm::ElementC;
339
+
340
+ Gemm gemm;
341
+ int64_t num_experts = batch_sizes.size(0);
342
+ auto arguments = MakeArguments<
343
+ /*kDynamicK*/trans_a,
344
+ Gemm,
345
+ ElementA, ElementB, ElementC,
346
+ LayoutA, LayoutB, LayoutC
347
+ >(a, b, c, batch_sizes, coord_template, num_experts);
348
+ int64_t workspace_size = gemm.get_workspace_size(arguments);
349
+ auto options = torch::TensorOptions().dtype(torch::kInt8).device(a.device());
350
+ torch::Tensor workspace = torch::empty(workspace_size, options);
351
+
352
+ if (batch_sizes.is_cuda()) {
353
+ FillCutlassArguments<
354
+ trans_a,
355
+ ElementA, ElementB, ElementC,
356
+ LayoutA, LayoutB, LayoutC
357
+ >(num_experts, batch_sizes, a, b, c, arguments, coord_template);
358
+
359
+ RemoveK0Problems<>(num_experts, arguments);
360
+ }
361
+
362
+ // Initialize the kernel.
363
+ if(gemm.initialize(arguments, workspace.data_ptr()) != cutlass::Status::kSuccess) {
364
+ TORCH_CHECK(false, "Failed to initialize CUTLASS Grouped GEMM");
365
+ }
366
+
367
+ // Execute the kernel in the current stream.
368
+ if(gemm.run(c10::cuda::getCurrentCUDAStream()) != cutlass::Status::kSuccess) {
369
+ TORCH_CHECK(false, "Failed to run CUTLASS Grouped GEMM");
370
+ }
371
+ return c;
372
+ }
373
+
374
+ void CublasGemm(c10::BFloat16 *a, int64_t a_rows, int64_t a_cols, bool trans_a,
375
+ c10::BFloat16 *b, int64_t b_rows, int64_t b_cols, bool trans_b,
376
+ c10::BFloat16 *c, int64_t c_rows, int64_t c_cols) {
377
+ int m = trans_b ? b_rows : b_cols;
378
+ int k = trans_b ? b_cols : b_rows;
379
+ int n = trans_a ? a_cols : a_rows;
380
+
381
+ int lda = trans_a ? n : k;
382
+ int ldb = trans_b ? k : m;
383
+ cublasOperation_t transpose_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
384
+ cublasOperation_t transpose_b = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
385
+
386
+ float alpha = 1.0, beta = 0.0;
387
+ CUBLAS_CALL(cublasGemmEx(at::cuda::getCurrentCUDABlasHandle(),
388
+ transpose_b, transpose_a,
389
+ m, n, k, &alpha,
390
+ b, CUDA_R_16BF, ldb,
391
+ a, CUDA_R_16BF, lda,
392
+ &beta,
393
+ c, CUDA_R_16BF, c_cols, CUDA_R_32F,
394
+ CUBLAS_GEMM_DEFAULT));
395
+ }
396
+
397
+ void CublasGroupedGemm(torch::Tensor a,
398
+ torch::Tensor b,
399
+ torch::Tensor c,
400
+ torch::Tensor batch_sizes,
401
+ bool trans_b) {
402
+ int64_t bs = batch_sizes.size(0), k = a.size(1);
403
+ int64_t n = trans_b ? b.size(1) : b.size(2);
404
+ int64_t b_rows = b.size(1), b_cols = b.size(2);
405
+ c10::BFloat16* a_ptr = a.data_ptr<c10::BFloat16>();
406
+ c10::BFloat16* b_ptr = b.data_ptr<c10::BFloat16>();
407
+ c10::BFloat16* c_ptr = c.data_ptr<c10::BFloat16>();
408
+ for (int i = 0; i < bs; ++i) {
409
+ int64_t m = batch_sizes.data_ptr<int64_t>()[i];
410
+ CublasGemm(a_ptr, m, k, /*trans_a=*/false,
411
+ b_ptr, b_rows, b_cols, trans_b,
412
+ c_ptr, m, n);
413
+ a_ptr += m * k;
414
+ b_ptr += b_rows * b_cols;
415
+ c_ptr += m * n;
416
+ }
417
+ }
418
+
419
+ void CublasGroupedGemmVariableK(torch::Tensor a,
420
+ torch::Tensor b,
421
+ torch::Tensor c,
422
+ torch::Tensor batch_sizes) {
423
+ int64_t bs = batch_sizes.size(0), m = a.size(1), n = b.size(1);
424
+ c10::BFloat16* a_ptr = a.data_ptr<c10::BFloat16>();
425
+ c10::BFloat16* b_ptr = b.data_ptr<c10::BFloat16>();
426
+ c10::BFloat16* c_ptr = c.data_ptr<c10::BFloat16>();
427
+ for (int i = 0; i < bs; ++i) {
428
+ int64_t k = batch_sizes.data_ptr<int64_t>()[i];
429
+ CublasGemm(a_ptr, k, m, /*trans_a=*/true,
430
+ b_ptr, k, n, /*trans_b=*/false,
431
+ c_ptr, m, n);
432
+ a_ptr += k * m;
433
+ b_ptr += k * n;
434
+ c_ptr += m * n;
435
+ }
436
+ }
437
+
438
+ void GroupedGemmVariableK(torch::Tensor a,
439
+ torch::Tensor b,
440
+ torch::Tensor c,
441
+ torch::Tensor batch_sizes) {
442
+ // We expect a CUDA tensor with two dimensions and shape
443
+ // (tokens, hidden_out) for 'b'.
444
+ TORCH_CHECK(b.is_cuda());
445
+ TORCH_CHECK(b.ndimension() == 2);
446
+ TORCH_CHECK(b.scalar_type() == torch::kBFloat16);
447
+
448
+ // Validate the dimensions.
449
+ int64_t tokens = a.size(0), num_experts = batch_sizes.size(0);
450
+ int64_t m = a.size(1), n = b.size(1);
451
+
452
+ // Validate that we have the same contraction dimension.
453
+ TORCH_CHECK(tokens == b.size(0));
454
+
455
+ // Validate the output shape.
456
+ TORCH_CHECK(c.is_cuda());
457
+ TORCH_CHECK(c.ndimension() == 3);
458
+ TORCH_CHECK(c.scalar_type() == torch::kBFloat16);
459
+ TORCH_CHECK(c.size(0) == num_experts);
460
+ TORCH_CHECK(c.size(1) == m);
461
+ TORCH_CHECK(c.size(2) == n);
462
+
463
+ // Run the computation.
464
+ CublasGroupedGemmVariableK(a, b, c, batch_sizes);
465
+ }
466
+
467
+ // NOTE: We only support dynamic group sizes for the 'a' tensor. Tensor 'b' is
468
+ // assumed to be batched with fixed sized batches.
469
+ //
470
+ // TODO(tgale): Validate alignment is true for every batch element.
471
+ void GroupedGemm(torch::Tensor a,
472
+ torch::Tensor b,
473
+ torch::Tensor c,
474
+ torch::Tensor batch_sizes,
475
+ bool trans_a, bool trans_b) {
476
+ // NOTE: We only support 'trans_a' or 'trans_b', not both.
477
+ TORCH_CHECK(!(trans_a && trans_b));
478
+
479
+ #if !defined(GROUPED_GEMM_CUTLASS)
480
+ // No way to run cuBLAS kernels if the problem dimensions are not known on the host.
481
+ TORCH_CHECK(batch_sizes.is_cpu());
482
+ #else
483
+ // CUTLASS can handle both CPU- and CUDA-resident problem dimensions.
484
+ TORCH_CHECK(batch_sizes.is_cuda() || batch_sizes.is_cpu());
485
+ #endif
486
+ TORCH_CHECK(batch_sizes.ndimension() == 1);
487
+ TORCH_CHECK(batch_sizes.scalar_type() == torch::kInt64);
488
+
489
+ // We expect a CUDA tensor with two dimensions and shape
490
+ // (tokens, hidden_in) for 'a'.
491
+ TORCH_CHECK(a.is_cuda());
492
+ TORCH_CHECK(a.ndimension() == 2);
493
+ TORCH_CHECK(a.scalar_type() == torch::kBFloat16);
494
+
495
+ #if !defined(GROUPED_GEMM_CUTLASS)
496
+ if (trans_a) {
497
+ // If we can't use CUTLASS for the transposed cases, defer to the variable 'k' helper using cuBLAS
498
+ // for the rest of the op.
499
+ GroupedGemmVariableK(a, b, c, batch_sizes);
500
+ return;
501
+ }
502
+ #endif
503
+
504
+ TORCH_CHECK(b.is_cuda());
505
+ TORCH_CHECK(c.is_cuda());
506
+ TORCH_CHECK(b.scalar_type() == torch::kBFloat16);
507
+ TORCH_CHECK(c.scalar_type() == torch::kBFloat16);
508
+
509
+ // The expected shapes of 'b' and 'c' are:
510
+ // * when 'trans_a' is set: b=(tokens, hidden_out), c=(num_experts, hidden_in, hidden_out)
511
+ // * when 'trans_b' is set: b=(num_experts, hidden_out, hidden_in), c=(tokens, hidden_out)
512
+ // * otherwise: b=(num_experts, hidden_in, hidden_out), c=(tokens, hidden_out)
513
+ size_t hidden_in{}, hidden_out{};
514
+ if (trans_a) {
515
+ hidden_in = a.size(1);
516
+ hidden_out = b.size(1);
517
+
518
+ TORCH_CHECK(b.ndimension() == 2);
519
+ TORCH_CHECK(c.ndimension() == 3);
520
+ TORCH_CHECK(b.size(0) == a.size(0));
521
+ TORCH_CHECK(c.size(0) == batch_sizes.size(0));
522
+ TORCH_CHECK(c.size(1) == hidden_in);
523
+ TORCH_CHECK(c.size(2) == hidden_out);
524
+ } else {
525
+ TORCH_CHECK(b.ndimension() == 3);
526
+ TORCH_CHECK(c.ndimension() == 2);
527
+
528
+ // Validate the contraction dimensions match.
529
+ int64_t tokens = a.size(0), num_experts = b.size(0);
530
+ hidden_in = trans_b ? b.size(2) : b.size(1);
531
+ hidden_out = trans_b ? b.size(1) : b.size(2);
532
+ TORCH_CHECK(hidden_in == a.size(1));
533
+
534
+ // Validate that we have one size per expert.
535
+ TORCH_CHECK(batch_sizes.size(0) == num_experts);
536
+ }
537
+
538
+ // NOTE: We support transposition through the 'trans_b' flag.
539
+ TORCH_CHECK(a.is_contiguous());
540
+ TORCH_CHECK(b.is_contiguous());
541
+ TORCH_CHECK(c.is_contiguous());
542
+
543
+ #if !defined(GROUPED_GEMM_CUTLASS)
544
+ CublasGroupedGemm(a, b, c, batch_sizes, trans_b);
545
+ return;
546
+ #else
547
+ // The `coord_template` argument contains `kDynamicDim` as one of its dimensions
548
+ // as a placeholder. This placeholder is later expanded into the actual dimension
549
+ // for every element of the batch, either on the host or on the device
550
+ // (if we can't do it on the host).
551
+ const auto coord_template = trans_a
552
+ ? cutlass::gemm::GemmCoord(hidden_in, hidden_out, kDynamicDim)
553
+ : cutlass::gemm::GemmCoord(kDynamicDim, hidden_out, hidden_in);
554
+ if (trans_a) {
555
+ CutlassGroupedGemm<true, false>(a, b, c, batch_sizes, coord_template);
556
+ return;
557
+ }
558
+ if (trans_b) {
559
+ CutlassGroupedGemm<false, true>(a, b, c, batch_sizes, coord_template);
560
+ return;
561
+ }
562
+ CutlassGroupedGemm<false, false>(a, b, c, batch_sizes, coord_template);
563
+ return;
564
+ #endif
565
+ }
566
+
567
+ } // namespace grouped_gemm
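
GroupedGemm above dispatches to CUTLASS or cuBLAS, but its semantics are a per-expert loop of ordinary matmuls. A reference sketch in plain PyTorch (illustrative only; 'a' is (tokens, hidden_in), batch_sizes is a 1-D int64 tensor, and at most one of trans_a/trans_b may be set):

import torch

def grouped_gemm_ref(a, b, batch_sizes, trans_a=False, trans_b=False):
    assert not (trans_a and trans_b)
    sizes = batch_sizes.tolist()
    if trans_a:
        # Variable-k case: c[e] = a_e^T @ b_e over the e-th row group of 'a' and 'b'.
        out, start = [], 0
        for k in sizes:
            out.append(a[start:start + k].t() @ b[start:start + k])
            start += k
        return torch.stack(out)           # (num_experts, hidden_in, hidden_out)
    # Fixed-size expert batches in 'b': c is (tokens, hidden_out).
    out, start = [], 0
    for e, m in enumerate(sizes):
        w = b[e].t() if trans_b else b[e]
        out.append(a[start:start + m] @ w)
        start += m
    return torch.cat(out)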
csrc/grouped_gemm/grouped_gemm.h ADDED
@@ -0,0 +1,20 @@
1
+ #pragma once
2
+
3
+ // // Set default if not already defined
4
+ // #ifndef GROUPED_GEMM_CUTLASS
5
+ // #define GROUPED_GEMM_CUTLASS 0
6
+ // #endif
7
+
8
+ // #include <torch/extension.h>
9
+ #include <torch/torch.h>
10
+
11
+ namespace grouped_gemm {
12
+
13
+ void GroupedGemm(torch::Tensor a,
14
+ torch::Tensor b,
15
+ torch::Tensor c,
16
+ torch::Tensor batch_sizes,
17
+ bool trans_a, bool trans_b);
18
+
19
+ } // namespace grouped_gemm
20
+
csrc/grouped_gemm/ops.cu ADDED
@@ -0,0 +1,11 @@
1
+ #include "grouped_gemm.h"
2
+
3
+ #include <torch/extension.h>
4
+
5
+ namespace grouped_gemm {
6
+
7
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
8
+ m.def("gmm", &GroupedGemm, "Grouped GEMM.");
9
+ }
10
+
11
+ } // namespace grouped_gemm
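
A hypothetical call into the "gmm" binding registered above (the import name grouped_gemm_ops is an assumption; the exported name and argument order follow the m.def call and GroupedGemm's signature):

import torch
import grouped_gemm_ops as gg  # hypothetical module name

e, tokens, hidden_in, hidden_out = 4, 1024, 512, 256
batch_sizes = torch.full((e,), tokens // e, dtype=torch.int64)  # CPU tensor
a = torch.randn(tokens, hidden_in, dtype=torch.bfloat16, device="cuda")
b = torch.randn(e, hidden_in, hidden_out, dtype=torch.bfloat16, device="cuda")
c = torch.empty(tokens, hidden_out, dtype=torch.bfloat16, device="cuda")

gg.gmm(a, b, c, batch_sizes, False, False)  # trans_a=False, trans_b=False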
csrc/histogram.h ADDED
@@ -0,0 +1,86 @@
1
+ #undef CUB_WRAPPED_NAMESPACE
2
+ #define CUB_WRAPPED_NAMESPACE megablocks
3
+
4
+ #include <cstdint>
5
+
6
+ #include <cub/cub.cuh>
7
+ #include <c10/cuda/CUDAStream.h>
8
+ // #include <torch/extension.h>
9
+
10
+ #define CUDA_CALL(code) \
11
+ do { \
12
+ cudaError_t status = code; \
13
+ std::string err = cudaGetErrorString(status); \
14
+ TORCH_CHECK(status == cudaSuccess, err); \
15
+ } while (0)
16
+
17
+ namespace megablocks {
18
+
19
+ template <typename T>
20
+ torch::Tensor cub_histogram(torch::Tensor x, int num_bins) {
21
+ // Allocate the count buffer.
22
+ auto options = torch::TensorOptions()
23
+ .dtype(torch::kInt32)
24
+ .device(x.device());
25
+ torch::Tensor out = torch::empty({x.size(0), num_bins}, options);
26
+
27
+ // Exit early if there is no work to do.
28
+ if (out.numel() == 0) return out;
29
+
30
+ // Get scratchpad size.
31
+ size_t scratchpad_bytes = 0;
32
+ CUDA_CALL(cub::DeviceHistogram::HistogramEven(nullptr,
33
+ scratchpad_bytes,
34
+ x.data_ptr<T>(),
35
+ out.data_ptr<int>(),
36
+ /*num_levels=*/num_bins + 1,
37
+ /*lower_level=*/0,
38
+ /*upper_level=*/num_bins,
39
+ /*num_samples=*/int(x.size(1)),
40
+ c10::cuda::getCurrentCUDAStream()));
41
+
42
+ // Allocate scratchpad.
43
+ options = torch::TensorOptions().dtype(torch::kInt8).device(x.device());
44
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes, options);
45
+
46
+ // Run the kernel.
47
+ for (int i = 0; i < x.size(0); ++i) {
48
+ CUDA_CALL(cub::DeviceHistogram::HistogramEven(scratchpad.data_ptr(),
49
+ scratchpad_bytes,
50
+ x.data_ptr<T>() + x.size(1) * i,
51
+ out.data_ptr<int>() + out.size(1) * i,
52
+ /*num_levels=*/num_bins + 1,
53
+ /*lower_level=*/0,
54
+ /*upper_level=*/num_bins,
55
+ /*num_samples=*/int(x.size(1)),
56
+ c10::cuda::getCurrentCUDAStream()));
57
+ }
58
+ return out;
59
+ }
60
+
61
+ torch::Tensor histogram(torch::Tensor x, int num_bins) {
62
+ TORCH_CHECK(x.is_cuda());
63
+ TORCH_CHECK(x.ndimension() == 1 || x.ndimension() == 2);
64
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
65
+ x.scalar_type() == torch::kInt32 ||
66
+ x.scalar_type() == torch::kInt64);
67
+ bool no_batch = x.ndimension() == 1;
68
+ if (no_batch) x = x.view({1, x.numel()});
69
+
70
+ if (x.scalar_type() == torch::kInt16) {
71
+ auto out = cub_histogram<short>(x, num_bins);
72
+ return no_batch ? out.flatten() : out;
73
+ } else if (x.scalar_type() == torch::kInt32) {
74
+ auto out = cub_histogram<int>(x, num_bins);
75
+ return no_batch ? out.flatten() : out;
76
+ } else {
77
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
78
+ auto out = cub_histogram<long>(x, num_bins);
79
+ return no_batch ? out.flatten() : out;
80
+ }
81
+ }
82
+
83
+ } // namespace megablocks
84
+
85
+ #undef CUDA_CALL
86
+ #undef CUB_WRAPPED_NAMESPACE
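
The op above computes a per-row even-width histogram over [0, num_bins) with unit-width bins; values outside that range are ignored. Reference semantics in plain PyTorch:

import torch

def histogram_ref(x: torch.Tensor, num_bins: int) -> torch.Tensor:
    no_batch = x.ndim == 1
    rows = x.view(1, -1) if no_batch else x
    out = torch.stack([
        torch.bincount(r[(r >= 0) & (r < num_bins)].long(), minlength=num_bins)
        for r in rows
    ]).to(torch.int32)
    return out.flatten() if no_batch else out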
csrc/indices.h ADDED
@@ -0,0 +1,95 @@
1
+ #include <cstdint>
2
+ #include <c10/util/Half.h>
3
+ // #include <torch/extension.h>
4
+ #include <c10/cuda/CUDAStream.h>
5
+
6
+ #define CUDA_CALL(code) \
7
+ do { \
8
+ cudaError_t status = code; \
9
+ std::string err = cudaGetErrorString(status); \
10
+ TORCH_CHECK(status == cudaSuccess, err); \
11
+ } while (0)
12
+
13
+ namespace megablocks {
14
+ namespace construct_indices {
15
+
16
+ // We expect the number of outputs per block to be small. For
17
+ // example, with ffn_hidden_size=4096, we only need to write
18
+ // 32 elements per block per iteration.
19
+ const int kThreadsPerBlock = 32;
20
+
21
+ __global__ void __launch_bounds__(kThreadsPerBlock)
22
+ ConstructIndicesKernel(short * __restrict__ indices,
23
+ int num_columns,
24
+ int block_size,
25
+ const int * __restrict__ padded_bins) {
26
+ // Load the offset for this bin's indices.
27
+ int start = 0;
28
+ if (blockIdx.x > 0) start = __ldg(padded_bins + blockIdx.x - 1);
29
+ int end = __ldg(padded_bins + blockIdx.x);
30
+
31
+ // Divide the start and end into blocks.
32
+ start /= block_size;
33
+ end /= block_size;
34
+
35
+ // Offset the output buffer to the start of the bin.
36
+ indices += (start + blockIdx.y) * num_columns + threadIdx.x;
37
+
38
+ // Write the indices to the output.
39
+ int bin_offset = blockIdx.y;
40
+ int num_rows = end - start;
41
+ for (; bin_offset < num_rows; num_rows -= gridDim.y) {
42
+ short *out = indices;
43
+ for (int bid = threadIdx.x; bid < num_columns; bid += kThreadsPerBlock) {
44
+ *out = bid + (blockIdx.x * num_columns);
45
+ out += kThreadsPerBlock;
46
+ }
47
+ indices += gridDim.y * num_columns;
48
+ }
49
+ }
50
+
51
+ cudaError_t ConstructIndices(short * __restrict__ indices,
52
+ int output_block_rows,
53
+ int output_block_columns,
54
+ int block_size,
55
+ const int * __restrict__ padded_bins,
56
+ int num_bins,
57
+ cudaStream_t stream) {
58
+ dim3 block_dim(kThreadsPerBlock);
59
+ dim3 grid_dim(num_bins, (int)std::ceil((float)output_block_rows / num_bins));
60
+ ConstructIndicesKernel<<<grid_dim, block_dim, 0, stream>>>(indices,
61
+ output_block_columns,
62
+ block_size,
63
+ padded_bins);
64
+ return cudaGetLastError();
65
+ }
66
+
67
+ } // namespace construct_indices
68
+
69
+ void indices(torch::Tensor padded_bins,
70
+ int block_size,
71
+ int output_block_rows,
72
+ int output_block_columns,
73
+ torch::Tensor out) {
74
+ TORCH_CHECK(padded_bins.is_cuda());
75
+ TORCH_CHECK(padded_bins.ndimension() == 1);
76
+ TORCH_CHECK(padded_bins.scalar_type() == torch::kInt);
77
+
78
+ TORCH_CHECK(out.is_cuda());
79
+ TORCH_CHECK(out.ndimension() == 1);
80
+ TORCH_CHECK(out.scalar_type() == torch::kInt16);
81
+ TORCH_CHECK(out.numel() == (output_block_rows * output_block_columns));
82
+
83
+ // Exit early if there is no work to do.
84
+ if (out.numel() == 0) return;
85
+
86
+ CUDA_CALL(construct_indices::ConstructIndices(out.data_ptr<short>(),
87
+ output_block_rows,
88
+ output_block_columns,
89
+ block_size,
90
+ padded_bins.data_ptr<int>(),
91
+ padded_bins.numel(),
92
+ c10::cuda::getCurrentCUDAStream()));
93
+ }
94
+
95
+ } // namespace megablocks
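
ConstructIndices fills, for every bin, the block rows covered by that bin with that bin's block-column ids. A reference sketch in plain PyTorch (illustrative only; padded_bins is the inclusive cumulative bin boundary array, as in the kernel):

import torch

def indices_ref(padded_bins, block_size, output_block_rows, output_block_columns):
    out = torch.zeros(output_block_rows * output_block_columns, dtype=torch.int16)
    start = 0
    for b, end in enumerate(padded_bins.tolist()):
        row_start, row_end = start // block_size, end // block_size
        cols = torch.arange(b * output_block_columns,
                            (b + 1) * output_block_columns, dtype=torch.int16)
        for r in range(row_start, row_end):
            out[r * output_block_columns:(r + 1) * output_block_columns] = cols
        start = end
    return out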
csrc/new_cumsum.cu ADDED
@@ -0,0 +1,161 @@
1
+ #define CUB_IGNORE_DEPRECATED_API
2
+
3
+ #undef CUB_WRAPPED_NAMESPACE
4
+ #define CUB_WRAPPED_NAMESPACE megablocks
5
+
6
+ #include "new_cumsum.h"
7
+ #include <cstdint>
8
+ #include <hipcub/hipcub.hpp>
9
+ #include <c10/cuda/CUDAStream.h>
10
+
11
+ #define CUDA_CALL(code) \
12
+ do { \
13
+ cudaError_t status = code; \
14
+ std::string err = cudaGetErrorString(status); \
15
+ TORCH_CHECK(status == cudaSuccess, err); \
16
+ } while (0)
17
+
18
+ namespace megablocks {
19
+
20
+ struct Inclusive {};
21
+ struct Exclusive {};
22
+
23
+ template <typename Type> struct Cumsum {
24
+
25
+ template<
26
+ typename InputIteratorT,
27
+ typename OutputIteratorT>
28
+ static void Run(void * d_temp_storage,
29
+ size_t & temp_storage_bytes,
30
+ InputIteratorT d_in,
31
+ OutputIteratorT d_out,
32
+ int num_items,
33
+ cudaStream_t stream = 0,
34
+ bool debug_synchronous = false) {
35
+ CUDA_CALL(hipcub::DeviceScan::ExclusiveSum(d_temp_storage,
36
+ temp_storage_bytes,
37
+ d_in,
38
+ d_out,
39
+ num_items,
40
+ stream));//,
41
+ //debug_synchronous));
42
+ }
43
+ };
44
+
45
+ template <> struct Cumsum<Inclusive> {
46
+ template<
47
+ typename InputIteratorT,
48
+ typename OutputIteratorT>
49
+ static void Run(void * d_temp_storage,
50
+ size_t & temp_storage_bytes,
51
+ InputIteratorT d_in,
52
+ OutputIteratorT d_out,
53
+ int num_items,
54
+ cudaStream_t stream = 0,
55
+ bool debug_synchronous = false) {
56
+ CUDA_CALL(hipcub::DeviceScan::InclusiveSum(d_temp_storage,
57
+ temp_storage_bytes,
58
+ d_in,
59
+ d_out,
60
+ num_items,
61
+ stream));//,
62
+ //debug_synchronous));
63
+ }
64
+ };
65
+
66
+ template <typename SumType, typename T>
67
+ void cub_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
68
+ // Get temporary storage size.
69
+ size_t scratchpad_bytes = 0;
70
+ Cumsum<SumType>::Run(nullptr,
71
+ scratchpad_bytes,
72
+ x.data_ptr<T>(),
73
+ out.data_ptr<T>(),
74
+ x.size(1),
75
+ c10::cuda::getCurrentCUDAStream());
76
+
77
+ // Allocate scratchpad.
78
+ //
79
+ // NOTE: We scale for the batch dimension so we can run in parallel.
80
+ auto options = torch::TensorOptions()
81
+ .dtype(torch::kInt8)
82
+ .device(x.device());
83
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes * x.size(0),
84
+ options);
85
+
86
+ // Run the kernel.
87
+ //
88
+ // NOTE: Using different streams for each issue does not appear to
89
+ // yield performance gains for our problem set. The overhead of
90
+ // event/stream synchronization appears to outweigh the benefits.
91
+ // We could write a true batched cumsum, but this would require
92
+ // significant code duplication from cub and we might move away
93
+ // from this formulation anyways.
94
+ for (int i = 0; i < x.size(0); ++i) {
95
+ void* scratchpad_ptr = (int8_t*)scratchpad.data_ptr() + scratchpad_bytes * i;
96
+ Cumsum<SumType>::Run(scratchpad_ptr,
97
+ scratchpad_bytes,
98
+ x.data_ptr<T>() + x.size(1) * i,
99
+ out.data_ptr<T>() + x.size(1) * i,
100
+ x.size(1),
101
+ c10::cuda::getCurrentCUDAStream());
102
+ }
103
+ }
104
+
105
+ void exclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
106
+ // Validate the input matrix.
107
+ TORCH_CHECK(x.is_cuda());
108
+ TORCH_CHECK(x.ndimension() == 2);
109
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
110
+ x.scalar_type() == torch::kInt32 ||
111
+ x.scalar_type() == torch::kInt64);
112
+ TORCH_CHECK(out.is_cuda());
113
+ TORCH_CHECK(out.ndimension() == 2);
114
+ TORCH_CHECK(out.scalar_type() == x.scalar_type());
115
+
116
+ // NOTE: We currently only support contraction across the contiguous
117
+ // dimension in the matrix.
118
+ TORCH_CHECK(dim == 1);
119
+
120
+ switch (x.scalar_type()) {
121
+ case torch::kInt16:
122
+ cub_cumsum<Exclusive, short>(x, dim, out);
123
+ return;
124
+ case torch::kInt32:
125
+ cub_cumsum<Exclusive, int>(x, dim, out);
126
+ return;
127
+ }
128
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
129
+ cub_cumsum<Exclusive, long>(x, dim, out);
130
+ }
131
+
132
+ void inclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out) {
133
+ // Validate the input matrix.
134
+ TORCH_CHECK(x.is_cuda());
135
+ TORCH_CHECK(x.ndimension() == 2);
136
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
137
+ x.scalar_type() == torch::kInt32 ||
138
+ x.scalar_type() == torch::kInt64);
139
+ TORCH_CHECK(out.is_cuda());
140
+ TORCH_CHECK(out.ndimension() == 2);
141
+ TORCH_CHECK(out.scalar_type() == x.scalar_type());
142
+
143
+ // NOTE: We currently only support contraction across the contiguous
144
+ // dimension in the matrix.
145
+ TORCH_CHECK(dim == 1);
146
+
147
+ switch (x.scalar_type()) {
148
+ case torch::kInt16:
149
+ cub_cumsum<Inclusive, short>(x, dim, out);
150
+ return;
151
+ case torch::kInt32:
152
+ cub_cumsum<Inclusive, int>(x, dim, out);
153
+ return;
154
+ }
155
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
156
+ cub_cumsum<Inclusive, long>(x, dim, out);
157
+ }
158
+
159
+ } // namespace megablocks
160
+
161
+ #undef CUB_WRAPPED_NAMESPACE
csrc/new_cumsum.h ADDED
@@ -0,0 +1,11 @@
1
+ #pragma once
2
+
3
+ #include <torch/all.h>
4
+
5
+ namespace megablocks {
6
+
7
+ // Forward declarations for the public interface functions
8
+ void exclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out);
9
+ void inclusive_cumsum(torch::Tensor x, int dim, torch::Tensor out);
10
+
11
+ } // namespace megablocks
csrc/new_histogram.cu ADDED
@@ -0,0 +1,85 @@
1
+ #undef CUB_WRAPPED_NAMESPACE
2
+ #define CUB_WRAPPED_NAMESPACE megablocks
3
+
4
+ #include "new_histogram.h"
5
+ #include <cstdint>
6
+ #include <hipcub/hipcub.hpp>
7
+ #include <c10/cuda/CUDAStream.h>
8
+
9
+ #define CUDA_CALL(code) \
10
+ do { \
11
+ cudaError_t status = code; \
12
+ std::string err = cudaGetErrorString(status); \
13
+ TORCH_CHECK(status == cudaSuccess, err); \
14
+ } while (0)
15
+
16
+ namespace megablocks {
17
+
18
+ template <typename T>
19
+ torch::Tensor cub_histogram(torch::Tensor x, int num_bins) {
20
+ // Allocate the count buffer.
21
+ auto options = torch::TensorOptions()
22
+ .dtype(torch::kInt32)
23
+ .device(x.device());
24
+ torch::Tensor out = torch::empty({x.size(0), num_bins}, options);
25
+
26
+ // Exit early if there is no work to do.
27
+ if (out.numel() == 0) return out;
28
+
29
+ // Get scratchpad size.
30
+ size_t scratchpad_bytes = 0;
31
+ CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(nullptr,
32
+ scratchpad_bytes,
33
+ x.data_ptr<T>(),
34
+ out.data_ptr<int>(),
35
+ /*num_levels=*/num_bins + 1,
36
+ /*lower_level=*/0,
37
+ /*upper_level=*/num_bins,
38
+ /*num_samples=*/int(x.size(1)),
39
+ c10::cuda::getCurrentCUDAStream()));
40
+
41
+ // Allocate scratchpad.
42
+ options = torch::TensorOptions().dtype(torch::kInt8).device(x.device());
43
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes, options);
44
+
45
+ // Run the kernel.
46
+ for (int i = 0; i < x.size(0); ++i) {
47
+ CUDA_CALL(hipcub::DeviceHistogram::HistogramEven(scratchpad.data_ptr(),
48
+ scratchpad_bytes,
49
+ x.data_ptr<T>() + x.size(1) * i,
50
+ out.data_ptr<int>() + out.size(1) * i,
51
+ /*num_levels=*/num_bins + 1,
52
+ /*lower_level=*/0,
53
+ /*upper_level=*/num_bins,
54
+ /*num_samples=*/int(x.size(1)),
55
+ c10::cuda::getCurrentCUDAStream()));
56
+ }
57
+ return out;
58
+ }
59
+
60
+ torch::Tensor histogram(torch::Tensor x, int num_bins) {
61
+ TORCH_CHECK(x.is_cuda());
62
+ TORCH_CHECK(x.ndimension() == 1 || x.ndimension() == 2);
63
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
64
+ x.scalar_type() == torch::kInt32 ||
65
+ x.scalar_type() == torch::kInt64);
66
+ bool no_batch = x.ndimension() == 1;
67
+ if (no_batch) x = x.view({1, x.numel()});
68
+
69
+ if (x.scalar_type() == torch::kInt16) {
70
+ auto out = cub_histogram<short>(x, num_bins);
71
+ return no_batch ? out.flatten() : out;
72
+ } else if (x.scalar_type() == torch::kInt32) {
73
+ auto out = cub_histogram<int>(x, num_bins);
74
+ return no_batch ? out.flatten() : out;
75
+ } else {
76
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
77
+ auto out = cub_histogram<long>(x, num_bins);
78
+ return no_batch ? out.flatten() : out;
79
+ }
80
+ }
81
+
82
+ } // namespace megablocks
83
+
84
+ #undef CUDA_CALL
85
+ #undef CUB_WRAPPED_NAMESPACE
csrc/new_histogram.h ADDED
@@ -0,0 +1,10 @@
1
+ #pragma once
2
+
3
+ #include <torch/all.h>
4
+
5
+ namespace megablocks {
6
+
7
+ // Public interface function for computing histograms
8
+ torch::Tensor histogram(torch::Tensor x, int num_bins);
9
+
10
+ } // namespace megablocks
csrc/new_indices.cu ADDED
@@ -0,0 +1,97 @@
1
+ #include "new_indices.h"
2
+ #include <cstdint>
3
+ #include <c10/util/Half.h>
4
+ #include <c10/cuda/CUDAStream.h>
5
+
6
+ #define CUDA_CALL(code) \
7
+ do { \
8
+ cudaError_t status = code; \
9
+ std::string err = cudaGetErrorString(status); \
10
+ TORCH_CHECK(status == cudaSuccess, err); \
11
+ } while (0)
12
+
13
+ namespace megablocks {
14
+ namespace construct_indices {
15
+
16
+ // We expect the number of outputs per block to be small. For
17
+ // example, with ffn_hidden_size=4096, we only need to write
18
+ // 32 elements per block per iteration.
19
+ const int kThreadsPerBlock = 32;
20
+
21
+ __global__ void __launch_bounds__(kThreadsPerBlock)
22
+ ConstructIndicesKernel(short * __restrict__ indices,
23
+ int num_columns,
24
+ int block_size,
25
+ const int * __restrict__ padded_bins) {
26
+ // Load the offset for this bin's indices.
27
+ int start = 0;
28
+ if (blockIdx.x > 0) start = __ldg(padded_bins + blockIdx.x - 1);
29
+ int end = __ldg(padded_bins + blockIdx.x);
30
+
31
+ // Divide the start and end into blocks.
32
+ start /= block_size;
33
+ end /= block_size;
34
+
35
+ // Offset the output buffer to the start of the bin.
36
+ indices += (start + blockIdx.y) * num_columns + threadIdx.x;
37
+
38
+ // Write the indices to the output.
39
+ int bin_offset = blockIdx.y;
40
+ int num_rows = end - start;
41
+ for (; bin_offset < num_rows; num_rows -= gridDim.y) {
42
+ short *out = indices;
43
+ for (int bid = threadIdx.x; bid < num_columns; bid += kThreadsPerBlock) {
44
+ *out = bid + (blockIdx.x * num_columns);
45
+ out += kThreadsPerBlock;
46
+ }
47
+ indices += gridDim.y * num_columns;
48
+ }
49
+ }
50
+
51
+ cudaError_t ConstructIndices(short * __restrict__ indices,
52
+ int output_block_rows,
53
+ int output_block_columns,
54
+ int block_size,
55
+ const int * __restrict__ padded_bins,
56
+ int num_bins,
57
+ cudaStream_t stream) {
58
+ dim3 block_dim(kThreadsPerBlock);
59
+ dim3 grid_dim(num_bins, (int)std::ceil((float)output_block_rows / num_bins));
60
+ ConstructIndicesKernel<<<grid_dim, block_dim, 0, stream>>>(indices,
61
+ output_block_columns,
62
+ block_size,
63
+ padded_bins);
64
+ return cudaGetLastError();
65
+ }
66
+
67
+ } // namespace construct_indices
68
+
69
+ void indices(torch::Tensor padded_bins,
70
+ int block_size,
71
+ int output_block_rows,
72
+ int output_block_columns,
73
+ torch::Tensor out) {
74
+ TORCH_CHECK(padded_bins.is_cuda());
75
+ TORCH_CHECK(padded_bins.ndimension() == 1);
76
+ TORCH_CHECK(padded_bins.scalar_type() == torch::kInt);
77
+
78
+ TORCH_CHECK(out.is_cuda());
79
+ TORCH_CHECK(out.ndimension() == 1);
80
+ TORCH_CHECK(out.scalar_type() == torch::kInt16);
81
+ TORCH_CHECK(out.numel() == (output_block_rows * output_block_columns));
82
+
83
+ // Exit early if there is no work to do.
84
+ if (out.numel() == 0) return;
85
+
86
+ CUDA_CALL(construct_indices::ConstructIndices(out.data_ptr<short>(),
87
+ output_block_rows,
88
+ output_block_columns,
89
+ block_size,
90
+ padded_bins.data_ptr<int>(),
91
+ padded_bins.numel(),
92
+ c10::cuda::getCurrentCUDAStream()));
93
+ }
94
+
95
+ } // namespace megablocks
96
+
97
+ #undef CUDA_CALL
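As a rough reference for what ConstructIndices produces, an illustrative NumPy sketch (not the binding itself); padded_bins holds inclusive, block_size-aligned cumulative token counts per bin:

import numpy as np

def construct_indices_ref(padded_bins, block_size, rows, columns):
    # Every output row that falls inside bin b holds the column ids of
    # bin b's blocks, i.e. b * columns .. (b + 1) * columns - 1.
    out = np.zeros((rows, columns), dtype=np.int16)
    start = 0
    for b, end in enumerate(np.asarray(padded_bins) // block_size):
        out[start:end, :] = np.arange(columns) + b * columns
        start = end
    return out.flatten()

# Example: padded_bins = [256, 384] (inclusive cumulative), block_size=128, 8 block columns.
print(construct_indices_ref([256, 384], 128, 3, 8))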
csrc/new_indices.h ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <torch/all.h>
4
+
5
+ namespace megablocks {
6
+
7
+ // Public interface function for constructing indices from padded bins
8
+ void indices(torch::Tensor padded_bins,
9
+ int block_size,
10
+ int output_block_rows,
11
+ int output_block_columns,
12
+ torch::Tensor out);
13
+
14
+ } // namespace megablocks
csrc/new_replicate.cu ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Modifications: Copyright Advanced Micro Devices, Inc. SPDX License: MIT.
2
+
3
+ #undef CUB_WRAPPED_NAMESPACE
4
+ #define CUB_WRAPPED_NAMESPACE megablocks
5
+
6
+ #include "new_replicate.h"
7
+
8
+ #include <cstdint>
9
+
10
+ #include <cub/cub.cuh>
11
+ #include <c10/util/Half.h>
12
+ #include <c10/cuda/CUDAStream.h>
13
+
14
+ #ifndef USE_ROCM
15
+ #define _LDG(arg) __ldg(arg)
16
+ #else
17
+ #define _LDG(arg) *(arg)
18
+ #endif
19
+
20
+ #define CUDA_CALL(code) \
21
+ do { \
22
+ cudaError_t status = code; \
23
+ std::string err = cudaGetErrorString(status); \
24
+ TORCH_CHECK(status == cudaSuccess, err); \
25
+ } while (0)
26
+
27
+ namespace megablocks {
28
+ namespace replicate {
29
+
30
+ template <typename T, int kThreadsPerBlock>
31
+ __global__ void __launch_bounds__(kThreadsPerBlock)
32
+ ReplicateForwardKernel(T * __restrict__ x,
33
+ int * __restrict__ bins,
34
+ T * __restrict__ out,
35
+ int columns) {
36
+ // Offset to this threadblock's batch.
37
+ //
38
+ // x is [batch_size, num_bins]
39
+ // out is [batch_size, columns]
40
+ // bins is [num_bins]
41
+ int batch_idx = blockIdx.y;
42
+ int num_bins = gridDim.x;
43
+ x += batch_idx * num_bins;
44
+ out += batch_idx * columns;
45
+
46
+ // Load the start/end for this bin.
47
+ int bin_idx = blockIdx.x;
48
+ int start = 0;
49
+ if (bin_idx > 0) start = _LDG(bins + bin_idx - 1);
50
+ int end = _LDG(bins + bin_idx);
51
+
52
+ // Load the value to replicate.
53
+ T value = _LDG((T*)x + bin_idx);
54
+
55
+ // Offset to this threadblock's bin and this thread's
56
+ // offset within the bin.
57
+ int bin_offset = blockIdx.z * kThreadsPerBlock + threadIdx.x;
58
+ out += start + bin_offset;
59
+
60
+ // Replicate the value to the output.
61
+ //
62
+ // TODO(tgale): Vectorize these stores.
63
+ int num_elements = end - start;
64
+ const int kElementsPerLoop = gridDim.z * kThreadsPerBlock;
65
+ T *out_ptr = (T*)out;
66
+ for (; bin_offset < num_elements; num_elements -= kElementsPerLoop) {
67
+ *out_ptr = value;
68
+ out_ptr += kElementsPerLoop;
69
+ }
70
+ }
71
+
72
+ template <typename T>
73
+ cudaError_t ReplicateForward(T *x,
74
+ int batch_size,
75
+ int num_bins,
76
+ int *bins,
77
+ T *out,
78
+ int columns,
79
+ cudaStream_t stream) {
80
+ const int kThreadsPerBlock = 64;
81
+ dim3 block_dim(kThreadsPerBlock, 1, 1);
82
+ int group_size = std::ceil((float)columns / (num_bins * kThreadsPerBlock));
83
+ dim3 grid_dim(num_bins, batch_size, group_size);
84
+ ReplicateForwardKernel<T, kThreadsPerBlock><<<
85
+ grid_dim, block_dim, 0, stream>>>(x, bins, out, columns);
86
+ return cudaGetLastError();
87
+ }
88
+
89
+ void cub_segmented_reduce(torch::Tensor grad,
90
+ torch::Tensor bins,
91
+ torch::Tensor out,
92
+ cudaStream_t stream) {
93
+ // Prepend a zero to the bin boundaries for CUB.
94
+ torch::Tensor offsets = torch::empty(bins.numel() + 1, bins.options());
95
+ CUDA_CALL(cudaMemsetAsync(offsets.data_ptr<int>(),
96
+ 0,
97
+ offsets.numel() * sizeof(int),
98
+ stream));
99
+ CUDA_CALL(cudaMemcpyAsync(offsets.data_ptr<int>() + 1,
100
+ bins.data_ptr<int>(),
101
+ bins.numel() * sizeof(int),
102
+ cudaMemcpyDeviceToDevice,
103
+ stream));
104
+
105
+ // Get temporary buffer size.
106
+ size_t scratchpad_bytes = 0;
107
+ CUDA_CALL(cub::DeviceSegmentedReduce::Sum(nullptr,
108
+ scratchpad_bytes,
109
+ grad.data_ptr<c10::Half>(),
110
+ out.data_ptr<c10::Half>(),
111
+ bins.numel(),
112
+ offsets.data_ptr<int>(),
113
+ offsets.data_ptr<int>() + 1,
114
+ stream));
115
+
116
+ // Allocate scratchpad.
117
+ auto options = torch::TensorOptions()
118
+ .dtype(torch::kInt8)
119
+ .device(grad.device());
120
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes, options);
121
+
122
+ // Run the kernel for each batch item.
123
+ for (int i = 0; i < grad.size(0); ++i) {
124
+ int num_bins = out.size(1);
125
+ int num_values = grad.size(1);
126
+ CUDA_CALL(cub::DeviceSegmentedReduce::Sum(scratchpad.data_ptr<int8_t>(),
127
+ scratchpad_bytes,
128
+ grad.data_ptr<c10::Half>() + i * num_values,
129
+ out.data_ptr<c10::Half>() + i * num_bins,
130
+ bins.numel(),
131
+ offsets.data_ptr<int>(),
132
+ offsets.data_ptr<int>() + 1,
133
+ stream));
134
+ }
135
+ }
136
+
137
+ } // namespace replicate
138
+
139
+ void replicate_forward(torch::Tensor x,
140
+ torch::Tensor bins,
141
+ torch::Tensor out) {
142
+ // Validate the inputs.
143
+ TORCH_CHECK(x.is_cuda());
144
+ TORCH_CHECK(x.ndimension() == 2);
145
+ TORCH_CHECK(x.scalar_type() == torch::kFloat16 ||
146
+ x.scalar_type() == torch::kInt16 ||
147
+ x.scalar_type() == torch::kInt32);
148
+ TORCH_CHECK(bins.is_cuda());
149
+ TORCH_CHECK(bins.ndimension() == 1);
150
+ TORCH_CHECK(bins.scalar_type() == torch::kInt);
151
+ TORCH_CHECK(out.is_cuda());
152
+ TORCH_CHECK(out.ndimension() == 2);
153
+ TORCH_CHECK(out.scalar_type() == x.scalar_type());
154
+
155
+ // Batch dimensions should match for input/output.
156
+ TORCH_CHECK(x.size(0) == out.size(0));
157
+
158
+ // One input for each bin (in each batch).
159
+ TORCH_CHECK(x.size(1) == bins.size(0));
160
+
161
+ // Exit early if there is no work to do.
162
+ if (out.numel() == 0) return;
163
+
164
+ switch (x.scalar_type()) {
165
+ case torch::kFloat16:
166
+ CUDA_CALL(replicate::ReplicateForward(x.data_ptr<c10::Half>(),
167
+ x.size(0),
168
+ x.size(1),
169
+ bins.data_ptr<int>(),
170
+ out.data_ptr<c10::Half>(),
171
+ out.size(1),
172
+ c10::cuda::getCurrentCUDAStream()));
173
+ return;
174
+ case torch::kInt32:
175
+ CUDA_CALL(replicate::ReplicateForward(x.data_ptr<int>(),
176
+ x.size(0),
177
+ x.size(1),
178
+ bins.data_ptr<int>(),
179
+ out.data_ptr<int>(),
180
+ out.size(1),
181
+ c10::cuda::getCurrentCUDAStream()));
182
+ return;
183
+ }
184
+ TORCH_CHECK(x.scalar_type() == torch::kInt16);
185
+ CUDA_CALL(replicate::ReplicateForward(x.data_ptr<short>(),
186
+ x.size(0),
187
+ x.size(1),
188
+ bins.data_ptr<int>(),
189
+ out.data_ptr<short>(),
190
+ out.size(1),
191
+ c10::cuda::getCurrentCUDAStream()));
192
+ }
193
+
194
+ void replicate_backward(torch::Tensor grad,
195
+ torch::Tensor bins,
196
+ torch::Tensor out) {
197
+ // Validate the inputs.
198
+ TORCH_CHECK(grad.is_cuda());
199
+ TORCH_CHECK(grad.ndimension() == 2);
200
+ TORCH_CHECK(grad.scalar_type() == torch::kFloat16);
201
+ TORCH_CHECK(bins.is_cuda());
202
+ TORCH_CHECK(bins.ndimension() == 1);
203
+ TORCH_CHECK(bins.scalar_type() == torch::kInt);
204
+ TORCH_CHECK(out.is_cuda());
205
+ TORCH_CHECK(out.ndimension() == 2);
206
+ TORCH_CHECK(out.scalar_type() == torch::kFloat16);
207
+
208
+ // Batch dimensions should match for input/output.
209
+ TORCH_CHECK(grad.size(0) == out.size(0));
210
+
211
+ // One output for each bin (in each batch).
212
+ TORCH_CHECK(out.size(1) == bins.size(0));
213
+
214
+ replicate::cub_segmented_reduce(grad, bins, out, c10::cuda::getCurrentCUDAStream());
215
+ }
216
+
217
+ } // namespace megablocks
218
+
219
+ #undef CUDA_CALL
220
+ #undef CUB_WRAPPED_NAMESPACE
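To make the forward/backward contract concrete, an illustrative PyTorch restatement of the kernel semantics (a sketch only, not the exported binding):

import torch

def replicate_forward_ref(x, bins, columns):
    # out[b, start:end] = x[b, i] for bin i, where start/end come from the
    # inclusive cumulative boundaries stored in `bins`.
    out = x.new_empty(x.size(0), columns)
    start = 0
    for i in range(bins.numel()):
        end = int(bins[i])
        out[:, start:end] = x[:, i:i + 1]
        start = end
    return out

def replicate_backward_ref(grad, bins):
    # The backward pass is a segmented sum of grad over the same boundaries.
    out = grad.new_empty(grad.size(0), bins.numel())
    start = 0
    for i in range(bins.numel()):
        end = int(bins[i])
        out[:, i] = grad[:, start:end].sum(dim=1)
        start = end
    return out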
csrc/new_replicate.h ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <torch/all.h>
4
+
5
+ namespace megablocks {
6
+
7
+ // Forward pass: replicate values from x according to bin sizes
8
+ void replicate_forward(torch::Tensor x,
9
+ torch::Tensor bins,
10
+ torch::Tensor out);
11
+
12
+ // Backward pass: reduce gradients back to bins using segmented reduction
13
+ void replicate_backward(torch::Tensor grad,
14
+ torch::Tensor bins,
15
+ torch::Tensor out);
16
+
17
+ } // namespace megablocks
csrc/new_sort.cu ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #undef CUB_WRAPPED_NAMESPACE
2
+ #define CUB_WRAPPED_NAMESPACE megablocks
3
+
4
+ #include "new_sort.h"
5
+ #include <cstdint>
6
+ #include <cub/cub.cuh>
7
+ #include <c10/cuda/CUDAStream.h>
8
+
9
+ #define CUDA_CALL(code) \
10
+ do { \
11
+ cudaError_t status = code; \
12
+ std::string err = cudaGetErrorString(status); \
13
+ TORCH_CHECK(status == cudaSuccess, err); \
14
+ } while (0)
15
+
16
+ namespace megablocks {
17
+
18
+ template <typename T>
19
+ void cub_radix_sort(torch::Tensor x,
20
+ int end_bit,
21
+ torch::Tensor x_out,
22
+ torch::Tensor iota_out) {
23
+ // Get iota for values in sort.
24
+ torch::Tensor iota = torch::arange(0, x.numel(), x.options());
25
+
26
+ // Get temporary buffer size.
27
+ size_t scratchpad_bytes = 0;
28
+ CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr,
29
+ scratchpad_bytes,
30
+ x.data_ptr<T>(),
31
+ x_out.data_ptr<T>(),
32
+ iota.data_ptr<T>(),
33
+ iota_out.data_ptr<T>(),
34
+ x.numel(),
35
+ /*begin_bit*/0,
36
+ /*end_bit=*/end_bit,
37
+ c10::cuda::getCurrentCUDAStream()));
38
+
39
+ // Allocate scratchpad.
40
+ auto options = torch::TensorOptions()
41
+ .dtype(torch::kInt8)
42
+ .device(x.device());
43
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes, options);
44
+
45
+ // Run the kernel.
46
+ CUDA_CALL(cub::DeviceRadixSort::SortPairs(scratchpad.data_ptr(),
47
+ scratchpad_bytes,
48
+ x.data_ptr<T>(),
49
+ x_out.data_ptr<T>(),
50
+ iota.data_ptr<T>(),
51
+ iota_out.data_ptr<T>(),
52
+ x.numel(),
53
+ /*begin_bit=*/0,
54
+ /*end_bit=*/end_bit,
55
+ c10::cuda::getCurrentCUDAStream()));
56
+ }
57
+
58
+ void sort(torch::Tensor x,
59
+ int end_bit,
60
+ torch::Tensor x_out,
61
+ torch::Tensor iota_out) {
62
+ TORCH_CHECK(x.is_cuda());
63
+ TORCH_CHECK(x.ndimension() == 1);
64
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
65
+ x.scalar_type() == torch::kInt32 ||
66
+ x.scalar_type() == torch::kInt64);
67
+ TORCH_CHECK(x_out.is_cuda());
68
+ TORCH_CHECK(x_out.ndimension() == 1);
69
+ TORCH_CHECK(x_out.scalar_type() == x.scalar_type());
70
+ TORCH_CHECK(iota_out.is_cuda());
71
+ TORCH_CHECK(iota_out.ndimension() == 1);
72
+ TORCH_CHECK(iota_out.scalar_type() == x.scalar_type());
73
+
74
+ // Exit early if there is no work to do.
75
+ if (x_out.numel() == 0) return;
76
+
77
+ switch (x.scalar_type()) {
78
+ case torch::kInt16:
79
+ return cub_radix_sort<short>(x, end_bit, x_out, iota_out);
80
+ case torch::kInt32:
81
+ return cub_radix_sort<int>(x, end_bit, x_out, iota_out);
82
+ }
83
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
84
+ return cub_radix_sort<long>(x, end_bit, x_out, iota_out);
85
+ }
86
+
87
+ } // namespace megablocks
88
+
89
+ #undef CUDA_CALL
90
+ #undef CUB_WRAPPED_NAMESPACE
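The typical call pattern from Python, as exercised in tests/ops/padded_gather_test.py (the one-argument form assumes the wrapper picks a default end_bit; that default is not spelled out here):

import torch
from megablocks import ops

# Assign 4096 tokens to 64 experts, then group token positions by expert.
num_experts = 64
top_expert = torch.randint(0, num_experts, (4096,), device="cuda", dtype=torch.int32)

# bin_ids: the sorted expert ids; indices: the permutation that groups tokens by expert.
bin_ids, indices = ops.sort(top_expert)
bins = ops.inclusive_cumsum(ops.histogram(top_expert, num_experts), 0)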
csrc/new_sort.h ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <torch/all.h>
4
+
5
+ namespace megablocks {
6
+
7
+ // Public interface function for radix sorting with indices
8
+ void sort(torch::Tensor x,
9
+ int end_bit,
10
+ torch::Tensor x_out,
11
+ torch::Tensor iota_out);
12
+
13
+ } // namespace megablocks
csrc/replicate.h ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #undef CUB_WRAPPED_NAMESPACE
2
+ #define CUB_WRAPPED_NAMESPACE megablocks
3
+
4
+ #include <cstdint>
5
+
6
+ #include <cub/cub.cuh>
7
+ #include <c10/util/Half.h>
8
+ #include <c10/cuda/CUDAStream.h>
9
+ // #include <torch/extension.h>
10
+
11
+ #define CUDA_CALL(code) \
12
+ do { \
13
+ cudaError_t status = code; \
14
+ std::string err = cudaGetErrorString(status); \
15
+ TORCH_CHECK(status == cudaSuccess, err); \
16
+ } while (0)
17
+
18
+ namespace megablocks {
19
+ namespace replicate {
20
+
21
+ template <typename T, int kThreadsPerBlock>
22
+ __global__ void __launch_bounds__(kThreadsPerBlock)
23
+ ReplicateForwardKernel(T * __restrict__ x,
24
+ int * __restrict__ bins,
25
+ T * __restrict__ out,
26
+ int columns) {
27
+ // Offset to this threadblocks batch.
28
+ //
29
+ // x is [batch_size, num_bins]
30
+ // out is [batch_size, columns]
31
+ // bins is [num_bins]
32
+ int batch_idx = blockIdx.y;
33
+ int num_bins = gridDim.x;
34
+ x += batch_idx * num_bins;
35
+ out += batch_idx * columns;
36
+
37
+ // Load the start/end for this bin.
38
+ int bin_idx = blockIdx.x;
39
+ int start = 0;
40
+ if (bin_idx > 0) start = __ldg(bins + bin_idx - 1);
41
+ int end = __ldg(bins + bin_idx);
42
+
43
+ // Load the value to replicate.
44
+ T value = __ldg((T*)x + bin_idx);
45
+
46
+ // Offset to this threadblocks bin and this threads
47
+ // offset within the bin.
48
+ int bin_offset = blockIdx.z * kThreadsPerBlock + threadIdx.x;
49
+ out += start + bin_offset;
50
+
51
+ // Replicate the value to the output.
52
+ //
53
+ // TODO(tgale): Vectorize these stores.
54
+ int num_elements = end - start;
55
+ const int kElementsPerLoop = gridDim.z * kThreadsPerBlock;
56
+ T *out_ptr = (T*)out;
57
+ for (; bin_offset < num_elements; num_elements -= kElementsPerLoop) {
58
+ *out_ptr = value;
59
+ out_ptr += kElementsPerLoop;
60
+ }
61
+ }
62
+
63
+ template <typename T>
64
+ cudaError_t ReplicateForward(T *x,
65
+ int batch_size,
66
+ int num_bins,
67
+ int *bins,
68
+ T *out,
69
+ int columns,
70
+ cudaStream_t stream) {
71
+ const int kThreadsPerBlock = 64;
72
+ dim3 block_dim(kThreadsPerBlock, 1, 1);
73
+ int group_size = std::ceil((float)columns / (num_bins * kThreadsPerBlock));
74
+ dim3 grid_dim(num_bins, batch_size, group_size);
75
+ ReplicateForwardKernel<T, kThreadsPerBlock><<<
76
+ grid_dim, block_dim, 0, stream>>>(x, bins, out, columns);
77
+ return cudaGetLastError();
78
+ }
79
+
80
+ void cub_segmented_reduce(torch::Tensor grad,
81
+ torch::Tensor bins,
82
+ torch::Tensor out,
83
+ cudaStream_t stream) {
84
+ // Append a zero to the bin boundaries for CUB.
85
+ torch::Tensor offsets = torch::empty(bins.numel() + 1, bins.options());
86
+ CUDA_CALL(cudaMemsetAsync(offsets.data_ptr<int>(),
87
+ 0,
88
+ offsets.numel() * sizeof(int),
89
+ stream));
90
+ CUDA_CALL(cudaMemcpyAsync(offsets.data_ptr<int>() + 1,
91
+ bins.data_ptr<int>(),
92
+ bins.numel() * sizeof(int),
93
+ cudaMemcpyDeviceToDevice,
94
+ stream));
95
+
96
+ // Get temporary buffer size.
97
+ size_t scratchpad_bytes = 0;
98
+ CUDA_CALL(cub::DeviceSegmentedReduce::Sum(nullptr,
99
+ scratchpad_bytes,
100
+ grad.data_ptr<c10::Half>(),
101
+ out.data_ptr<c10::Half>(),
102
+ bins.numel(),
103
+ offsets.data_ptr<int>(),
104
+ offsets.data_ptr<int>() + 1,
105
+ stream));
106
+
107
+ // Allocate scratchpad.
108
+ auto options = torch::TensorOptions()
109
+ .dtype(torch::kInt8)
110
+ .device(grad.device());
111
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes, options);
112
+
113
+ // Run the kernel for each batch item.
114
+ for (int i = 0; i < grad.size(0); ++i) {
115
+ int num_bins = out.size(1);
116
+ int num_values = grad.size(1);
117
+ CUDA_CALL(cub::DeviceSegmentedReduce::Sum(scratchpad.data_ptr<int8_t>(),
118
+ scratchpad_bytes,
119
+ grad.data_ptr<c10::Half>() + i * num_values,
120
+ out.data_ptr<c10::Half>() + i * num_bins,
121
+ bins.numel(),
122
+ offsets.data_ptr<int>(),
123
+ offsets.data_ptr<int>() + 1,
124
+ stream));
125
+ }
126
+ }
127
+
128
+ } // namespace replicate
129
+
130
+ void replicate_forward(torch::Tensor x,
131
+ torch::Tensor bins,
132
+ torch::Tensor out) {
133
+ // Validate the inputs.
134
+ TORCH_CHECK(x.is_cuda());
135
+ TORCH_CHECK(x.ndimension() == 2);
136
+ TORCH_CHECK(x.scalar_type() == torch::kFloat16 ||
137
+ x.scalar_type() == torch::kInt16 ||
138
+ x.scalar_type() == torch::kInt32);
139
+ TORCH_CHECK(bins.is_cuda());
140
+ TORCH_CHECK(bins.ndimension() == 1);
141
+ TORCH_CHECK(bins.scalar_type() == torch::kInt);
142
+ TORCH_CHECK(out.is_cuda());
143
+ TORCH_CHECK(out.ndimension() == 2);
144
+ TORCH_CHECK(out.scalar_type() == x.scalar_type());
145
+
146
+ // Batch dimensions should match for input/output.
147
+ TORCH_CHECK(x.size(0) == out.size(0));
148
+
149
+ // One input for each bin (in each batch).
150
+ TORCH_CHECK(x.size(1) == bins.size(0));
151
+
152
+ // Exit early if there is no work to do.
153
+ if (out.numel() == 0) return;
154
+
155
+ switch (x.scalar_type()) {
156
+ case torch::kFloat16:
157
+ CUDA_CALL(replicate::ReplicateForward(x.data_ptr<c10::Half>(),
158
+ x.size(0),
159
+ x.size(1),
160
+ bins.data_ptr<int>(),
161
+ out.data_ptr<c10::Half>(),
162
+ out.size(1),
163
+ c10::cuda::getCurrentCUDAStream()));
164
+ return;
165
+ case torch::kInt32:
166
+ CUDA_CALL(replicate::ReplicateForward(x.data_ptr<int>(),
167
+ x.size(0),
168
+ x.size(1),
169
+ bins.data_ptr<int>(),
170
+ out.data_ptr<int>(),
171
+ out.size(1),
172
+ c10::cuda::getCurrentCUDAStream()));
173
+ return;
174
+ }
175
+ TORCH_CHECK(x.scalar_type() == torch::kInt16);
176
+ CUDA_CALL(replicate::ReplicateForward(x.data_ptr<short>(),
177
+ x.size(0),
178
+ x.size(1),
179
+ bins.data_ptr<int>(),
180
+ out.data_ptr<short>(),
181
+ out.size(1),
182
+ c10::cuda::getCurrentCUDAStream()));
183
+ }
184
+
185
+ void replicate_backward(torch::Tensor grad,
186
+ torch::Tensor bins,
187
+ torch::Tensor out) {
188
+ // Validate the inputs.
189
+ TORCH_CHECK(grad.is_cuda());
190
+ TORCH_CHECK(grad.ndimension() == 2);
191
+ TORCH_CHECK(grad.scalar_type() == torch::kFloat16);
192
+ TORCH_CHECK(bins.is_cuda());
193
+ TORCH_CHECK(bins.ndimension() == 1);
194
+ TORCH_CHECK(bins.scalar_type() == torch::kInt);
195
+ TORCH_CHECK(out.is_cuda());
196
+ TORCH_CHECK(out.ndimension() == 2);
197
+ TORCH_CHECK(out.scalar_type() == torch::kFloat16);
198
+
199
+ // Batch dimensions should match for input/output.
200
+ TORCH_CHECK(grad.size(0) == out.size(0));
201
+
202
+ // One output for each bin (in each batch).
203
+ TORCH_CHECK(out.size(1) == bins.size(0));
204
+
205
+ replicate::cub_segmented_reduce(grad, bins, out, c10::cuda::getCurrentCUDAStream());
206
+ }
207
+
208
+ } // namespace megablocks
209
+
210
+ #undef CUDA_CALL
211
+ #undef CUB_WRAPPED_NAMESPACE
csrc/sort.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #undef CUB_WRAPPED_NAMESPACE
2
+ #define CUB_WRAPPED_NAMESPACE megablocks
3
+
4
+ #include <cstdint>
5
+
6
+ #include <cub/cub.cuh>
7
+ #include <c10/cuda/CUDAStream.h>
8
+ // #include <torch/extension.h>
9
+
10
+ #define CUDA_CALL(code) \
11
+ do { \
12
+ cudaError_t status = code; \
13
+ std::string err = cudaGetErrorString(status); \
14
+ TORCH_CHECK(status == cudaSuccess, err); \
15
+ } while (0)
16
+
17
+ namespace megablocks {
18
+
19
+ template <typename T>
20
+ void cub_radix_sort(torch::Tensor x,
21
+ int end_bit,
22
+ torch::Tensor x_out,
23
+ torch::Tensor iota_out) {
24
+ // Get iota for values in sort.
25
+ torch::Tensor iota = torch::arange(0, x.numel(), x.options());
26
+
27
+ // Get temporary buffer size.
28
+ size_t scratchpad_bytes = 0;
29
+ CUDA_CALL(cub::DeviceRadixSort::SortPairs(nullptr,
30
+ scratchpad_bytes,
31
+ x.data_ptr<T>(),
32
+ x_out.data_ptr<T>(),
33
+ iota.data_ptr<T>(),
34
+ iota_out.data_ptr<T>(),
35
+ x.numel(),
36
+ /*begin_bit*/0,
37
+ /*end_bit=*/end_bit,
38
+ c10::cuda::getCurrentCUDAStream()));
39
+
40
+ // Allocate scratchpad.
41
+ auto options = torch::TensorOptions()
42
+ .dtype(torch::kInt8)
43
+ .device(x.device());
44
+ torch::Tensor scratchpad = torch::empty(scratchpad_bytes, options);
45
+
46
+ // Run the kernel.
47
+ CUDA_CALL(cub::DeviceRadixSort::SortPairs(scratchpad.data_ptr(),
48
+ scratchpad_bytes,
49
+ x.data_ptr<T>(),
50
+ x_out.data_ptr<T>(),
51
+ iota.data_ptr<T>(),
52
+ iota_out.data_ptr<T>(),
53
+ x.numel(),
54
+ /*begin_bit=*/0,
55
+ /*end_bit=*/end_bit,
56
+ c10::cuda::getCurrentCUDAStream()));
57
+ }
58
+
59
+ void sort(torch::Tensor x,
60
+ int end_bit,
61
+ torch::Tensor x_out,
62
+ torch::Tensor iota_out) {
63
+ TORCH_CHECK(x.is_cuda());
64
+ TORCH_CHECK(x.ndimension() == 1);
65
+ TORCH_CHECK(x.scalar_type() == torch::kInt16 ||
66
+ x.scalar_type() == torch::kInt32 ||
67
+ x.scalar_type() == torch::kInt64);
68
+ TORCH_CHECK(x_out.is_cuda());
69
+ TORCH_CHECK(x_out.ndimension() == 1);
70
+ TORCH_CHECK(x_out.scalar_type() == x.scalar_type());
71
+ TORCH_CHECK(iota_out.is_cuda());
72
+ TORCH_CHECK(iota_out.ndimension() == 1);
73
+ TORCH_CHECK(iota_out.scalar_type() == x.scalar_type());
74
+
75
+ // Exit early if there is no work to do.
76
+ if (x_out.numel() == 0) return;
77
+
78
+ switch (x.scalar_type()) {
79
+ case torch::kInt16:
80
+ return cub_radix_sort<short>(x, end_bit, x_out, iota_out);
81
+ case torch::kInt32:
82
+ return cub_radix_sort<int>(x, end_bit, x_out, iota_out);
83
+ }
84
+ TORCH_CHECK(x.scalar_type() == torch::kInt64);
85
+ return cub_radix_sort<long>(x, end_bit, x_out, iota_out);
86
+ }
87
+
88
+ } // namespace megablocks
89
+
90
+ #undef CUDA_CALL
91
+ #undef CUB_WRAPPED_NAMESPACE
flake.lock ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": {
3
+ "flake-compat": {
4
+ "locked": {
5
+ "lastModified": 1747046372,
6
+ "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
7
+ "owner": "edolstra",
8
+ "repo": "flake-compat",
9
+ "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
10
+ "type": "github"
11
+ },
12
+ "original": {
13
+ "owner": "edolstra",
14
+ "repo": "flake-compat",
15
+ "type": "github"
16
+ }
17
+ },
18
+ "flake-compat_2": {
19
+ "locked": {
20
+ "lastModified": 1733328505,
21
+ "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
22
+ "owner": "edolstra",
23
+ "repo": "flake-compat",
24
+ "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
25
+ "type": "github"
26
+ },
27
+ "original": {
28
+ "owner": "edolstra",
29
+ "repo": "flake-compat",
30
+ "type": "github"
31
+ }
32
+ },
33
+ "flake-utils": {
34
+ "inputs": {
35
+ "systems": "systems"
36
+ },
37
+ "locked": {
38
+ "lastModified": 1731533236,
39
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
40
+ "owner": "numtide",
41
+ "repo": "flake-utils",
42
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
43
+ "type": "github"
44
+ },
45
+ "original": {
46
+ "owner": "numtide",
47
+ "repo": "flake-utils",
48
+ "type": "github"
49
+ }
50
+ },
51
+ "flake-utils_2": {
52
+ "inputs": {
53
+ "systems": "systems_2"
54
+ },
55
+ "locked": {
56
+ "lastModified": 1731533236,
57
+ "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
58
+ "owner": "numtide",
59
+ "repo": "flake-utils",
60
+ "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
61
+ "type": "github"
62
+ },
63
+ "original": {
64
+ "owner": "numtide",
65
+ "repo": "flake-utils",
66
+ "type": "github"
67
+ }
68
+ },
69
+ "hf-nix": {
70
+ "inputs": {
71
+ "flake-compat": "flake-compat_2",
72
+ "flake-utils": "flake-utils_2",
73
+ "nixpkgs": "nixpkgs"
74
+ },
75
+ "locked": {
76
+ "lastModified": 1751968576,
77
+ "narHash": "sha256-cmKrlWpNTG/hq1bCaHXfbdm9T+Y6V+5//EHAVc1TLBE=",
78
+ "owner": "huggingface",
79
+ "repo": "hf-nix",
80
+ "rev": "3fcd1e1b46da91b6691261640ffd6b7123d0cb9e",
81
+ "type": "github"
82
+ },
83
+ "original": {
84
+ "owner": "huggingface",
85
+ "repo": "hf-nix",
86
+ "type": "github"
87
+ }
88
+ },
89
+ "kernel-builder": {
90
+ "inputs": {
91
+ "flake-compat": "flake-compat",
92
+ "flake-utils": "flake-utils",
93
+ "hf-nix": "hf-nix",
94
+ "nixpkgs": [
95
+ "kernel-builder",
96
+ "hf-nix",
97
+ "nixpkgs"
98
+ ]
99
+ },
100
+ "locked": {
101
+ "lastModified": 1753256281,
102
+ "narHash": "sha256-CfL3Fyf2ih7OtyL7ScZUCwOeCj+gjlRyPykhR6Zbt3I=",
103
+ "owner": "huggingface",
104
+ "repo": "kernel-builder",
105
+ "rev": "dcbbdf2d3c8e78b27321b205b2c9d67ffce6a706",
106
+ "type": "github"
107
+ },
108
+ "original": {
109
+ "owner": "huggingface",
110
+ "repo": "kernel-builder",
111
+ "type": "github"
112
+ }
113
+ },
114
+ "nixpkgs": {
115
+ "locked": {
116
+ "lastModified": 1747820358,
117
+ "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
118
+ "owner": "danieldk",
119
+ "repo": "nixpkgs",
120
+ "rev": "d3c1681180717528068082103bf323147de6ab0b",
121
+ "type": "github"
122
+ },
123
+ "original": {
124
+ "owner": "danieldk",
125
+ "ref": "cudatoolkit-12.9-kernel-builder",
126
+ "repo": "nixpkgs",
127
+ "type": "github"
128
+ }
129
+ },
130
+ "root": {
131
+ "inputs": {
132
+ "kernel-builder": "kernel-builder"
133
+ }
134
+ },
135
+ "systems": {
136
+ "locked": {
137
+ "lastModified": 1681028828,
138
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
139
+ "owner": "nix-systems",
140
+ "repo": "default",
141
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
142
+ "type": "github"
143
+ },
144
+ "original": {
145
+ "owner": "nix-systems",
146
+ "repo": "default",
147
+ "type": "github"
148
+ }
149
+ },
150
+ "systems_2": {
151
+ "locked": {
152
+ "lastModified": 1681028828,
153
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
154
+ "owner": "nix-systems",
155
+ "repo": "default",
156
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
157
+ "type": "github"
158
+ },
159
+ "original": {
160
+ "owner": "nix-systems",
161
+ "repo": "default",
162
+ "type": "github"
163
+ }
164
+ }
165
+ },
166
+ "root": "root",
167
+ "version": 7
168
+ }
flake.nix ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ description = "Flake for megablocks_moe kernel";
3
+
4
+ inputs = {
5
+ kernel-builder.url = "github:huggingface/kernel-builder";
6
+ };
7
+
8
+ outputs =
9
+ {
10
+ self,
11
+ kernel-builder,
12
+ }:
13
+ kernel-builder.lib.genFlakeOutputs {
14
+ path = ./.;
15
+ rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
16
+
17
+ pythonCheckInputs = pkgs: with pkgs; [
18
+ tqdm
19
+ py-cpuinfo
20
+ importlib-metadata
21
+ torchmetrics
22
+ ];
23
+ };
24
+ }
tests/__init__.py ADDED
File without changes
tests/conftest.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import os
5
+ from typing import List, Optional
6
+
7
+ import pytest
8
+ # from composer.utils import reproducibility
9
+
10
+ # Allowed options for pytest.mark.world_size()
11
+ WORLD_SIZE_OPTIONS = (1, 2)
12
+
13
+ # Enforce deterministic mode before any tests start.
14
+ # reproducibility.configure_deterministic_mode()
15
+
16
+ # TODO: allow plugins when deps resolved
17
+
18
+ # Add the path of any pytest fixture files you want to make global
19
+ pytest_plugins = [
20
+ # 'tests.fixtures.autouse',
21
+ 'tests.fixtures.fixtures',
22
+ ]
23
+
24
+
25
+ def _get_world_size(item: pytest.Item):
26
+ """Returns the world_size of a test, defaults to 1."""
27
+ _default = pytest.mark.world_size(1).mark
28
+ return item.get_closest_marker('world_size', default=_default).args[0]
29
+
30
+
31
+ def _get_option(
32
+ config: pytest.Config,
33
+ name: str,
34
+ default: Optional[str] = None,
35
+ ) -> str: # type: ignore
36
+ val = config.getoption(name)
37
+ if val is not None:
38
+ assert isinstance(val, str)
39
+ return val
40
+ val = config.getini(name)
41
+ if val == []:
42
+ val = None
43
+ if val is None:
44
+ if default is None:
45
+ pytest.fail(f'Config option {name} is not specified but is required',)
46
+ val = default
47
+ assert isinstance(val, str)
48
+ return val
49
+
50
+
51
+ def _add_option(
52
+ parser: pytest.Parser,
53
+ name: str,
54
+ help: str,
55
+ choices: Optional[list[str]] = None,
56
+ ):
57
+ parser.addoption(
58
+ f'--{name}',
59
+ default=None,
60
+ type=str,
61
+ choices=choices,
62
+ help=help,
63
+ )
64
+ parser.addini(
65
+ name=name,
66
+ help=help,
67
+ type='string',
68
+ default=None,
69
+ )
70
+
71
+
72
+ def pytest_collection_modifyitems(
73
+ config: pytest.Config,
74
+ items: List[pytest.Item],
75
+ ) -> None:
76
+ """Filter tests by world_size (for multi-GPU tests)"""
77
+ world_size = int(os.environ.get('WORLD_SIZE', '1'))
78
+ print(f'world_size={world_size}')
79
+
80
+ conditions = [
81
+ lambda item: _get_world_size(item) == world_size,
82
+ ]
83
+
84
+ # keep items that satisfy all conditions
85
+ remaining = []
86
+ deselected = []
87
+ for item in items:
88
+ if all(condition(item) for condition in conditions):
89
+ remaining.append(item)
90
+ else:
91
+ deselected.append(item)
92
+
93
+ if deselected:
94
+ config.hook.pytest_deselected(items=deselected)
95
+ items[:] = remaining
96
+
97
+
98
+ def pytest_addoption(parser: pytest.Parser) -> None:
99
+ _add_option(
100
+ parser,
101
+ 'seed',
102
+ help="""\
103
+ Rank zero seed to use. `reproducibility.seed_all(seed + dist.get_global_rank())` will be invoked
104
+ before each test.""",
105
+ )
106
+
107
+
108
+ def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
109
+ if exitstatus == 5:
110
+ session.exitstatus = 0 # Ignore no-test-ran errors
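For context, a test opts into this world-size filtering by declaring the marker; a hypothetical example (only the values in WORLD_SIZE_OPTIONS are intended, and the test is collected only when the WORLD_SIZE environment variable matches):

import pytest

@pytest.mark.world_size(2)
@pytest.mark.gpu
def test_runs_on_two_ranks():
    # Deselected unless WORLD_SIZE=2 is set for the pytest invocation.
    assert True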
tests/fixtures/autouse.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import gc
5
+ import logging
6
+ import os
7
+
8
+ import composer
9
+ import pytest
10
+ import torch
11
+ from composer.devices import DeviceCPU, DeviceGPU
12
+ from composer.utils import dist, reproducibility
13
+
14
+
15
+ @pytest.fixture(autouse=True)
16
+ def clear_cuda_cache(request: pytest.FixtureRequest):
17
+ """Clear memory between GPU tests."""
18
+ marker = request.node.get_closest_marker('gpu')
19
+ if marker is not None and torch.cuda.is_available():
20
+ torch.cuda.empty_cache()
21
+ gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests
22
+
23
+
24
+ @pytest.fixture(autouse=True)
25
+ def reset_mlflow_tracking_dir():
26
+ """Reset MLFlow tracking dir so it doesn't persist across tests."""
27
+ try:
28
+ import mlflow
29
+ mlflow.set_tracking_uri(None) # type: ignore
30
+ except ModuleNotFoundError:
31
+ # MLFlow not installed
32
+ pass
33
+
34
+
35
+ @pytest.fixture(scope='session')
36
+ def cleanup_dist():
37
+ """Ensure all dist tests clean up resources properly."""
38
+ yield
39
+ # Avoid race condition where a test is still writing to a file on one rank
40
+ # while the file system is being torn down on another rank.
41
+ dist.barrier()
42
+
43
+
44
+ @pytest.fixture(autouse=True, scope='session')
45
+ def configure_dist(request: pytest.FixtureRequest):
46
+ # Configure dist globally when the world size is greater than 1,
47
+ # so individual tests that do not use the trainer
48
+ # do not need to worry about manually configuring dist.
49
+
50
+ if dist.get_world_size() == 1:
51
+ return
52
+
53
+ device = None
54
+
55
+ for item in request.session.items:
56
+ device = DeviceCPU() if item.get_closest_marker('gpu') is None else DeviceGPU()
57
+ break
58
+
59
+ assert device is not None
60
+
61
+ if not dist.is_initialized():
62
+ dist.initialize_dist(device, timeout=300.0)
63
+ # Hold PyTest until all ranks have reached this barrier. Ensure that no rank starts
64
+ # any test before other ranks are ready to start it, which could be a cause of random timeouts
65
+ # (e.g. rank 1 starts the next test while rank 0 is finishing up the previous test).
66
+ dist.barrier()
67
+
68
+
69
+ @pytest.fixture(autouse=True)
70
+ def set_log_levels():
71
+ """Ensures all log levels are set to DEBUG."""
72
+ logging.basicConfig()
73
+ logging.getLogger(composer.__name__).setLevel(logging.DEBUG)
74
+
75
+
76
+ @pytest.fixture(autouse=True)
77
+ def seed_all(rank_zero_seed: int, monkeypatch: pytest.MonkeyPatch):
78
+ """Monkeypatch reproducibility.
79
+
80
+ Make get_random_seed always return the rank zero seed, and set the random seed before each test to the rank-local
81
+ seed.
82
+ """
83
+ monkeypatch.setattr(
84
+ reproducibility,
85
+ 'get_random_seed',
86
+ lambda: rank_zero_seed,
87
+ )
88
+ reproducibility.seed_all(rank_zero_seed + dist.get_global_rank())
89
+
90
+
91
+ @pytest.fixture(autouse=True)
92
+ def remove_run_name_env_var():
93
+ # Remove environment variables for run names in unit tests
94
+ composer_run_name = os.environ.get('COMPOSER_RUN_NAME')
95
+ run_name = os.environ.get('RUN_NAME')
96
+
97
+ if 'COMPOSER_RUN_NAME' in os.environ:
98
+ del os.environ['COMPOSER_RUN_NAME']
99
+ if 'RUN_NAME' in os.environ:
100
+ del os.environ['RUN_NAME']
101
+
102
+ yield
103
+
104
+ if composer_run_name is not None:
105
+ os.environ['COMPOSER_RUN_NAME'] = composer_run_name
106
+ if run_name is not None:
107
+ os.environ['RUN_NAME'] = run_name
tests/fixtures/fixtures.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import pytest
5
+
6
+ from tests.conftest import _get_option
7
+
8
+
9
+ @pytest.fixture
10
+ def rank_zero_seed(pytestconfig: pytest.Config) -> int:
11
+ """Read the rank_zero_seed from the CLI option."""
12
+ seed = _get_option(pytestconfig, 'seed', default='0')
13
+ return int(seed)
tests/layer_test.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from collections import namedtuple
4
+
5
+
6
+ def test_megablocks_moe_mlp_import():
7
+ """Test if MegaBlocksMoeMLP can be imported."""
8
+ from megablocks.layers import MegaBlocksMoeMLP
9
+
10
+ assert MegaBlocksMoeMLP is not None, "MegaBlocksMoeMLP import failed."
11
+
12
+
13
+ def test_megablocks_moe_mlp_functionality():
14
+ """Test the functionality of MegaBlocksMoeMLP."""
15
+ from megablocks.layers import MegaBlocksMoeMLP
16
+
17
+ # Create a simple instance of MegaBlocksMoeMLP
18
+ model = MegaBlocksMoeMLP()
19
+
20
+ # add experts attribute to the model
21
+ model.experts = namedtuple(
22
+ "Experts",
23
+ [
24
+ "gate_up_proj",
25
+ "gate_down_proj",
26
+ "down_proj",
27
+ "hidden_size",
28
+ ],
29
+ )
30
+
31
+ num_experts = 128
32
+ hidden_size = 1152
33
+ intermediate_size = 3072
34
+
35
+ # Shorter names for reading convenience
36
+ ne, hs, isz = num_experts, hidden_size, intermediate_size
37
+
38
+ model.router = torch.nn.Linear(hs, ne).cuda()
39
+ model.router.weight.data.fill_(1)
40
+
41
+ e = model.experts
42
+ e.gate_up_proj = torch.nn.Parameter(torch.ones(ne, hs, isz, device="cuda"))
43
+ e.gate_up_proj_bias = torch.nn.Parameter(torch.zeros(ne, isz, device="cuda"))
44
+ e.down_proj = torch.nn.Parameter(torch.ones(ne, 1536, hs, device="cuda"))
45
+ e.down_proj_bias = torch.nn.Parameter(torch.zeros(ne, hs, device="cuda"))
46
+ e.hidden_size = hs
47
+
48
+ # Create dummy input data
49
+ x = torch.randn(1, 1, 1152).to(torch.device("cuda"))
50
+ output, expert_weights_out = model(x)
51
+
52
+ # print("Output shape:", output.shape)
53
+ assert output.shape == (1, 1, 1152), "Output shape mismatch."
tests/layers/architectures.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+ from megablocks._layers.arguments import Arguments
8
+
9
+
10
+ class FFN(torch.nn.Module):
11
+
12
+ def __init__(self, args: Arguments):
13
+ super().__init__()
14
+ self.w1 = torch.nn.Parameter(
15
+ torch.empty(
16
+ args.hidden_size,
17
+ args.ffn_hidden_size,
18
+ device=args.device,
19
+ dtype=torch.float16 if args.fp16 else torch.float32,
20
+ ),
21
+ )
22
+ self.w2 = torch.nn.Parameter(
23
+ torch.empty(
24
+ args.ffn_hidden_size,
25
+ args.hidden_size,
26
+ device=args.device,
27
+ dtype=torch.float16 if args.fp16 else torch.float32,
28
+ ),
29
+ )
30
+
31
+ def forward(self, x):
32
+ return torch.matmul(
33
+ F.gelu(torch.matmul(x, self.w1), approximate='tanh'),
34
+ self.w2,
35
+ )
36
+
37
+
38
+ class GLU(FFN):
39
+
40
+ def __init__(self, args: Arguments):
41
+ super().__init__(args)
42
+ self.v1 = torch.nn.Parameter(
43
+ torch.empty(
44
+ args.hidden_size,
45
+ args.ffn_hidden_size,
46
+ device=args.device,
47
+ dtype=torch.float16 if args.fp16 else torch.float32,
48
+ ),
49
+ )
50
+
51
+ def forward(self, x):
52
+ x1 = F.gelu(torch.matmul(x, self.w1), approximate='tanh') * torch.matmul(x, self.v1)
53
+ return torch.matmul(x1, self.w2)
tests/layers/moe_test.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from functools import partial
5
+
6
+ import pytest
7
+ import torch
8
+
9
+ from megablocks._layers.arguments import Arguments
10
+ from megablocks._layers.moe import MoE, batched_load_balancing_loss, clear_load_balancing_loss
11
+ from megablocks._layers.router import batched_router_zloss, clear_router_zloss
12
+ from tests.layers.architectures import FFN
13
+
14
+ _FORWARD_TESTS = (
15
+ (16, 1024, 512, 1, 1),
16
+ (16, 1024, 512, 2, 1),
17
+ (16, 1024, 512, 4, 1),
18
+ (16, 1024, 512, 8, 1),
19
+ (8, 2048, 512, 1, 1),
20
+ (8, 2048, 512, 2, 1),
21
+ (8, 2048, 512, 4, 1),
22
+ (16, 1024, 512, 2, 2),
23
+ (16, 1024, 512, 4, 2),
24
+ (16, 1024, 512, 4, 4),
25
+ (16, 1024, 512, 8, 2),
26
+ (16, 1024, 512, 8, 4),
27
+ (16, 1024, 512, 8, 8),
28
+ )
29
+
30
+ _DENSE_TESTS = (
31
+ (16, 1024, 512),
32
+ (8, 2048, 512),
33
+ )
34
+
35
+
36
+ def construct_moe(
37
+ hidden_size: int,
38
+ ffn_hidden_size: int,
39
+ moe_num_experts: int = 1,
40
+ moe_capacity_factor: int = 1,
41
+ moe_top_k: int = 1,
42
+ moe_zloss_weight: float = 0,
43
+ ):
44
+ # All tests are skipped if triton >=3.2.0 is installed since sparse is not supported
45
+ # TODO: Remove this once sparse is supported with triton >=3.2.0
46
+ try:
47
+ import triton
48
+ if triton.__version__ >= '3.2.0':
49
+ pytest.skip('Sparse MLP is not supported with triton >=3.2.0')
50
+ except ImportError:
51
+ pass
52
+
53
+ init_method = partial(torch.nn.init.normal_, mean=0.0, std=0.1)
54
+ args = Arguments(
55
+ hidden_size=hidden_size,
56
+ ffn_hidden_size=ffn_hidden_size,
57
+ moe_num_experts=moe_num_experts,
58
+ moe_capacity_factor=moe_capacity_factor,
59
+ moe_top_k=moe_top_k,
60
+ init_method=init_method,
61
+ moe_zloss_weight=moe_zloss_weight,
62
+ )
63
+
64
+ mlp = FFN(args)
65
+ moe_mlp = MoE(args)
66
+
67
+ mlp.cuda(torch.cuda.current_device()).half()
68
+ moe_mlp.cuda(torch.cuda.current_device()).half()
69
+
70
+ # Set the baseline parameters to match exactly.
71
+ if moe_num_experts == 1:
72
+ with torch.no_grad():
73
+ mlp.w1.copy_(moe_mlp.experts.mlp.w1.squeeze())
74
+ mlp.w2.copy_(moe_mlp.experts.mlp.w2.squeeze())
75
+ return args, mlp, moe_mlp
76
+
77
+
78
+ @pytest.mark.gpu
79
+ @pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k'), _FORWARD_TESTS)
80
+ def test_moe_forward(bs: int, sl: int, hs: int, num_experts: int, top_k: int):
81
+ x = torch.randn(sl, bs, hs).half().cuda()
82
+
83
+ _, _, layer = construct_moe(
84
+ hidden_size=hs,
85
+ ffn_hidden_size=hs * 2,
86
+ moe_num_experts=num_experts,
87
+ moe_top_k=top_k,
88
+ )
89
+
90
+ out, _ = layer(x)
91
+ assert out.shape == x.shape
92
+ clear_load_balancing_loss()
93
+
94
+
95
+ @pytest.mark.gpu
96
+ @pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k'), _FORWARD_TESTS)
97
+ def test_moe_forward_backward(
98
+ bs: int,
99
+ sl: int,
100
+ hs: int,
101
+ num_experts: int,
102
+ top_k: int,
103
+ ):
104
+ x = torch.randn(sl, bs, hs).half().cuda()
105
+ x.requires_grad_(True)
106
+
107
+ args, _, layer = construct_moe(
108
+ hidden_size=hs,
109
+ ffn_hidden_size=hs * 2,
110
+ moe_num_experts=num_experts,
111
+ moe_top_k=top_k,
112
+ )
113
+
114
+ out, _ = layer(x)
115
+ assert out.shape == x.shape
116
+
117
+ loss = out.sum() + batched_load_balancing_loss(args)
118
+ loss.backward()
119
+ layer.zero_grad(set_to_none=True)
120
+ x.grad = None
121
+ clear_load_balancing_loss()
122
+
123
+
124
+ @pytest.mark.gpu
125
+ @pytest.mark.parametrize(('bs', 'sl', 'hs', 'num_experts', 'top_k'), _FORWARD_TESTS)
126
+ def test_moe_forward_backward_with_zloss(
127
+ bs: int,
128
+ sl: int,
129
+ hs: int,
130
+ num_experts: int,
131
+ top_k: int,
132
+ ):
133
+ x = torch.randn(sl, bs, hs).half().cuda()
134
+ x.requires_grad_(True)
135
+
136
+ args, _, layer = construct_moe(
137
+ hidden_size=hs,
138
+ ffn_hidden_size=hs * 2,
139
+ moe_num_experts=num_experts,
140
+ moe_top_k=top_k,
141
+ moe_zloss_weight=1e-3,
142
+ )
143
+
144
+ out, _ = layer(x)
145
+ assert out.shape == x.shape
146
+
147
+ loss = out.sum() + batched_load_balancing_loss(args)
148
+ loss.backward()
149
+ layer.zero_grad(set_to_none=True)
150
+ x.grad = None
151
+ clear_load_balancing_loss()
152
+ clear_router_zloss()
153
+
154
+
155
+ @pytest.mark.gpu
156
+ @pytest.mark.parametrize(('bs', 'sl', 'hs'), _DENSE_TESTS)
157
+ def test_moe_forward_vs_dense(bs: int, sl: int, hs: int):
158
+ x = torch.randn(sl, bs, hs).half().cuda()
159
+
160
+ _, mlp, moe_mlp = construct_moe(hidden_size=hs, ffn_hidden_size=hs * 2)
161
+
162
+ expected_out = mlp(x)
163
+ out, _ = moe_mlp(x)
164
+ assert out.shape == x.shape == expected_out.shape
165
+ assert torch.allclose(out, expected_out)
166
+ clear_load_balancing_loss()
167
+
168
+
169
+ @pytest.mark.gpu
170
+ @pytest.mark.parametrize(('bs', 'sl', 'hs'), _DENSE_TESTS)
171
+ def test_moe_forward_backward_vs_dense(bs: int, sl: int, hs: int):
172
+ x = torch.randn(sl, bs, hs).half().cuda()
173
+ x.requires_grad_(True)
174
+
175
+ _, mlp, moe_mlp = construct_moe(hidden_size=hs, ffn_hidden_size=hs * 2)
176
+
177
+ out, _ = moe_mlp(x)
178
+ loss = out.sum()
179
+ loss.backward()
180
+ w1_grad = moe_mlp.experts.mlp.w1.grad.detach().squeeze()
181
+ w2_grad = moe_mlp.experts.mlp.w2.grad.detach().squeeze()
182
+ moe_mlp.zero_grad(set_to_none=True)
183
+ x.grad = None
184
+ clear_load_balancing_loss()
185
+
186
+ expected_out = mlp(x)
187
+ expected_loss = expected_out.sum()
188
+ expected_loss.backward()
189
+ expected_w1_grad = mlp.w1.grad.detach()
190
+ expected_w2_grad = mlp.w2.grad.detach()
191
+ mlp.zero_grad(set_to_none=True)
192
+ x.grad = None
193
+
194
+ # Verify the gradients match.
195
+ assert w1_grad.shape == expected_w1_grad.shape
196
+ assert w2_grad.shape == expected_w2_grad.shape
197
+ assert torch.allclose(w1_grad, expected_w1_grad)
198
+ assert torch.allclose(w2_grad, expected_w2_grad)
199
+ clear_load_balancing_loss()
tests/ops/binned_gather_test.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import pytest
6
+ import torch
7
+
8
+ from megablocks import ops
9
+
10
+ BINNED_GATHER_TESTS = (
11
+ (4, 2, 2, 1),
12
+ (4, 2, 2, 2),
13
+ (4, 2, 2, 4),
14
+ (1024, 1536, 4, 1),
15
+ (1024, 1536, 4, 2),
16
+ (1024, 1536, 4, 4),
17
+ (1024, 1536, 64, 1),
18
+ (1024, 1536, 64, 2),
19
+ (1024, 1536, 64, 4),
20
+ (1024, 1536, 128, 1),
21
+ (1024, 1536, 128, 2),
22
+ (1024, 1536, 128, 4),
23
+ (16384, 768, 4, 1),
24
+ (16384, 768, 4, 2),
25
+ (16384, 768, 4, 4),
26
+ (16384, 768, 64, 1),
27
+ (16384, 768, 64, 2),
28
+ (16384, 768, 64, 4),
29
+ (16384, 768, 128, 1),
30
+ (16384, 768, 128, 2),
31
+ (16384, 768, 128, 4),
32
+ )
33
+
34
+
35
+ @pytest.mark.gpu
36
+ @pytest.mark.parametrize(('sl', 'hs', 'ne', 'top_k'), BINNED_GATHER_TESTS)
37
+ def test_binned_gather(sl: int, hs: int, ne: int, top_k: int):
38
+ # NOTE: Capacity factor == 1.
39
+ ec = (sl * top_k) // ne
40
+
41
+ # Create the data and indices.
42
+ x = torch.randn((sl, hs)).cuda().half()
43
+
44
+ # Randomly assign tokens to experts.
45
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
46
+ _, indices = ops.sort(top_expert)
47
+ bins = ops.inclusive_cumsum(ops.histogram(top_expert, ne), 0)
48
+
49
+ def binned_gather(
50
+ x: torch.Tensor,
51
+ indices: torch.Tensor,
52
+ bins: torch.Tensor,
53
+ ec: int,
54
+ top_k: int,
55
+ ):
56
+ x = x.cpu().numpy()
57
+ indices = indices.cpu().numpy()
58
+ bins = bins.cpu().numpy()
59
+ start = 0
60
+ out = np.zeros((ne, ec, hs))
61
+ for i in range(ne):
62
+ end = bins[i]
63
+ for j in range(min(ec, end - start)):
64
+ index = indices[start + j] // top_k
65
+ out[i, j, :] = x[index, :]
66
+ start = end
67
+ return torch.from_numpy(out).cuda().half()
68
+
69
+ out = ops.binned_gather(x, indices, bins, ec, top_k)
70
+ expected_out = binned_gather(x, indices, bins, ec, top_k)
71
+ assert torch.all(torch.eq(out, expected_out))
tests/ops/binned_scatter_test.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import pytest
6
+ import torch
7
+
8
+ from megablocks import ops
9
+
10
+ _BINNED_SCATTER_TESTS = (
11
+ (4, 2, 2, 1),
12
+ (4, 2, 2, 2),
13
+ (4, 2, 2, 4),
14
+ (1024, 1536, 4, 1),
15
+ (1024, 1536, 4, 2),
16
+ (1024, 1536, 4, 4),
17
+ (1024, 1536, 64, 1),
18
+ (1024, 1536, 64, 2),
19
+ (1024, 1536, 64, 4),
20
+ (1024, 1536, 128, 1),
21
+ (1024, 1536, 128, 2),
22
+ (1024, 1536, 128, 4),
23
+ (16384, 768, 4, 1),
24
+ (16384, 768, 4, 2),
25
+ (16384, 768, 4, 4),
26
+ (16384, 768, 64, 1),
27
+ (16384, 768, 64, 2),
28
+ (16384, 768, 64, 4),
29
+ (16384, 768, 128, 1),
30
+ (16384, 768, 128, 2),
31
+ (16384, 768, 128, 4),
32
+ )
33
+
34
+
35
+ @pytest.mark.gpu
36
+ @pytest.mark.parametrize(('sl', 'hs', 'ne', 'top_k'), _BINNED_SCATTER_TESTS)
37
+ def testBinnedScatter(sl: int, hs: int, ne: int, top_k: int):
38
+ # NOTE: Capacity factor == 1.
39
+ ec = (sl * top_k) // ne
40
+
41
+ # Create the data and indices.
42
+ x = torch.randn((sl, hs)).cuda().half()
43
+
44
+ # Randomly assign tokens to experts.
45
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
46
+ _, indices = ops.sort(top_expert)
47
+ bins = ops.inclusive_cumsum(ops.histogram(top_expert, ne), 0)
48
+
49
+ # Sample weights for the scatter reduce.
50
+ weights = torch.rand((sl * top_k,)).cuda().half()
51
+
52
+ x = ops.binned_gather(x, indices, bins, ec, top_k)
53
+
54
+ def binned_scatter(
55
+ x: torch.Tensor,
56
+ indices: torch.Tensor,
57
+ weights: torch.Tensor,
58
+ bins: torch.Tensor,
59
+ top_k: int,
60
+ ):
61
+ x = x.cpu().numpy()
62
+ indices = indices.cpu().numpy()
63
+ weights = weights.cpu().numpy()
64
+ bins = bins.cpu().numpy()
65
+ start = 0
66
+ out = np.zeros((sl, hs))
67
+ for i in range(ne):
68
+ end = bins[i]
69
+ for j in range(min(ec, end - start)):
70
+ index = indices[start + j]
71
+ scale = weights[index]
72
+ index //= top_k
73
+
74
+ out[index, :] += scale * x[i, j, :]
75
+ start = end
76
+ return torch.from_numpy(out).cuda().half()
77
+
78
+ out = ops.binned_scatter(x, indices, weights, bins, top_k)
79
+ expected_out = binned_scatter(x, indices, weights, bins, top_k)
80
+
81
+ # NOTE: We need to check approximate equality because the
82
+ # scatter reduce uses atomics.
83
+ assert np.testing.assert_allclose(
84
+ out.cpu(),
85
+ expected_out.cpu(),
86
+ rtol=5e-3,
87
+ ) is None
tests/ops/cumsum_test.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import pytest
5
+ import torch
6
+
7
+ from megablocks import ops
8
+
9
+ CUMSUM_TESTS = (
10
+ (1, 32),
11
+ (2, 32),
12
+ (2, 1024),
13
+ (4, 1024),
14
+ (8, 1024),
15
+ (16, 1024),
16
+ (32, 1024),
17
+ (64, 1024),
18
+ (128, 1024),
19
+ (2, 16384),
20
+ (4, 16384),
21
+ (8, 16384),
22
+ (16, 16384),
23
+ (32, 16384),
24
+ (64, 16384),
25
+ (128, 16384),
26
+ )
27
+
28
+
29
+ @pytest.mark.gpu
30
+ @pytest.mark.parametrize(('n', 'm'), CUMSUM_TESTS)
31
+ def test_exclusive_cumsum(n: int, m: int):
32
+ x = torch.randint(0, 2, (n, m)).long().cuda()
33
+ out = ops.exclusive_cumsum(x, 1) * x
34
+ expected_out = (torch.cumsum(x, dim=1) - 1) * x
35
+ assert torch.all(torch.eq(out, expected_out))
36
+
37
+
38
+ @pytest.mark.gpu
39
+ @pytest.mark.parametrize(('n', 'm'), CUMSUM_TESTS)
40
+ def test_inclusive_cumsum(n: int, m: int):
41
+ x = torch.randint(0, 2, (n, m)).long().cuda()
42
+ out = ops.inclusive_cumsum(x, 1)
43
+ expected_out = torch.cumsum(x, dim=1)
44
+ assert torch.all(torch.eq(out, expected_out))
tests/ops/histogram_test.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import pytest
5
+ import torch
6
+
7
+ from megablocks import ops
8
+
9
+ _HISTOGRAM_TESTS = (
10
+ (1, 32, torch.int16, 128),
11
+ (1, 1024, torch.int16, 128),
12
+ (1, 16384, torch.int16, 128),
13
+ (1, 32, torch.int32, 128),
14
+ (1, 1024, torch.int32, 128),
15
+ (1, 16384, torch.int32, 128),
16
+ (1, 32, torch.int64, 128),
17
+ (1, 1024, torch.int64, 128),
18
+ (1, 16384, torch.int64, 128),
19
+ (1, 32, torch.int16, 1024),
20
+ (1, 1024, torch.int16, 1024),
21
+ (1, 16384, torch.int16, 1024),
22
+ (1, 32, torch.int32, 1024),
23
+ (1, 1024, torch.int32, 1024),
24
+ (1, 16384, torch.int32, 1024),
25
+ (1, 32, torch.int64, 1024),
26
+ (1, 1024, torch.int64, 1024),
27
+ (1, 16384, torch.int64, 1024),
28
+ (2, 32, torch.int16, 128),
29
+ (2, 1024, torch.int16, 128),
30
+ (2, 16384, torch.int16, 128),
31
+ (2, 32, torch.int32, 128),
32
+ (2, 1024, torch.int32, 128),
33
+ (2, 16384, torch.int32, 128),
34
+ (2, 32, torch.int64, 128),
35
+ (2, 1024, torch.int64, 128),
36
+ (2, 16384, torch.int64, 128),
37
+ (2, 32, torch.int16, 1024),
38
+ (2, 1024, torch.int16, 1024),
39
+ (2, 16384, torch.int16, 1024),
40
+ (2, 32, torch.int32, 1024),
41
+ (2, 1024, torch.int32, 1024),
42
+ (2, 16384, torch.int32, 1024),
43
+ (2, 32, torch.int64, 1024),
44
+ (2, 1024, torch.int64, 1024),
45
+ (2, 16384, torch.int64, 1024),
46
+ (8, 32, torch.int16, 128),
47
+ (8, 1024, torch.int16, 128),
48
+ (8, 16384, torch.int16, 128),
49
+ (8, 32, torch.int32, 128),
50
+ (8, 1024, torch.int32, 128),
51
+ (8, 16384, torch.int32, 128),
52
+ (8, 32, torch.int64, 128),
53
+ (8, 1024, torch.int64, 128),
54
+ (8, 16384, torch.int64, 128),
55
+ (8, 32, torch.int16, 1024),
56
+ (8, 1024, torch.int16, 1024),
57
+ (8, 16384, torch.int16, 1024),
58
+ (8, 32, torch.int32, 1024),
59
+ (8, 1024, torch.int32, 1024),
60
+ (8, 16384, torch.int32, 1024),
61
+ (8, 32, torch.int64, 1024),
62
+ (8, 1024, torch.int64, 1024),
63
+ (8, 16384, torch.int64, 1024),
64
+ )
65
+
66
+
67
+ # Override the seed_all fixture in autouse.py because
68
+ # _histc_cuda does not have a deterministic implementation
69
+ @pytest.fixture()
70
+ def seed_all():
71
+ torch.use_deterministic_algorithms(False)
72
+ return
73
+
74
+
75
+ @pytest.mark.gpu
76
+ @pytest.mark.parametrize(('m', 'n', 'dtype', 'max_val'), _HISTOGRAM_TESTS)
77
+ def test_histogram(m: int, n: int, dtype: torch.dtype, max_val: int):
78
+ x = torch.randint(0, max_val, (m, n)).cuda().to(dtype)
79
+
80
+ out = ops.histogram(x, max_val)
81
+ expected_out = torch.stack([torch.histc(y, max_val, 0, max_val - 1) for y in torch.split(x, 1)])
82
+ assert torch.all(torch.eq(out, expected_out))
tests/ops/padded_gather_test.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import pytest
6
+ import torch
7
+
8
+ from megablocks import ops
9
+
10
+ PADDED_GATHER_TESTS = (
11
+ (4, 2, 2, 1),
12
+ (4, 2, 2, 2),
13
+ (1024, 1, 4, 1),
14
+ (1024, 1, 4, 2),
15
+ (1024, 1, 4, 4),
16
+ (1024, 1, 64, 1),
17
+ (1024, 1, 64, 2),
18
+ (1024, 1, 64, 4),
19
+ (1024, 1, 128, 1),
20
+ (1024, 1, 128, 2),
21
+ (1024, 1, 128, 4),
22
+ (1024, 1536, 4, 1),
23
+ (1024, 1536, 4, 2),
24
+ (1024, 1536, 4, 4),
25
+ (1024, 1536, 64, 1),
26
+ (1024, 1536, 64, 2),
27
+ (1024, 1536, 64, 4),
28
+ (1024, 1536, 128, 1),
29
+ (1024, 1536, 128, 2),
30
+ (1024, 1536, 128, 4),
31
+ (16384, 768, 4, 1),
32
+ (16384, 768, 4, 2),
33
+ (16384, 768, 4, 4),
34
+ (16384, 768, 64, 1),
35
+ (16384, 768, 64, 2),
36
+ (16384, 768, 64, 4),
37
+ (16384, 768, 128, 1),
38
+ (16384, 768, 128, 2),
39
+ (16384, 768, 128, 4),
40
+ (16384, 1, 4, 1),
41
+ (16384, 1, 4, 2),
42
+ (16384, 1, 4, 4),
43
+ (16384, 1, 64, 1),
44
+ (16384, 1, 64, 2),
45
+ (16384, 1, 64, 4),
46
+ (16384, 1, 128, 1),
47
+ (16384, 1, 128, 2),
48
+ (16384, 1, 128, 4),
49
+ )
50
+
51
+
52
+ @pytest.mark.gpu
53
+ @pytest.mark.parametrize(('sl', 'hs', 'ne', 'top_k'), PADDED_GATHER_TESTS)
54
+ def testPaddedGather(sl: int, hs: int, ne: int, top_k: int):
55
+ # Create the data and indices.
56
+ x = torch.randn((sl, hs)).cuda().half()
57
+
58
+ # Randomly assign tokens to experts.
59
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
60
+ bin_ids, indices = ops.sort(top_expert)
61
+ tokens_per_expert = ops.histogram(top_expert, ne)
62
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
63
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
64
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
65
+
66
+ def padded_gather(
67
+ x: torch.Tensor,
68
+ indices: torch.Tensor,
69
+ bin_ids: torch.Tensor,
70
+ bins: torch.Tensor,
71
+ padded_bins: torch.Tensor,
72
+ top_k: int,
73
+ ):
74
+ x = x.cpu().numpy()
75
+ indices = indices.cpu().numpy()
76
+ bin_ids = bin_ids.cpu().numpy()
77
+ bins = bins.cpu().numpy()
78
+ padded_bins = padded_bins.cpu().numpy()
79
+
80
+ out = np.zeros((padded_bins[-1], hs))
81
+ in_idx = 0
82
+ for i, end in enumerate(bins):
83
+ out_idx = 0 if i == 0 else padded_bins[i - 1]
84
+ end = bins[i]
85
+ while in_idx < end:
86
+ load_idx = indices[in_idx] // top_k
87
+ out[out_idx, :] = x[load_idx, :]
88
+ in_idx += 1
89
+ out_idx += 1
90
+ return torch.from_numpy(out).cuda().half()
91
+
92
+ out = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
93
+ expected_out = padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
94
+ assert torch.all(torch.eq(out, expected_out))
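
Note: the bin bookkeeping above (histogram, round_up, and the two inclusive cumsums) can be traced by hand. The following is a minimal CPU-only sketch that mirrors the reference logic in the test with plain torch, without calling the megablocks CUDA kernels; the toy sizes and blocking factor are illustrative only.

    import torch

    tokens_per_expert = torch.tensor([3, 1, 6])    # histogram of expert assignments
    blocking = 4                                   # pad each expert's bin to a multiple of 4
    padded = (tokens_per_expert + blocking - 1) // blocking * blocking  # round_up -> [4, 4, 8]
    bins = torch.cumsum(tokens_per_expert, 0)      # inclusive cumsum -> [3, 4, 10]
    padded_bins = torch.cumsum(padded, 0)          # inclusive cumsum -> [4, 8, 16]
    # padded_gather writes expert i's tokens starting at padded_bins[i - 1] (0 for i == 0),
    # so the gathered output has padded_bins[-1] == 16 rows, 6 of which are zero padding.
    assert bins.tolist() == [3, 4, 10] and padded_bins.tolist() == [4, 8, 16]
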
tests/ops/padded_scatter_test.py ADDED
@@ -0,0 +1,155 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import pytest
6
+ import torch
7
+
8
+ from megablocks import ops
9
+
10
+ PADDED_SCATTER_TESTS = [
11
+ (4, 2, 2, 2),
12
+ (4, 2, 2, 1),
13
+ (4, 2, 2, 1),
14
+ (4, 2, 2, 1),
15
+ (4, 2, 2, 2),
16
+ (4, 2, 2, 2),
17
+ (1024, 1, 4, 1),
18
+ (1024, 1, 4, 2),
19
+ (1024, 1, 4, 4),
20
+ (1024, 1, 4, 1),
21
+ (1024, 1, 4, 2),
22
+ (1024, 1, 4, 4),
23
+ (1024, 1, 4, 1),
24
+ (1024, 1, 4, 2),
25
+ (1024, 1, 4, 4),
26
+ (1024, 1, 64, 1),
27
+ (1024, 1, 64, 2),
28
+ (1024, 1, 64, 4),
29
+ (1024, 1, 128, 1),
30
+ (1024, 1, 128, 2),
31
+ (1024, 1, 128, 4),
32
+ (1024, 1536, 4, 1),
33
+ (1024, 1536, 4, 2),
34
+ (1024, 1536, 4, 4),
35
+ (1024, 1536, 4, 4),
36
+ (1024, 1536, 4, 4),
37
+ (1024, 1536, 64, 1),
38
+ (1024, 1536, 64, 2),
39
+ (1024, 1536, 64, 4),
40
+ (1024, 1536, 128, 1),
41
+ (1024, 1536, 128, 2),
42
+ (1024, 1536, 128, 4),
43
+ (1024, 1536, 128, 1),
44
+ (1024, 1536, 128, 1),
45
+ (16384, 768, 4, 1),
46
+ (16384, 768, 4, 2),
47
+ (16384, 768, 4, 4),
48
+ (16384, 768, 64, 1),
49
+ (16384, 768, 64, 2),
50
+ (16384, 768, 64, 4),
51
+ (16384, 768, 128, 1),
52
+ (16384, 768, 128, 2),
53
+ (16384, 768, 128, 4),
54
+ (16384, 1, 4, 1),
55
+ (16384, 1, 4, 2),
56
+ (16384, 1, 4, 4),
57
+ (16384, 1, 64, 1),
58
+ (16384, 1, 64, 2),
59
+ (16384, 1, 64, 4),
60
+ (16384, 1, 128, 1),
61
+ (16384, 1, 128, 2),
62
+ (16384, 1, 128, 4),
63
+ (16384, 1, 128, 2),
64
+ (16384, 1, 128, 2),
65
+ ]
66
+
67
+
68
+ def _to_numpy(x: torch.Tensor) -> np.ndarray:
69
+ return x.detach().cpu().numpy()
70
+
71
+
72
+ @pytest.mark.gpu
73
+ @pytest.mark.parametrize((
74
+ 'sl',
75
+ 'hs',
76
+ 'ne',
77
+ 'top_k',
78
+ ), PADDED_SCATTER_TESTS)
79
+ def testPaddedScatter(sl: int, hs: int, ne: int, top_k: int):
80
+ # Create the data and indices.
81
+ x = torch.randn((sl, hs), requires_grad=True).cuda().half()
82
+
83
+ # Randomly assign tokens to experts.
84
+ top_expert = torch.randint(0, ne, (sl * top_k,)).cuda().int()
85
+ bin_ids, indices = ops.sort(top_expert)
86
+ tokens_per_expert = ops.histogram(top_expert, ne)
87
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, 128)
88
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
89
+ bins = ops.inclusive_cumsum(tokens_per_expert, 0)
90
+
91
+ # Sample weights for the scatter reduce.
92
+ weights = torch.rand((sl * top_k,), requires_grad=True).cuda().half()
93
+
94
+ # Gather the data to prepare for backwards.
95
+ x = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, top_k)
96
+
97
+ def padded_scatter(
98
+ x: torch.Tensor,
99
+ indices: torch.Tensor,
100
+ bin_ids: torch.Tensor,
101
+ weights: torch.Tensor,
102
+ bins: torch.Tensor,
103
+ padded_bins: torch.Tensor,
104
+ top_k: int,
105
+ ):
106
+ x = x.detach().cpu().numpy()
107
+ indices: np.ndarray = _to_numpy(indices)
108
+ bin_ids: np.ndarray = _to_numpy(bin_ids)
109
+ weights: np.ndarray = _to_numpy(weights)
110
+ bins: np.ndarray = _to_numpy(bins)
111
+ padded_bins: np.ndarray = _to_numpy(padded_bins)
112
+
113
+ out = np.zeros((indices.shape[0] // top_k, hs))
114
+ out_idx = 0
115
+ for i in range(len(bins)):
116
+ in_idx = 0 if i == 0 else padded_bins[i - 1]
117
+ end = bins[i]
118
+ while out_idx < end:
119
+ store_idx = indices[out_idx]
120
+ scale = weights[store_idx]
121
+ store_idx //= top_k
122
+
123
+ out[store_idx, :] += scale * x[in_idx, :]
124
+ out_idx += 1
125
+ in_idx += 1
126
+ return torch.from_numpy(out).cuda().half()
127
+
128
+ out = ops.padded_scatter(
129
+ x,
130
+ indices,
131
+ bin_ids,
132
+ weights,
133
+ bins,
134
+ padded_bins,
135
+ top_k,
136
+ )
137
+ expected_out = padded_scatter(
138
+ x,
139
+ indices,
140
+ bin_ids,
141
+ weights,
142
+ bins,
143
+ padded_bins,
144
+ top_k,
145
+ )
146
+
147
+ out.backward(torch.randn_like(out)) # sanity check backward pass
148
+
149
+ # NOTE: We need to check approximate equality because the scatter reduce uses atomics.
150
+ # np.testing.assert_allclose returns `None` if no error and raises an AssertionError if an error exists
151
+ assert np.testing.assert_allclose(
152
+ _to_numpy(out),
153
+ _to_numpy(expected_out),
154
+ rtol=5e-3,
155
+ ) is None
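
Note: a property implied by the two reference implementations above is that, for top_k == 1 and unit weights, padded_scatter undoes padded_gather exactly (each output row is written once, so the atomic adds introduce no reduction error). A minimal sketch of that round trip, assuming a CUDA device and the compiled megablocks ops used elsewhere in these tests:

    import torch
    from megablocks import ops

    sl, hs, ne = 1024, 16, 4
    x = torch.randn(sl, hs).cuda().half()
    top_expert = torch.randint(0, ne, (sl,)).cuda().int()
    bin_ids, indices = ops.sort(top_expert)
    tokens_per_expert = ops.histogram(top_expert, ne)
    padded_bins = ops.inclusive_cumsum(ops.round_up(tokens_per_expert, 128), 0)
    bins = ops.inclusive_cumsum(tokens_per_expert, 0)

    weights = torch.ones(sl).cuda().half()
    gathered = ops.padded_gather(x, indices, bin_ids, bins, padded_bins, 1)
    restored = ops.padded_scatter(gathered, indices, bin_ids, weights, bins, padded_bins, 1)
    assert torch.all(torch.eq(restored, x))
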
tests/ops/replicate_test.py ADDED
@@ -0,0 +1,108 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import pytest
6
+ import torch
7
+
8
+ try:
9
+ from megablocks._ops import ops as backend # type: ignore
10
+ except ModuleNotFoundError as e:
11
+ raise ModuleNotFoundError("No module named 'megablocks_ops'.") from e
12
+
13
+ from megablocks import ops
14
+
15
+
16
+ def promote_scalar(x: torch.Tensor) -> torch.Tensor:
17
+ return x.view(1) if not len(x.size()) else x
18
+
19
+
20
+ REPLICATE_TESTS = [
21
+ (8, 1, 1),
22
+ (8, 2, 1),
23
+ (8, 4, 1),
24
+ (8, 8, 1),
25
+ (8, 2, 2),
26
+ (8, 4, 2),
27
+ (8, 8, 2),
28
+ (8, 2, 4),
29
+ (8, 4, 4),
30
+ (8, 8, 4),
31
+ (8, 2, 8),
32
+ (8, 4, 8),
33
+ (8, 8, 8),
34
+ (16384, 2, 1),
35
+ (16384, 4, 1),
36
+ (16384, 8, 1),
37
+ (16384, 16, 1),
38
+ (16384, 32, 1),
39
+ (16384, 64, 1),
40
+ (16384, 128, 1),
41
+ (16384, 2, 2),
42
+ (16384, 4, 2),
43
+ (16384, 8, 2),
44
+ (16384, 16, 2),
45
+ (16384, 32, 2),
46
+ (16384, 64, 2),
47
+ (16384, 128, 2),
48
+ (16384, 2, 4),
49
+ (16384, 4, 4),
50
+ (16384, 8, 4),
51
+ (16384, 16, 4),
52
+ (16384, 32, 4),
53
+ (16384, 64, 4),
54
+ (16384, 128, 4),
55
+ (16384, 2, 8),
56
+ (16384, 4, 8),
57
+ (16384, 8, 8),
58
+ (16384, 16, 8),
59
+ (16384, 32, 8),
60
+ (16384, 64, 8),
61
+ (16384, 128, 8),
62
+ ]
63
+
64
+
65
+ @pytest.mark.gpu
66
+ @pytest.mark.parametrize(("tokens", "num_centers", "top_k"), REPLICATE_TESTS)
67
+ def test_replicate(tokens: int, num_centers: int, top_k: int):
68
+ tokens_to_centers = torch.randint(0, num_centers, (tokens,)).cuda().int()
69
+ tokens_per_center = ops.histogram(tokens_to_centers, num_centers)
70
+ bins = ops.inclusive_cumsum(tokens_per_center, 0)
71
+ bins = promote_scalar(bins)
72
+ center_weights = torch.randn(top_k, num_centers).cuda().half()
73
+
74
+ def replicate(x: torch.Tensor, bins: torch.Tensor, num_outputs: int):
75
+ x = x.cpu().numpy()
76
+ bins = bins.cpu().numpy()
77
+ out = np.zeros((x.shape[0], num_outputs))
78
+ for batch_idx in range(x.shape[0]):
79
+ start = 0
80
+ for i, end in enumerate(bins):
81
+ value = x[batch_idx, i]
82
+ while start < end:
83
+ out[batch_idx, start] = value
84
+ start += 1
85
+ return torch.from_numpy(out).cuda().half()
86
+
87
+ out = ops.replicate(center_weights, bins, tokens)
88
+ expected_out = replicate(center_weights, bins, tokens)
89
+ assert torch.all(torch.eq(out, expected_out))
90
+
91
+
92
+ @pytest.mark.gpu
93
+ @pytest.mark.parametrize(("tokens", "num_centers", "top_k"), REPLICATE_TESTS)
94
+ def test_replicate_backward(tokens: int, num_centers: int, top_k: int):
95
+ tokens_to_centers = torch.randint(0, num_centers, (tokens,)).cuda().int()
96
+ tokens_per_center = ops.histogram(tokens_to_centers, num_centers)
97
+ bins = ops.inclusive_cumsum(tokens_per_center, 0)
98
+ bins = promote_scalar(bins)
99
+ center_weights = torch.randn(top_k, num_centers).cuda().half()
100
+
101
+ grad = ops.replicate(center_weights, bins, tokens)
102
+
103
+ out = torch.empty_like(center_weights)
104
+ backend.replicate_backward(grad, bins, out)
105
+ expected_out = center_weights * tokens_per_center.view([1, num_centers])
106
+
107
+ # NOTE: This floating-point reduction could be a problem for training stability and accuracy.
108
+ assert torch.allclose(out, expected_out, rtol=1e-2)
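
Note: the backward assertion above relies on replicate_backward reducing the incoming gradient over each bin, so replicating and then reducing scales each center's weight by its bin width (tokens_per_center). A small CPU-only illustration of that identity, using plain torch rather than the kernel:

    import torch

    center_weights = torch.tensor([[2.0, -1.0]])       # (top_k=1, num_centers=2)
    tokens_per_center = torch.tensor([3, 5])
    bins = torch.cumsum(tokens_per_center, 0)           # [3, 8]

    # Forward: column i of center_weights is copied across bin i.
    grad = torch.cat([
        center_weights[:, 0].repeat(1, 3),
        center_weights[:, 1].repeat(1, 5),
    ], dim=1)                                            # shape (1, 8)

    # Backward: sum the gradient over each bin.
    reduced = torch.stack([
        grad[:, :bins[0]].sum(dim=1),
        grad[:, bins[0]:bins[1]].sum(dim=1),
    ], dim=1)
    assert torch.equal(reduced, center_weights * tokens_per_center)  # [[6.0, -5.0]]
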
tests/ops/sort_test.py ADDED
@@ -0,0 +1,65 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Dict, Optional, Union
5
+
6
+ import numpy as np
7
+ import pytest
8
+ import torch
9
+
10
+ from megablocks import ops
11
+
12
+ SORT_TESTS = [
13
+ (32, torch.int16, None),
14
+ (1024, torch.int16, None),
15
+ (16384, torch.int16, None),
16
+ (32, torch.int32, None),
17
+ (1024, torch.int32, None),
18
+ (16384, torch.int32, None),
19
+ (32, torch.int64, None),
20
+ (1024, torch.int64, None),
21
+ (16384, torch.int64, None),
22
+ (32, torch.int16, 128),
23
+ (1024, torch.int16, 128),
24
+ (16384, torch.int16, 128),
25
+ (32, torch.int32, 128),
26
+ (1024, torch.int32, 128),
27
+ (16384, torch.int32, 128),
28
+ (32, torch.int64, 128),
29
+ (1024, torch.int64, 128),
30
+ (16384, torch.int64, 128),
31
+ ]
32
+
33
+
34
+ def torch_to_numpy_dtype(dtype: torch.dtype,) -> Union[np.int16, np.int32, np.int64]:
35
+ types: Dict[torch.dtype, Union[np.int16, np.int32, np.int64]] = {
36
+ torch.int16: np.int16,
37
+ torch.int32: np.int32,
38
+ torch.int64: np.int64,
39
+ }
40
+ return types[dtype]
41
+
42
+
43
+ @pytest.mark.gpu
44
+ @pytest.mark.parametrize(
45
+ ('n', 'dtype', 'max_val'),
46
+ SORT_TESTS,
47
+ )
48
+ def test_sort(n: int, dtype: torch.dtype, max_val: Optional[int]):
49
+ if max_val is None:
50
+ max_val = np.iinfo(torch_to_numpy_dtype(dtype)).max
51
+ end_bit = int(np.ceil(np.log2(max_val)))
52
+ x = torch.randint(0, max_val, (n,)).cuda().to(dtype)
53
+
54
+ out, indices = ops.sort(x, end_bit)
55
+ expected_out, expected_indices = torch.sort(x)
56
+ assert torch.all(torch.eq(out, expected_out))
57
+
58
+ # NOTE: The indices can be in different order depending
59
+ # on sort stability if multiple values in the array are
60
+ # equal.
61
+ data = torch.empty_like(x)
62
+ data.scatter_(0, indices.long(), out)
63
+ expected_data = torch.empty_like(x)
64
+ expected_data.scatter_(0, expected_indices, expected_out)
65
+ assert torch.all(torch.eq(data, expected_data))
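
Note: end_bit bounds how many low-order bits the radix sort inspects, which is why the test derives it from max_val; keys are drawn from [0, max_val), so ceil(log2(max_val)) bits are sufficient. Mirroring that computation:

    import numpy as np

    max_val = 128                              # keys drawn from [0, 128)
    end_bit = int(np.ceil(np.log2(max_val)))   # 7 bits cover values 0..127
    assert end_bit == 7
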
tests/ops/topology_test.py ADDED
@@ -0,0 +1,81 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import numpy as np
5
+ import pytest
6
+ import torch
7
+
8
+ from megablocks import ops
9
+
10
+ TOPOLOGY_TESTS = (
11
+ (1024, 1536, 2),
12
+ (1024, 1536, 4),
13
+ (1024, 1536, 8),
14
+ (1024, 1536, 16),
15
+ (1024, 1536, 32),
16
+ (1024, 1536, 64),
17
+ (1024, 1536, 128),
18
+ (1024, 1536, 256),
19
+ (1024, 1536, 512),
20
+ (16384, 768, 2),
21
+ (16384, 768, 4),
22
+ (16384, 768, 8),
23
+ (16384, 768, 16),
24
+ (16384, 768, 32),
25
+ (16384, 768, 64),
26
+ (16384, 768, 128),
27
+ (16384, 768, 256),
28
+ (16384, 768, 512),
29
+ (16384, 768, 1024),
30
+ (8, 14336, 8),
31
+ )
32
+
33
+
34
+ @pytest.mark.gpu
35
+ @pytest.mark.parametrize(('sl', 'hs', 'ne'), TOPOLOGY_TESTS)
36
+ def test_topology(sl: int, hs: int, ne: int):
37
+ # Create the data and indices.
38
+ blocking = 128
39
+ assert hs % blocking == 0
40
+
41
+ # Randomly assign tokens to experts.
42
+ top_expert = torch.randint(0, ne, (sl,)).cuda().int()
43
+ tokens_per_expert = ops.histogram(top_expert, ne)
44
+ padded_tokens_per_expert = ops.round_up(tokens_per_expert, blocking)
45
+ padded_bins = ops.inclusive_cumsum(padded_tokens_per_expert, 0)
46
+
47
+ # Dimensions for the output indices.
48
+ output_block_rows = int(padded_bins[-1]) // blocking
49
+ output_block_columns = hs // blocking
50
+
51
+ def topology(
52
+ padded_bins: torch.Tensor,
53
+ blocking: torch.Tensor,
54
+ rows: int,
55
+ columns: int,
56
+ ):
57
+ padded_bins = padded_bins.cpu().numpy()
58
+
59
+ out = np.zeros([rows * columns])
60
+ start = 0
61
+ for i in range(padded_bins.shape[0]):
62
+ end = padded_bins[i] // blocking
63
+ while start < end:
64
+ for j in range(columns):
65
+ out[start * columns + j] = j + i * columns
66
+ start += 1
67
+ return torch.from_numpy(out).cuda().short()
68
+
69
+ out = ops.topology(
70
+ padded_bins,
71
+ blocking,
72
+ output_block_rows,
73
+ output_block_columns,
74
+ )
75
+ expected_out = topology(
76
+ padded_bins,
77
+ blocking,
78
+ output_block_rows,
79
+ output_block_columns,
80
+ )
81
+ assert torch.all(torch.eq(out, expected_out))
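
Note: the reference above builds the column indices of the block-sparse topology, one run of block rows per expert; block rows owned by expert i store the column indices i*columns .. (i+1)*columns - 1. A small CPU-only walk-through with two experts and two block columns:

    import torch

    blocking = 128
    hs = 256                                   # 2 block columns
    padded_bins = torch.tensor([256, 512])     # 2 block rows per expert
    columns = hs // blocking

    out, start = [], 0
    for i, end in enumerate((padded_bins // blocking).tolist()):
        for _row in range(start, end):
            out.extend(j + i * columns for j in range(columns))
        start = end
    # Expert 0 owns block rows 0-1 (columns 0,1); expert 1 owns block rows 2-3 (columns 2,3).
    assert out == [0, 1, 0, 1, 2, 3, 2, 3]
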
tests/ops_test.py ADDED
@@ -0,0 +1,171 @@
1
+ import torch
2
+ import megablocks
3
+
4
+ import unittest
5
+ from absl.testing import parameterized
6
+
7
+ # import itertools
8
+ # import numpy as np
9
+
10
+
11
+ def allclose(x, y, pct=2.0):
12
+ mask = torch.isclose(x, y, rtol=1e-5)
13
+ pct_diff = (mask.numel() - mask.sum()) / mask.numel() * 100
14
+ if pct_diff > pct:
15
+ print(x[torch.logical_not(mask)], y[torch.logical_not(mask)])
16
+ print("{:.2f}% of values not close.".format(pct_diff))
17
+ return False
18
+ return True
19
+
20
+
21
+ def add_flags(x):
22
+ out = []
23
+ for y in x:
24
+ for trans_b in (False, True):
25
+ out.append(y + (trans_b, False))
26
+
27
+ # TODO: Revisit enabling batch_sizes_on_device
28
+ # for batch_sizes_on_device in (False, True):
29
+ # out.append(y + (trans_b, batch_sizes_on_device))
30
+ return out
31
+
32
+
33
+ _TEST_PROBLEMS = add_flags((
34
+ (1, 128, 128, 128),
35
+ (8, 128, 128, 128),
36
+ (16, 128, 128, 128),
37
+ (1, 128, 256, 512),
38
+ (8, 128, 256, 512),
39
+ (16, 128, 256, 512),
40
+ ))
41
+
42
+
43
+ def randn(bs, x, y):
44
+ out = (torch.rand(bs, x, y) - 0.5 * 2) / (y * x)
45
+ return out.cuda().to(torch.bfloat16)
46
+
47
+
48
+ def gmm(a, b, batch_sizes, trans_b=False):
49
+ batch_sizes = batch_sizes.cpu().numpy()
50
+
51
+ out = []
52
+ start = 0
53
+ for i, size in enumerate(batch_sizes):
54
+ rhs = b[i, :, :].t() if trans_b else b[i, :, :]
55
+ out.append(a[start:start + size, :] @ rhs)
56
+ start += size
57
+ return torch.cat(out)
58
+
59
+
60
+ @parameterized.parameters(*_TEST_PROBLEMS)
61
+ class OpsTest(parameterized.TestCase):
62
+
63
+ def testGroupedGemm_FixedSizes(self, z, m, k, n, trans_b, batch_sizes_on_device):
64
+ torch.manual_seed(0)
65
+ a = randn(z, m, k).view(-1, k)
66
+ b = randn(z, n, k) if trans_b else randn(z, k, n)
67
+ batch_sizes = torch.tensor([m] * z)
68
+ if batch_sizes_on_device:
69
+ batch_sizes = batch_sizes.cuda()
70
+
71
+ a.requires_grad_(True)
72
+ b.requires_grad_(True)
73
+ a_ref = a.detach().clone().requires_grad_(True)
74
+ b_ref = b.detach().clone().requires_grad_(True)
75
+
76
+ # out = ops.gmm(a, b, batch_sizes, trans_b)
77
+ out = megablocks.gg_ops.gmm(a, b, batch_sizes, trans_b)
78
+ # print("out", out)
79
+ expected_out = gmm(a_ref, b_ref, batch_sizes, trans_b)
80
+ self.assertTrue(allclose(out, expected_out))
81
+
82
+ # Check gradients.
83
+ out.sum().backward()
84
+ expected_out.sum().backward()
85
+ self.assertTrue(allclose(a.grad, a_ref.grad))
86
+ self.assertTrue(allclose(b.grad, b_ref.grad))
87
+
88
+ def testGroupedGemm_VariableSizes(self, z, m, k, n, trans_b, batch_sizes_on_device):
89
+ torch.manual_seed(0)
90
+ a = randn(z, m, k).view(-1, k)
91
+ b = randn(z, n, k) if trans_b else randn(z, k, n)
92
+
93
+ dist = torch.rand(z, )
94
+ dist /= dist.sum()
95
+ batch_sizes = (dist * m).to(torch.long)
96
+ error = m * z - batch_sizes.sum()
97
+ batch_sizes[-1] += error
98
+ assert batch_sizes.sum() == (m * z)
99
+ if batch_sizes_on_device:
100
+ batch_sizes = batch_sizes.cuda()
101
+
102
+ a.requires_grad_(True)
103
+ b.requires_grad_(True)
104
+ a_ref = a.detach().clone().requires_grad_(True)
105
+ b_ref = b.detach().clone().requires_grad_(True)
106
+
107
+ out = megablocks.gg_ops.gmm(a, b, batch_sizes, trans_b)
108
+ expected_out = gmm(a_ref, b_ref, batch_sizes, trans_b)
109
+ self.assertTrue(allclose(out, expected_out))
110
+
111
+ # Check gradients.
112
+ out.sum().backward()
113
+ expected_out.sum().backward()
114
+ self.assertTrue(allclose(a.grad, a_ref.grad))
115
+
116
+ # TODO: Review to ensure that the gradients are correct.
117
+ # self.assertTrue(allclose(b.grad, b_ref.grad))
118
+
119
+
120
+ # @parameterized.parameters(False, True)
121
+ @parameterized.parameters(False, False)
122
+ class EdgeCasesTest(unittest.TestCase):
123
+
124
+ def testGroupedGemm_ZeroSize(self, batch_sizes_on_device):
125
+ torch.manual_seed(0)
126
+ m = 16384
127
+ k = 4096
128
+ n = 14336
129
+ num_experts = 8
130
+
131
+ a = randn(num_experts, m // num_experts, k).view(-1, k)
132
+ b = randn(num_experts, k, n)
133
+ batch_sizes = torch.tensor([219, 2246, 5, 8103, 1, 1117, 4693, 0]).to(torch.long)
134
+ if batch_sizes_on_device:
135
+ batch_sizes = batch_sizes.cuda()
136
+
137
+ a.requires_grad_(True)
138
+ b.requires_grad_(True)
139
+ a_ref = a.detach().clone().requires_grad_(True)
140
+ b_ref = b.detach().clone().requires_grad_(True)
141
+
142
+ out = megablocks.gg_ops.gmm(a, b, batch_sizes)
143
+ expected_out = gmm(a_ref, b_ref, batch_sizes)
144
+ self.assertTrue(allclose(out, expected_out))
145
+
146
+ # Check gradients.
147
+ out.sum().backward()
148
+ expected_out.sum().backward()
149
+ self.assertTrue(allclose(a.grad, a_ref.grad))
150
+ self.assertTrue(allclose(b.grad, b_ref.grad))
151
+
152
+ def testGroupedGemm_ZeroK(self, batch_sizes_on_device):
153
+ sz = 128
154
+ total_tokens = 192
155
+
156
+ a = torch.ones(total_tokens, sz).cuda().to(torch.bfloat16)
157
+ b = torch.ones(total_tokens, sz).cuda().to(torch.bfloat16)
158
+ c = torch.ones(4, sz, sz).cuda().to(torch.bfloat16)
159
+ batch_sizes = torch.tensor([0, 128, 0, 64]).to(torch.long)
160
+ if batch_sizes_on_device:
161
+ batch_sizes = batch_sizes.cuda()
162
+
163
+ megablocks.gg_backend.gmm(a, b, batch_sizes, trans_a=True, c=c)
164
+ self.assertTrue((c[0] == 0).all())
165
+ self.assertTrue((c[1] == 128).all())
166
+ self.assertTrue((c[2] == 0).all())
167
+ self.assertTrue((c[3] == 64).all())
168
+
169
+
170
+ if __name__ == '__main__':
171
+ unittest.main()
tests/parallel_layer_test.py ADDED
@@ -0,0 +1,94 @@
1
+ import torch
2
+ import torch.distributed as dist
3
+ import torch.multiprocessing as mp
4
+ import os
5
+
6
+
7
+ def test_megablocks_moe_mlp_import():
8
+ from megablocks.layers import MegaBlocksMoeMLP
9
+
10
+ assert MegaBlocksMoeMLP is not None, "MegaBlocksMoeMLP import failed."
11
+
12
+
13
+ def run_distributed_test(rank, world_size):
14
+ from megablocks.layers import MegaBlocksMoeMLP
15
+
16
+ os.environ["MASTER_ADDR"] = "localhost"
17
+ os.environ["MASTER_PORT"] = "12355"
18
+ os.environ["RANK"] = str(rank)
19
+ os.environ["WORLD_SIZE"] = str(world_size)
20
+
21
+ dist.init_process_group(
22
+ backend="gloo",
23
+ rank=rank,
24
+ world_size=world_size,
25
+ )
26
+
27
+ expert_parallel_group = torch.distributed.new_group(
28
+ range(torch.distributed.get_world_size())
29
+ )
30
+
31
+ model = MegaBlocksMoeMLP()
32
+ model.expert_parallel_group = expert_parallel_group
33
+
34
+ class Experts:
35
+ def __init__(self):
36
+ self.gate_up_proj = None
37
+ self.gate_up_proj_bias = None
38
+ self.down_proj = None
39
+ self.down_proj_bias = None
40
+ self.hidden_size = None
41
+
42
+ model.experts = Experts()
43
+
44
+ num_experts = 128
45
+ hidden_size = 1152
46
+ intermediate_size = 3072
47
+
48
+ ne, hs, isz = num_experts, hidden_size, intermediate_size
49
+
50
+ experts_per_rank = ne // world_size
51
+
52
+ device = "cuda" if torch.cuda.is_available() else "cpu"
53
+
54
+ model.router = torch.nn.Linear(hs, ne).to(device)
55
+ model.router.weight.data.fill_(1)
56
+
57
+ e = model.experts
58
+ e.gate_up_proj = torch.nn.Parameter(
59
+ torch.ones(experts_per_rank, hs, isz, device=device)
60
+ )
61
+ e.gate_up_proj_bias = torch.nn.Parameter(
62
+ torch.zeros(experts_per_rank, isz, device=device)
63
+ )
64
+ e.down_proj = torch.nn.Parameter(
65
+ torch.ones(experts_per_rank, 1536, hs, device=device)
66
+ )
67
+ e.down_proj_bias = torch.nn.Parameter(
68
+ torch.zeros(experts_per_rank, hs, device=device)
69
+ )
70
+ e.hidden_size = hs
71
+
72
+ x = torch.randn(1, 1, 1152).to(device)
73
+ output, expert_weights_out = model(x)
74
+
75
+ assert output.shape == (1, 1, 1152), f"Output shape mismatch on rank {rank}."
76
+
77
+ print(f"Rank {rank}: Test passed! Output shape: {output.shape}")
78
+
79
+ dist.destroy_process_group()
80
+
81
+
82
+ def test_megablocks_moe_mlp_functionality():
83
+ world_size = 2
84
+
85
+ mp.spawn(run_distributed_test, args=(world_size,), nprocs=world_size, join=True)
86
+
87
+ print("Multi-process test completed successfully!")
88
+
89
+
90
+ if __name__ == "__main__":
91
+ test_megablocks_moe_mlp_import()
92
+ print("Import test passed!")
93
+
94
+ test_megablocks_moe_mlp_functionality()
tests/test_gg.py ADDED
@@ -0,0 +1,57 @@
1
+ import torch
2
+ import megablocks
3
+
4
+
5
+ def randn(bs, x, y):
6
+ out = (torch.rand(bs, x, y) - 0.5 * 2) / (y * x)
7
+ return out.cuda().to(torch.bfloat16)
8
+
9
+
10
+ def gmm(a, b, batch_sizes, trans_b=False):
11
+ batch_sizes = batch_sizes.cpu().numpy()
12
+
13
+ out = []
14
+ start = 0
15
+ for i, size in enumerate(batch_sizes):
16
+ rhs = b[i, :, :].t() if trans_b else b[i, :, :]
17
+ out.append(a[start : start + size, :] @ rhs)
18
+ start += size
19
+ return torch.cat(out)
20
+
21
+
22
+ def test_gmm():
23
+ z = 1
24
+ m = 128
25
+ n = 128
26
+ k = 128
27
+ trans_b = False
28
+ batch_sizes_on_device = False
29
+ # TODO: fix to enable batch_sizes_on_device
30
+ # batch_sizes_on_device = True
31
+
32
+ torch.manual_seed(0)
33
+ a = randn(z, m, k).view(-1, k)
34
+ b = randn(z, n, k) if trans_b else randn(z, k, n)
35
+ batch_sizes = torch.tensor([m] * z)
36
+ if batch_sizes_on_device:
37
+ batch_sizes = batch_sizes.cuda()
38
+
39
+ a.requires_grad_(True)
40
+ b.requires_grad_(True)
41
+ a_ref = a.detach().clone().requires_grad_(True)
42
+ b_ref = b.detach().clone().requires_grad_(True)
43
+
44
+ # out = ops.gmm(a, b, batch_sizes, trans_b)
45
+ out = megablocks.gg_ops.gmm(a, b, batch_sizes, trans_b)
46
+ print("out", out)
47
+
48
+ expected_out = gmm(a_ref, b_ref, batch_sizes, trans_b)
49
+
50
+ assert torch.allclose(out, expected_out, atol=1e-3), f"Expected {expected_out}, got {out}"
51
+
52
+ out.sum().backward()
53
+
54
+ expected_out.sum().backward()
55
+ assert torch.allclose(a.grad, a_ref.grad, atol=1e-3), f"Expected {a_ref.grad}, got {a.grad}"
56
+ assert torch.allclose(b.grad, b_ref.grad, atol=1e-3), f"Expected {b_ref.grad}, got {b.grad}"
57
+ print("Test passed successfully!")
tests/test_mb_moe.py ADDED
@@ -0,0 +1,48 @@
1
+ import torch
2
+ import megablocks
3
+
4
+ def test_import():
5
+ """Simple test to check if the module can be imported."""
6
+ print("megablocks_moe module imported successfully.")
7
+ print("Available functions:", dir(megablocks))
8
+
9
+ expected_functions = [
10
+ "Arguments", "MLP", "MoE", "ParallelDroplessMLP", "ParallelMLP",
11
+ "SparseGLU", "SparseMLP", "argsort",
12
+ "backend", "cumsum", "dMoE", "exclusive_cumsum",
13
+ "get_load_balancing_loss", "grouped_gemm_util", "histogram",
14
+ "inclusive_cumsum", "indices", "layers", "ops", "replicate_backward",
15
+ "replicate_forward", "sort", "torch"
16
+ ]
17
+
18
+ # Check if all expected functions are available
19
+ for func in expected_functions:
20
+ assert func in dir(megablocks), f"Missing function: {func}"
21
+
22
+ # exclusive_cumsum
23
+ def test_exclusive_cumsum():
24
+ """Test exclusive cumulative sum."""
25
+ x = torch.tensor([1, 2, 3, 4], dtype=torch.int16).cuda()
26
+ out = torch.empty_like(x)
27
+ megablocks.exclusive_cumsum(x, 0, out)
28
+ expected = torch.tensor([0, 1, 3, 6], dtype=torch.int16).cuda()
29
+ assert torch.equal(out, expected), f"Expected {expected}, got {out}"
30
+ print("cumsum output:", out)
31
+
32
+ # inclusive_cumsum
33
+ def test_inclusive_cumsum():
34
+ """Test inclusive cumulative sum."""
35
+ x = torch.tensor([1, 2, 3, 4], dtype=torch.int16).cuda()
36
+ out = torch.empty_like(x)
37
+ megablocks.inclusive_cumsum(x, dim=0, out=out)
38
+ expected = torch.tensor([1, 3, 6, 10], dtype=torch.int16).cuda()
39
+ assert torch.equal(out, expected), f"Expected {expected}, got {out}"
40
+
41
+ # histogram
42
+ def test_histogram():
43
+ """Test histogram operation."""
44
+ x = torch.tensor([0, 1, 1, 2, 2, 2], dtype=torch.int16).cuda()
45
+ num_bins = 3
46
+ hist = megablocks.histogram(x, num_bins)
47
+ expected_hist = torch.tensor([1, 2, 3], dtype=torch.int32).cuda()
48
+ assert torch.equal(hist, expected_hist), f"Expected {expected_hist}, got {hist}"
tests/test_mb_moe_shared_expert.py ADDED
@@ -0,0 +1,139 @@
1
+ import torch
2
+ import megablocks
3
+ from megablocks.layers import MegaBlocksMoeMLPWithSharedExpert, create_shared_expert_weights
4
+
5
+
6
+ def test_megablocks_moe_mlp_with_shared_expert_import():
7
+ mlp = MegaBlocksMoeMLPWithSharedExpert()
8
+ assert hasattr(mlp, 'shared_up_proj_weight')
9
+ assert hasattr(mlp, 'shared_down_proj_weight')
10
+ assert hasattr(mlp, 'set_shared_expert_weights')
11
+
12
+
13
+ def test_set_shared_expert_weights():
14
+ mlp = MegaBlocksMoeMLPWithSharedExpert()
15
+
16
+ hidden_size = 128
17
+ shared_expert_hidden_size = 256
18
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
19
+ dtype = torch.float32
20
+
21
+ up_proj_weight = torch.randn(shared_expert_hidden_size, hidden_size, device=device, dtype=dtype)
22
+ down_proj_weight = torch.randn(hidden_size, shared_expert_hidden_size, device=device, dtype=dtype)
23
+ up_proj_bias = torch.randn(shared_expert_hidden_size, device=device, dtype=dtype)
24
+ down_proj_bias = torch.randn(hidden_size, device=device, dtype=dtype)
25
+
26
+ mlp.set_shared_expert_weights(
27
+ up_proj_weight=up_proj_weight,
28
+ down_proj_weight=down_proj_weight,
29
+ up_proj_bias=up_proj_bias,
30
+ down_proj_bias=down_proj_bias,
31
+ weighted_sum=True,
32
+ activation_fn=torch.nn.functional.gelu
33
+ )
34
+
35
+ assert torch.equal(mlp.shared_up_proj_weight, up_proj_weight)
36
+ assert torch.equal(mlp.shared_down_proj_weight, down_proj_weight)
37
+ assert torch.equal(mlp.shared_up_proj_bias, up_proj_bias)
38
+ assert torch.equal(mlp.shared_down_proj_bias, down_proj_bias)
39
+ assert mlp.shared_expert_weighted_sum == True
40
+ assert mlp.shared_activation_fn == torch.nn.functional.gelu
41
+
42
+
43
+ def test_create_shared_expert_weights():
44
+ hidden_size = 128
45
+ shared_expert_hidden_size = 256
46
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
47
+ dtype = torch.float32
48
+
49
+ def init_method(tensor):
50
+ torch.nn.init.xavier_uniform_(tensor)
51
+
52
+ up_proj_weight, down_proj_weight, up_proj_bias, down_proj_bias = create_shared_expert_weights(
53
+ hidden_size=hidden_size,
54
+ shared_expert_hidden_size=shared_expert_hidden_size,
55
+ device=device,
56
+ dtype=dtype,
57
+ init_method=init_method
58
+ )
59
+
60
+ assert up_proj_weight.shape == (shared_expert_hidden_size, hidden_size)
61
+ assert down_proj_weight.shape == (hidden_size, shared_expert_hidden_size)
62
+ assert up_proj_weight.device.type == device.type
63
+ assert down_proj_weight.device.type == device.type
64
+ assert up_proj_weight.dtype == dtype
65
+ assert down_proj_weight.dtype == dtype
66
+ assert up_proj_bias is None
67
+ assert down_proj_bias is None
68
+
69
+
70
+ def test_shared_expert_weights_none_by_default():
71
+ mlp = MegaBlocksMoeMLPWithSharedExpert()
72
+
73
+ assert mlp.shared_up_proj_weight is None
74
+ assert mlp.shared_down_proj_weight is None
75
+ assert mlp.shared_up_proj_bias is None
76
+ assert mlp.shared_down_proj_bias is None
77
+ assert mlp.shared_expert_weighted_sum == False
78
+ assert mlp.shared_activation_fn is None
79
+
80
+
81
+ def test_inheritance_from_megablocks_moe_mlp():
82
+ mlp = MegaBlocksMoeMLPWithSharedExpert()
83
+
84
+ from megablocks.layers import MegaBlocksMoeMLP
85
+ assert isinstance(mlp, MegaBlocksMoeMLP)
86
+ assert hasattr(mlp, 'forward')
87
+
88
+
89
+ def test_shared_expert_weights_custom_init():
90
+ hidden_size = 64
91
+ shared_expert_hidden_size = 128
92
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
93
+ dtype = torch.float16
94
+
95
+ def custom_init(tensor):
96
+ torch.nn.init.constant_(tensor, 0.5)
97
+
98
+ def custom_output_init(tensor):
99
+ torch.nn.init.constant_(tensor, 0.1)
100
+
101
+ up_proj_weight, down_proj_weight, up_proj_bias, down_proj_bias = create_shared_expert_weights(
102
+ hidden_size=hidden_size,
103
+ shared_expert_hidden_size=shared_expert_hidden_size,
104
+ device=device,
105
+ dtype=dtype,
106
+ init_method=custom_init,
107
+ output_layer_init_method=custom_output_init
108
+ )
109
+
110
+ assert torch.all(up_proj_weight == 0.5)
111
+ assert torch.all(down_proj_weight == 0.1)
112
+ assert up_proj_weight.dtype == dtype
113
+ assert down_proj_weight.dtype == dtype
114
+
115
+
116
+ def test_shared_expert_weights_dimensions():
117
+ mlp = MegaBlocksMoeMLPWithSharedExpert()
118
+
119
+ batch_size = 4
120
+ seq_len = 16
121
+ hidden_size = 128
122
+ shared_expert_hidden_size = 256
123
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
124
+
125
+ up_proj_weight = torch.randn(shared_expert_hidden_size, hidden_size, device=device)
126
+ down_proj_weight = torch.randn(hidden_size, shared_expert_hidden_size, device=device)
127
+
128
+ mlp.set_shared_expert_weights(
129
+ up_proj_weight=up_proj_weight,
130
+ down_proj_weight=down_proj_weight
131
+ )
132
+
133
+ x = torch.randn(seq_len, batch_size, hidden_size, device=device)
134
+
135
+ expected_up_output_shape = (seq_len, batch_size, shared_expert_hidden_size)
136
+ expected_down_output_shape = (seq_len, batch_size, hidden_size)
137
+
138
+ assert up_proj_weight.shape[1] == x.shape[-1]
139
+ assert down_proj_weight.shape[0] == x.shape[-1]
tests/test_mb_moe_shared_expert_multi.py ADDED
@@ -0,0 +1,200 @@
1
+ import torch
2
+ import torch.distributed as dist
3
+ import torch.multiprocessing as mp
4
+ import os
5
+ import pytest
6
+ from megablocks.layers import MegaBlocksMoeMLPWithSharedExpert, create_shared_expert_weights
7
+
8
+
9
+ def run_distributed_shared_expert_test(rank, world_size):
10
+ os.environ["MASTER_ADDR"] = "localhost"
11
+ os.environ["MASTER_PORT"] = "12356"
12
+ os.environ["RANK"] = str(rank)
13
+ os.environ["WORLD_SIZE"] = str(world_size)
14
+
15
+ dist.init_process_group(
16
+ backend="gloo",
17
+ rank=rank,
18
+ world_size=world_size,
19
+ )
20
+
21
+ model = MegaBlocksMoeMLPWithSharedExpert()
22
+
23
+ hidden_size = 128
24
+ shared_expert_hidden_size = 192
25
+ device = "cuda" if torch.cuda.is_available() else "cpu"
26
+
27
+ def simple_init(tensor):
28
+ torch.nn.init.xavier_uniform_(tensor)
29
+
30
+ shared_up_proj_weight, shared_down_proj_weight, shared_up_proj_bias, shared_down_proj_bias = create_shared_expert_weights(
31
+ hidden_size=hidden_size,
32
+ shared_expert_hidden_size=shared_expert_hidden_size,
33
+ device=torch.device(device),
34
+ dtype=torch.float32,
35
+ init_method=simple_init
36
+ )
37
+
38
+ model.set_shared_expert_weights(
39
+ up_proj_weight=shared_up_proj_weight,
40
+ down_proj_weight=shared_down_proj_weight,
41
+ up_proj_bias=shared_up_proj_bias,
42
+ down_proj_bias=shared_down_proj_bias,
43
+ weighted_sum=True,
44
+ activation_fn=torch.nn.functional.gelu
45
+ )
46
+
47
+ assert model.shared_up_proj_weight is not None, f"Shared up proj weight not set on rank {rank}"
48
+ assert model.shared_down_proj_weight is not None, f"Shared down proj weight not set on rank {rank}"
49
+ assert model.shared_expert_weighted_sum == True, f"Weighted sum not set correctly on rank {rank}"
50
+
51
+ print(f"Rank {rank}: Shared expert setup test passed!")
52
+
53
+ dist.destroy_process_group()
54
+
55
+
56
+ def run_distributed_shared_expert_weighted_sum_test(rank, world_size):
57
+ os.environ["MASTER_ADDR"] = "localhost"
58
+ os.environ["MASTER_PORT"] = "12357"
59
+ os.environ["RANK"] = str(rank)
60
+ os.environ["WORLD_SIZE"] = str(world_size)
61
+
62
+ dist.init_process_group(
63
+ backend="gloo",
64
+ rank=rank,
65
+ world_size=world_size,
66
+ )
67
+
68
+ model = MegaBlocksMoeMLPWithSharedExpert()
69
+
70
+ hidden_size = 64
71
+ device = "cuda" if torch.cuda.is_available() else "cpu"
72
+
73
+ def simple_init(tensor):
74
+ torch.nn.init.xavier_uniform_(tensor)
75
+
76
+ shared_up_proj_weight, shared_down_proj_weight, _, _ = create_shared_expert_weights(
77
+ hidden_size=hidden_size,
78
+ shared_expert_hidden_size=96,
79
+ device=torch.device(device),
80
+ dtype=torch.float32,
81
+ init_method=simple_init
82
+ )
83
+
84
+ model.set_shared_expert_weights(
85
+ up_proj_weight=shared_up_proj_weight,
86
+ down_proj_weight=shared_down_proj_weight,
87
+ weighted_sum=False,
88
+ activation_fn=torch.nn.functional.relu
89
+ )
90
+
91
+ assert model.shared_up_proj_weight is not None, f"Shared up proj weight not set on rank {rank}"
92
+ assert model.shared_down_proj_weight is not None, f"Shared down proj weight not set on rank {rank}"
93
+ assert model.shared_expert_weighted_sum == False, f"Weighted sum not set correctly on rank {rank}"
94
+ assert model.shared_activation_fn == torch.nn.functional.relu, f"Activation function not set correctly on rank {rank}"
95
+
96
+ print(f"Rank {rank}: Weighted sum setup test passed!")
97
+
98
+ dist.destroy_process_group()
99
+
100
+
101
+ @pytest.mark.parametrize("world_size", [1, 2, 4, 8])
102
+ def test_shared_expert_distributed_functionality(world_size):
103
+ if world_size == 1:
104
+ # Single process test
105
+ model = MegaBlocksMoeMLPWithSharedExpert()
106
+
107
+ hidden_size = 128
108
+ shared_expert_hidden_size = 192
109
+ device = "cuda" if torch.cuda.is_available() else "cpu"
110
+
111
+ def simple_init(tensor):
112
+ torch.nn.init.xavier_uniform_(tensor)
113
+
114
+ shared_up_proj_weight, shared_down_proj_weight, shared_up_proj_bias, shared_down_proj_bias = create_shared_expert_weights(
115
+ hidden_size=hidden_size,
116
+ shared_expert_hidden_size=shared_expert_hidden_size,
117
+ device=torch.device(device),
118
+ dtype=torch.float32,
119
+ init_method=simple_init
120
+ )
121
+
122
+ model.set_shared_expert_weights(
123
+ up_proj_weight=shared_up_proj_weight,
124
+ down_proj_weight=shared_down_proj_weight,
125
+ up_proj_bias=shared_up_proj_bias,
126
+ down_proj_bias=shared_down_proj_bias,
127
+ weighted_sum=True,
128
+ activation_fn=torch.nn.functional.gelu
129
+ )
130
+
131
+ assert model.shared_up_proj_weight is not None, "Shared up proj weight not set"
132
+ assert model.shared_down_proj_weight is not None, "Shared down proj weight not set"
133
+ assert model.shared_expert_weighted_sum == True, "Weighted sum not set correctly"
134
+
135
+ print("Single process shared expert setup test passed!")
136
+ else:
137
+ # Multi-process test
138
+ mp.spawn(run_distributed_shared_expert_test, args=(world_size,), nprocs=world_size, join=True)
139
+ print("Multi-process shared expert test completed successfully!")
140
+
141
+
142
+ @pytest.mark.parametrize("world_size", [1, 2, 4, 8])
143
+ def test_shared_expert_distributed_weighted_sum(world_size):
144
+ if world_size == 1:
145
+ # Single process test
146
+ model = MegaBlocksMoeMLPWithSharedExpert()
147
+
148
+ hidden_size = 64
149
+ device = "cuda" if torch.cuda.is_available() else "cpu"
150
+
151
+ def simple_init(tensor):
152
+ torch.nn.init.xavier_uniform_(tensor)
153
+
154
+ shared_up_proj_weight, shared_down_proj_weight, _, _ = create_shared_expert_weights(
155
+ hidden_size=hidden_size,
156
+ shared_expert_hidden_size=96,
157
+ device=torch.device(device),
158
+ dtype=torch.float32,
159
+ init_method=simple_init
160
+ )
161
+
162
+ model.set_shared_expert_weights(
163
+ up_proj_weight=shared_up_proj_weight,
164
+ down_proj_weight=shared_down_proj_weight,
165
+ weighted_sum=False,
166
+ activation_fn=torch.nn.functional.relu
167
+ )
168
+
169
+ assert model.shared_up_proj_weight is not None, "Shared up proj weight not set"
170
+ assert model.shared_down_proj_weight is not None, "Shared down proj weight not set"
171
+ assert model.shared_expert_weighted_sum == False, "Weighted sum not set correctly"
172
+ assert model.shared_activation_fn == torch.nn.functional.relu, "Activation function not set correctly"
173
+
174
+ print("Single process weighted sum setup test passed!")
175
+ else:
176
+ # Multi-process test
177
+ mp.spawn(run_distributed_shared_expert_weighted_sum_test, args=(world_size,), nprocs=world_size, join=True)
178
+ print("Multi-process shared expert weighted sum test completed successfully!")
179
+
180
+
181
+ def test_shared_expert_single_process():
182
+ model = MegaBlocksMoeMLPWithSharedExpert()
183
+
184
+ assert model.shared_up_proj_weight is None
185
+ assert model.shared_down_proj_weight is None
186
+ assert hasattr(model, 'set_shared_expert_weights')
187
+
188
+ print("Single process shared expert basic test passed!")
189
+
190
+
191
+ if __name__ == "__main__":
192
+ test_shared_expert_single_process()
193
+ print("Single process test passed!")
194
+
195
+ os.environ['WORLD_SIZE'] = '2'
196
+ test_shared_expert_distributed_functionality(world_size=2)
197
+ print("Distributed functionality test passed!")
198
+
199
+ test_shared_expert_distributed_weighted_sum(world_size=2)
200
+ print("Distributed weighted sum test passed!")
torch-ext/megablocks/__init__.py ADDED
@@ -0,0 +1,202 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+
6
+ from ._ops import ops
7
+
8
+ #from .grouped_gemm import backend as gg_backend
9
+ #from .grouped_gemm import ops as gg_ops
10
+
11
+
12
+ from ._layers.arguments import Arguments
13
+ from ._layers.dmoe import ParallelDroplessMLP, dMoE
14
+ from ._layers.glu import SparseGLU
15
+ from ._layers.mlp import MLP, SparseMLP
16
+ from ._layers.moe import MoE, ParallelMLP, get_load_balancing_loss
17
+
18
+ from . import layers
19
+
20
+ # This section contains the direct kernel exports (not included in the original code)
21
+ def exclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
22
+ """
23
+ Compute exclusive cumulative sum along the specified dimension.
24
+
25
+ Args:
26
+ x: Input tensor
27
+ dim: Dimension along which to compute cumsum
28
+ out: Output tensor (modified in-place)
29
+
30
+ Returns:
31
+ The output tensor
32
+ """
33
+ result = ops.exclusive_cumsum(x, dim)
34
+ out.copy_(result)
35
+ return out
36
+
37
+
38
+ def inclusive_cumsum(x: torch.Tensor, dim: int, out: torch.Tensor) -> torch.Tensor:
39
+ """
40
+ Compute inclusive cumulative sum along the specified dimension.
41
+
42
+ Args:
43
+ x: Input tensor
44
+ dim: Dimension along which to compute cumsum
45
+ out: Output tensor (modified in-place)
46
+
47
+ Returns:
48
+ The output tensor
49
+ """
50
+ result = ops.inclusive_cumsum(x, dim)
51
+ out.copy_(result)
52
+ return out
53
+
54
+
55
+ def histogram(x: torch.Tensor, num_bins: int) -> torch.Tensor:
56
+ """
57
+ Compute histogram of input tensor values.
58
+
59
+ Args:
60
+ x: Input tensor
61
+ num_bins: Number of histogram bins
62
+
63
+ Returns:
64
+ Histogram tensor with counts for each bin
65
+ """
66
+ return ops.histogram(x, num_bins)
67
+
68
+
69
+ def indices(
70
+ padded_bins: torch.Tensor,
71
+ block_size: int,
72
+ output_block_rows: int,
73
+ output_block_columns: int,
74
+ ) -> torch.Tensor:
75
+ """
76
+ Construct indices from padded bins for sparse operations.
77
+
78
+ Args:
79
+ padded_bins: Tensor containing bin boundaries
80
+ block_size: Size of each block
81
+ output_block_rows: Number of rows in output blocks
82
+ output_block_columns: Number of columns in output blocks
83
+
84
+ Returns:
85
+ Tensor containing constructed indices
86
+ """
87
+ return ops.indices(padded_bins, block_size, output_block_rows, output_block_columns)
88
+
89
+
90
+ def replicate_forward(
91
+ x: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
92
+ ) -> torch.Tensor:
93
+ """
94
+ Forward pass of replicate operation - replicate values according to bin sizes.
95
+
96
+ Args:
97
+ x: Input tensor with values to replicate
98
+ bins: Tensor containing bin sizes
99
+ out: Output tensor (modified in-place)
100
+
101
+ Returns:
102
+ The output tensor
103
+ """
104
+ return ops.replicate_forward(x, bins, out)
105
+
106
+
107
+ def replicate_backward(
108
+ grad: torch.Tensor, bins: torch.Tensor, out: torch.Tensor
109
+ ) -> torch.Tensor:
110
+ """
111
+ Backward pass of replicate operation - reduce gradients back to bins.
112
+
113
+ Args:
114
+ grad: Gradient tensor to reduce
115
+ bins: Tensor containing bin sizes
116
+ out: Output tensor (modified in-place)
117
+
118
+ Returns:
119
+ The output tensor
120
+ """
121
+ return ops.replicate_backward(grad, bins, out)
122
+
123
+
124
+ def sort(
125
+ x: torch.Tensor, end_bit: int, x_out: torch.Tensor, iota_out: torch.Tensor
126
+ ) -> torch.Tensor:
127
+ """
128
+ Radix sort with index tracking.
129
+
130
+ Args:
131
+ x: Input tensor to sort
132
+ end_bit: Number of bits to consider in sorting
133
+ x_out: Output tensor for sorted values
134
+ iota_out: Output tensor for sorted indices
135
+
136
+ Returns:
137
+ The sorted values tensor
138
+ """
139
+ return ops.sort(x, end_bit, x_out, iota_out)
140
+
141
+
142
+ # Convenience functions for common use cases
143
+ def cumsum(x: torch.Tensor, dim: int = -1, exclusive: bool = False) -> torch.Tensor:
144
+ """
145
+ Compute cumulative sum with automatic output allocation.
146
+
147
+ Args:
148
+ x: Input tensor
149
+ dim: Dimension along which to compute cumsum (default: last dimension)
150
+ exclusive: Whether to compute exclusive (True) or inclusive (False) cumsum
151
+
152
+ Returns:
153
+ New tensor containing the cumulative sum
154
+ """
155
+ out = torch.empty_like(x)
156
+ if exclusive:
157
+ return exclusive_cumsum(x, dim, out)
158
+ else:
159
+ return inclusive_cumsum(x, dim, out)
160
+
161
+
162
+ def argsort(x: torch.Tensor, end_bit: int = 32) -> tuple[torch.Tensor, torch.Tensor]:
163
+ """
164
+ Sort tensor and return both sorted values and indices.
165
+
166
+ Args:
167
+ x: Input tensor to sort
168
+ end_bit: Number of bits to consider in sorting
169
+
170
+ Returns:
171
+ Tuple of (sorted_values, sorted_indices)
172
+ """
173
+ x_out = torch.empty_like(x)
174
+ iota_out = torch.empty_like(x)
175
+ sort(x, end_bit, x_out, iota_out)
176
+ return x_out, iota_out
177
+
178
+
179
+ # Export public API
180
+ __all__ = [
181
+ "MyReplacementLayer",
182
+ # Direct kernel exports
183
+ "exclusive_cumsum",
184
+ "inclusive_cumsum",
185
+ "histogram",
186
+ "indices",
187
+ "replicate_forward",
188
+ "replicate_backward",
189
+ "sort",
190
+ "cumsum",
191
+ "argsort",
192
+ # Original exports
193
+ "Arguments",
194
+ "ParallelDroplessMLP",
195
+ "dMoE",
196
+ "SparseGLU",
197
+ "MLP",
198
+ "SparseMLP",
199
+ "MoE",
200
+ "ParallelMLP",
201
+ "get_load_balancing_loss",
202
+ ]
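
Note: a short usage sketch of the convenience wrappers exported above (assumes a CUDA device and the compiled _ops extension; the expected values follow the cumsum and histogram semantics exercised in the tests):

    import torch
    import megablocks

    x = torch.tensor([1, 2, 3, 4], dtype=torch.int16).cuda()
    print(megablocks.cumsum(x, dim=0, exclusive=True))    # 0, 1, 3, 6
    print(megablocks.cumsum(x, dim=0, exclusive=False))   # 1, 3, 6, 10

    assignments = torch.tensor([0, 1, 1, 2, 2, 2], dtype=torch.int16).cuda()
    print(megablocks.histogram(assignments, 3))           # per-bin counts: 1, 2, 3

    sorted_vals, sorted_idx = megablocks.argsort(assignments, end_bit=2)
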
torch-ext/megablocks/_layers/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ # from megablocks.layers.dmoe import dMoE
5
+ from .moe import MoE
6
+
7
+ __all__ = [
8
+ 'MoE',
9
+ # 'dMoE',
10
+ ]
torch-ext/megablocks/_layers/activation_fn.py ADDED
@@ -0,0 +1,33 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from typing import Any, Callable, Union
5
+
6
+ import torch
7
+ from ..stk import Matrix
8
+
9
+
10
+ def act_fn(
11
+ x: Matrix,
12
+ function: Callable,
13
+ return_grad_fn: bool = False,
14
+ **kwargs,
15
+ ) -> Union[tuple[Matrix, Any] | Matrix]:
16
+ assert isinstance(x, Matrix)
17
+ with torch.set_grad_enabled(torch.is_grad_enabled() or return_grad_fn):
18
+ if return_grad_fn:
19
+ x.data.requires_grad = True
20
+ out = function(x.data, **kwargs)
21
+ y = Matrix(
22
+ x.size(),
23
+ out,
24
+ x.row_indices,
25
+ x.column_indices,
26
+ x.offsets,
27
+ x.column_indices_t,
28
+ x.offsets_t,
29
+ x.block_offsets_t,
30
+ )
31
+ if return_grad_fn:
32
+ return y, out.backward
33
+ return y
torch-ext/megablocks/_layers/all_to_all.py ADDED
@@ -0,0 +1,54 @@
1
+ # Copyright 2024 Databricks
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ import torch
5
+ import torch.distributed as dist
6
+
7
+
8
+ class AllToAllOp(torch.autograd.Function):
9
+
10
+ @staticmethod
11
+ def forward(ctx, x, output_split_sizes, input_split_sizes, group, async_op):
12
+ out = torch.empty((sum(output_split_sizes),) + x.shape[1:], device=x.device, dtype=x.dtype)
13
+
14
+ ctx.input_shape = x.shape
15
+ ctx.output_split_sizes = output_split_sizes
16
+ ctx.input_split_sizes = input_split_sizes
17
+ ctx.group = group
18
+ handle = dist.all_to_all_single(
19
+ out,
20
+ x,
21
+ output_split_sizes=output_split_sizes,
22
+ input_split_sizes=input_split_sizes,
23
+ group=group,
24
+ async_op=async_op,
25
+ )
26
+ return out, handle
27
+
28
+ @staticmethod
29
+ def backward(ctx, grad, _):
30
+ if ctx.needs_input_grad[0]:
31
+ out = torch.empty(
32
+ ctx.input_shape,
33
+ device=grad.device,
34
+ dtype=grad.dtype,
35
+ )
36
+ dist.all_to_all_single(
37
+ out,
38
+ grad,
39
+ output_split_sizes=ctx.input_split_sizes,
40
+ input_split_sizes=ctx.output_split_sizes,
41
+ group=ctx.group,
42
+ )
43
+ return out, None, None, None, None
44
+ return None, None, None, None, None
45
+
46
+
47
+ def all_to_all(x, output_split_sizes, input_split_sizes, group, async_op=False):
48
+ return AllToAllOp.apply(
49
+ x,
50
+ output_split_sizes,
51
+ input_split_sizes,
52
+ group,
53
+ async_op,
54
+ )
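
Note: a minimal usage sketch for the wrapper above, with a hypothetical equal split across ranks; it assumes torch.distributed has already been initialized (e.g. under torchrun with the nccl backend) and that the module is importable as megablocks._layers.all_to_all.

    import torch
    import torch.distributed as dist
    from megablocks._layers.all_to_all import all_to_all

    def exchange(x_local: torch.Tensor) -> torch.Tensor:
        # Send an equal slice of x_local to every rank and receive one slice from each.
        world_size = dist.get_world_size()
        split = x_local.shape[0] // world_size
        output_split_sizes = [split] * world_size
        input_split_sizes = [split] * world_size
        out, handle = all_to_all(
            x_local,
            output_split_sizes,
            input_split_sizes,
            group=dist.group.WORLD,
            async_op=False,   # handle is None for the synchronous call
        )
        return out
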