minimal fix to support Windows

Based on work by @Jamezo97 and @acpopescu, manually cherry-picked from PR bitsandbytes-foundation#788 and PR bitsandbytes-foundation#229, with cleanup by wkpark.

Signed-off-by: Won-Kyu Park <[email protected]>
green-s · Jan 18, 2024 · 36434ad · 36434ad
1 parent 8ac46e1
commit 36434ad
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 10 deletions.
diff --git a/csrc/common.h b/csrc/common.h
@@ -7,8 +7,16 @@ using namespace BinSearch;
 
 #define BLOCK_SIZE 16384
 
+#if defined(USE_AVX) || defined(USE_AVX2)
+#define INSTR_SET AVX
+#elif defined(USE_SSE41) || defined(USE_SSE42)
+#define INSTR_SET SSE
+#else
+#define INSTR_SET Scalar
+#endif
+
 struct quantize_block_args {
- BinAlgo<Scalar, float, Direct2> *bin_searcher;
+ BinAlgo<INSTR_SET, float, Direct2> *bin_searcher;
  float *code;
  float *A;
  float *absmax;

diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
@@ -1,5 +1,9 @@
 #include <BinSearch.h>
+#ifdef _WIN32
+#include <thread>
+#else
 #include <pthread.h>
+#endif
 #include <common.h>
 
 using namespace BinSearch;
@@ -23,15 +27,19 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
  num_blocks += n % blocksize == 0 ? 0 : 1;
 
  const uint32 elements_code = 256;
- BinAlgo<Scalar, float, Direct2> bin_searcher(code, elements_code);
+ BinAlgo<INSTR_SET, float, Direct2> bin_searcher(code, elements_code);
 
  int thread_wave_size = 256;
  // we chunk the thresds into waves of 256 since the max limit is
  // between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size)
  for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
  {
  long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
+#ifdef _WIN32
+ std::thread *threads = (std::thread *) malloc(sizeof(std::thread) * valid_chunks);
+#else
  pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks);
+#endif
 
  struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *));
 
@@ -55,14 +63,23 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
  arg->threadidx = block_idx / blocksize;
  arg->blocksize = blocksize;
 
+#ifdef _WIN32
+ new (&threads[chunks_processed]) std::thread(quantize_block, arg);
+#else
  pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg);
+#endif
  chunks_processed += 1;
  if(chunks_processed == valid_chunks){ break; }
  }
 
  for (int i = 0; i < valid_chunks; i++)
+ {
+#ifdef _WIN32
+ threads[i].join();
+#else
  int err = pthread_join(threads[i], NULL);
-
+#endif
+ }
  free(threads);
  for (int i = 0; i < valid_chunks; i++)
  free(args[i]);

diff --git a/csrc/kernels.cu b/csrc/kernels.cu
@@ -3821,12 +3821,12 @@ template __global__ void kgemm_4bit_inference_naive<float, 128, 32>(int M, int N
 template __global__ void kExtractOutliers<COL_TURING>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
 template __global__ void kExtractOutliers<COL_AMPERE>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
 
-template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
 
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);

diff --git a/csrc/ops.cuh b/csrc/ops.cuh
@@ -9,7 +9,6 @@
 
 #include <stdio.h>
 #include <iostream>
-#include <unistd.h>
 #include <assert.h>
 
 #include <cuda_runtime_api.h>

diff --git a/include/SIMD.h b/include/SIMD.h
@@ -64,6 +64,16 @@ template <> struct InstrFloatTraits<SSE, double>
  typedef __m128d vec_t;
 };
 
+template <> struct InstrFloatTraits<Scalar, float>
+{
+ typedef float vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+ typedef double vec_t;
+};
+
 template <InstrSet I, typename T>
 struct FTOITraits
 {