From bf3f000f8567ae0dd9a34e6f9f7d1a5c8d804849 Mon Sep 17 00:00:00 2001
From: Lunderberg
Date: Fri, 18 Jun 2021 00:16:45 -0700
Subject: [PATCH] [topi][CuDNN] Removed requirement for GPU from topi
 conv2d_cudnn.cuda and conv3d_cudnn.cuda (#8276)

Previously, `conv2d_cudnn.cuda` would use cudnn's benchmarking function
to select a forward convolution algorithm when `cfg.is_fallback`, and
`conv3d_cudnn.cuda` would use cudnn's benchmarking at all times. After
this commit, both expose the cudnn algorithm choice as a tunable
option. If `cfg.is_fallback`, the local device is benchmarked if one is
present; otherwise, a default cudnn implementation is selected. In the
future, to better support RPC use-cases, the fallback config should be
based on cudnn-specific parameters saved in the Target object.

Co-authored-by: Eric Lunderberg
---
 python/tvm/contrib/cudnn.py              | 18 ++++++++++++++
 python/tvm/topi/cuda/conv2d.py           | 12 +++++++---
 python/tvm/topi/cuda/conv3d.py           | 12 +++++++++-
 src/runtime/contrib/cudnn/cudnn_utils.cc | 32 ++++++++++++++++++++---
 src/runtime/contrib/cudnn/cudnn_utils.h  |  5 +++-
 5 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/python/tvm/contrib/cudnn.py b/python/tvm/contrib/cudnn.py
index 0e22e0c09274..ac3835eb2cd1 100644
--- a/python/tvm/contrib/cudnn.py
+++ b/python/tvm/contrib/cudnn.py
@@ -64,6 +64,24 @@
 _ALGO_TYPE = ["fwd", "bwd_filter", "bwd_data"]
 
 
+def exists():
+    """
+    Checks whether the local machine can use CuDNN.
+
+    Returns
+    -------
+    exists: bool
+
+        True if CuDNN support is enabled and a CuDNN-capable GPU
+        exists.  Otherwise, False.
+    """
+    func = tvm.get_global_func("tvm.contrib.cudnn.exists", allow_missing=True)
+    if func is None:
+        return False
+
+    return bool(func())
+
+
 def algo_to_index(algo_type, algo_name):
     """Return a index represents the algorithm, which can be used in
     calling CuDNN function
diff --git a/python/tvm/topi/cuda/conv2d.py b/python/tvm/topi/cuda/conv2d.py
index 63c7c9308284..a199534ccb51 100644
--- a/python/tvm/topi/cuda/conv2d.py
+++ b/python/tvm/topi/cuda/conv2d.py
@@ -117,9 +117,15 @@ def conv2d_cudnn(
     else:
         dtype = data.dtype
 
-    cfg.define_knob("algo", range(8))
-    if cfg.is_fallback:  # Let CUDNN choose the best algo
-        cfg["algo"] = OtherOptionEntity(-1)
+    cfg.define_knob("algo", range(cudnn.algo_to_index("fwd", "CUDNN_CONVOLUTION_FWD_ALGO_COUNT")))
+    if cfg.is_fallback:
+        if cudnn.exists():
+            # Let CUDNN choose the best algo, based on benchmarks run
+            # on the local machine.  In the future, this should be
+            # based on parameters stored in the Target.
+            cfg["algo"] = OtherOptionEntity(-1)
+        else:
+            cfg["algo"] = OtherOptionEntity(0)
 
     return cudnn.conv_forward(
         data,
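
Note: below is a minimal usage sketch, not part of the diff, of how the
new `cudnn.exists()` helper gates the fallback behavior. It assumes a
standard TVM build with this patch applied; the printed messages are
illustrative only.

    from tvm.contrib import cudnn

    # exists() is safe to call on any machine: it returns False both when
    # TVM was built without CuDNN support and when no CuDNN-capable GPU
    # is visible to the runtime.
    if cudnn.exists():
        # Fallback configs benchmark on the local GPU and let CuDNN pick
        # the fastest forward algorithm (algo index -1).
        print("CuDNN available: fallback benchmarks on the local GPU")
    else:
        # On a GPU-less host (e.g. compiling for a remote RPC device),
        # fallback configs now select the default algorithm index 0
        # instead of failing.
        print("CuDNN unavailable: fallback uses default algorithm 0")
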
+ cfg["algo"] = OtherOptionEntity(-1) + else: + cfg["algo"] = OtherOptionEntity(0) + return cudnn.conv_forward( data, kernel, @@ -229,7 +239,7 @@ def conv3d_cudnn( [dilation_d, dilation_h, dilation_w], conv_mode=1, tensor_format=tensor_format, - algo=-1, # let CUDNN choose the best algo + algo=cfg["algo"].val, conv_dtype=dtype, ) diff --git a/src/runtime/contrib/cudnn/cudnn_utils.cc b/src/runtime/contrib/cudnn/cudnn_utils.cc index da67c2e1a9a5..a320c9236ceb 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.cc +++ b/src/runtime/contrib/cudnn/cudnn_utils.cc @@ -99,16 +99,38 @@ CuDNNThreadEntry::CuDNNThreadEntry() { auto func = runtime::Registry::Get("device_api.cuda"); void* ret = (*func)(); cuda_api = static_cast(ret); - CUDNN_CALL(cudnnCreate(&handle)); + + // If no CuDNN-capable device is present, allow the CuDNNThreadEntry + // object to be created. This is needed for + // CuDNNThreadEntry::exists. + { + cudnnStatus_t create_res = cudnnCreate(&handle); + if (create_res == CUDNN_STATUS_NOT_INITIALIZED) { + return; + } + CUDNN_CALL(create_res); + } + CUDNN_CALL(cudnnSetStream(handle, stream)); conv_entry.cuda_api = cuda_api; } -CuDNNThreadEntry::~CuDNNThreadEntry() { CUDNN_CALL(cudnnDestroy(handle)); } +CuDNNThreadEntry::~CuDNNThreadEntry() { + if (handle) { + CUDNN_CALL(cudnnDestroy(handle)); + } +} typedef dmlc::ThreadLocalStore CuDNNThreadStore; -CuDNNThreadEntry* CuDNNThreadEntry::ThreadLocal() { return CuDNNThreadStore::Get(); } +CuDNNThreadEntry* CuDNNThreadEntry::ThreadLocal(bool check_exists) { + auto* res = CuDNNThreadStore::Get(); + if (check_exists) { + ICHECK(res->exists()) << "CUDNN_STATUS_NOT_INITIALIZED"; + } + + return res; +} // ConvEntry @@ -148,5 +170,9 @@ SoftmaxEntry::SoftmaxEntry() { CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_des SoftmaxEntry::~SoftmaxEntry() { CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc)); } +TVM_REGISTER_GLOBAL("tvm.contrib.cudnn.exists").set_body_typed([]() -> bool { + return CuDNNThreadEntry::ThreadLocal(false)->exists(); +}); + } // namespace contrib } // namespace tvm diff --git a/src/runtime/contrib/cudnn/cudnn_utils.h b/src/runtime/contrib/cudnn/cudnn_utils.h index 72380b64121a..01b92d61e66e 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.h +++ b/src/runtime/contrib/cudnn/cudnn_utils.h @@ -93,11 +93,14 @@ struct SoftmaxEntry { struct CuDNNThreadEntry { CuDNNThreadEntry(); ~CuDNNThreadEntry(); + + bool exists() const { return handle; } + cudnnHandle_t handle{nullptr}; ConvEntry conv_entry; SoftmaxEntry softmax_entry; runtime::DeviceAPI* cuda_api{nullptr}; - static CuDNNThreadEntry* ThreadLocal(); + static CuDNNThreadEntry* ThreadLocal(bool check_exists = true); }; // CuDNNThreadEntry } // namespace contrib