@@ -25,7 +25,9 @@
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include <stdint.h>
+
 #include <exception>
+
 #include "libtorch_utils.h"
 #include "triton/backend/backend_common.h"
 #include "triton/backend/backend_input_collector.h"
@@ -53,6 +55,9 @@
 #include <cuda_runtime_api.h>
 #endif  // TRITON_ENABLE_GPU

+// Default forward method to call on PyTorch modules
+const std::string DEFAULT_MODULE_METHOD_NAME = "forward";
+
 //
 // PyTorch C++ (LibTorch) Backend that implements the TRITONBACKEND API.
 //
@@ -103,6 +108,7 @@ class ModelState : public BackendModel {
 
   bool EnabledWeightSharing() { return enable_weight_sharing_; }
   const std::vector<std::string>& ModelOutputs() { return output_names_; }
+  const std::string& ModuleMethodName() { return module_method_name_; }
 
  private:
   ModelState(TRITONBACKEND_Model* triton_model);
@@ -145,6 +151,10 @@ class ModelState : public BackendModel {
   // List of all the outputs specified in the output section of model
   // configuration.
   std::vector<std::string> output_names_;
+
+  // Method to call on PyTorch Module.
+  // Defaults to DEFAULT_MODULE_METHOD_NAME.
+  std::string module_method_name_;
 };
 
 TRITONSERVER_Error*
@@ -180,7 +190,8 @@ ModelState::ModelState(TRITONBACKEND_Model* triton_model)
       enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}),
       enable_jit_profiling_pair_({false, true}),
       enable_jit_executor_pair_({false, true}),
-      enable_nvfuser_pair_({false, false})
+      enable_nvfuser_pair_({false, false}),
+      module_method_name_(DEFAULT_MODULE_METHOD_NAME)
 {
   output_names_.clear();
 
@@ -454,6 +465,30 @@ ModelState::ParseParameters()
            " for model instance '" + Name() + "'")
               .c_str());
     }
+
+    // If 'MODULE_METHOD_NAME' is not present in 'parameters' then
+    // 'module_method_name_' is set to 'DEFAULT_MODULE_METHOD_NAME' ('forward').
+    std::string module_method_name = DEFAULT_MODULE_METHOD_NAME;
+    err = GetParameterValue(params, "MODULE_METHOD_NAME", &module_method_name);
+    if (err != nullptr) {
+      if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
+        return err;
+      } else {
+        LOG_MESSAGE(
+            TRITONSERVER_LOG_INFO,
+            (std::string("module_method_name is not specified") +
+             " for model instance '" + Name() + "'")
+                .c_str());
+        TRITONSERVER_ErrorDelete(err);
+      }
+    } else {
+      module_method_name_ = module_method_name;
+      LOG_MESSAGE(
+          TRITONSERVER_LOG_INFO,
+          (std::string("module_method_name is ") + module_method_name_ +
+           " for model instance '" + Name() + "'")
+              .c_str());
+    }
   }
 
   return nullptr;
@@ -764,7 +799,8 @@ ModelInstanceState::ValidateInputs(const size_t expected_input_cnt)
   // configuration specifies only those.
   std::vector<std::string> allowed_inputs;
 
-  const torch::jit::Method& method = torch_model_->get_method("forward");
+  const torch::jit::Method& method =
+      torch_model_->get_method(model_state_->ModuleMethodName());
   const auto& schema = method.function().getSchema();
   const std::vector<c10::Argument>& arguments = schema.arguments();
@@ -1312,30 +1348,36 @@ ModelInstanceState::Execute(
       torch::jit::overrideCanFuseOnCPU(false);
       torch::jit::overrideCanFuseOnGPU(false);
       torch::jit::setTensorExprFuserEnabled(false);
-      torch::jit::fuser::cuda::setEnabled(true);
+      torch::jit::fuser::cuda::setEnabled(true);
     } else {
       torch::jit::overrideCanFuseOnCPU(true);
       torch::jit::overrideCanFuseOnGPU(true);
       torch::jit::setTensorExprFuserEnabled(true);
-      torch::jit::fuser::cuda::setEnabled(false);
+      torch::jit::fuser::cuda::setEnabled(false);
     }
   }
 
   torch::NoGradGuard no_grad;
 
   // If input is a dictionary, prepare dictionary from 'input_tensors'.
+  std::string module_method_name = model_state_->ModuleMethodName();
+  std::vector<c10::IValue> inputs;
   if (is_dict_input_) {
-    torch::Dict<std::string, torch::Tensor> input_dict;
+    c10::Dict<std::string, at::Tensor> dict;
     for (auto& input_index : input_index_map_) {
       torch::jit::IValue ival = (*input_tensors)[input_index.second];
-      input_dict.insert(input_index.first, ival.toTensor());
+      dict.insert(input_index.first, ival.toTensor());
     }
-    std::vector<torch::jit::IValue> input_dict_ivalue = {input_dict};
-    model_outputs_ = torch_model_->forward(input_dict_ivalue);
+    inputs.push_back(dict);
   } else {
-    model_outputs_ = torch_model_->forward(*input_tensors);
+    for (auto& input_tensor : *input_tensors) {
+      inputs.push_back(input_tensor.toTensor());
+    }
   }
 
+  // Actually run the method on the model.
+  model_outputs_ = torch_model_->get_method(module_method_name)(inputs);
+
   if (model_outputs_.isTuple()) {
     auto model_outputs_tuple = model_outputs_.toTuple();
     size_t op_index = 0;
@@ -1761,9 +1803,9 @@ ModelInstanceState::SetInputTensors(
 
         batchn_shape[0] += GetElementCount(input_shape, input_dims_count);
       }
-    }
-    else {
-      batchn_shape = std::vector<int64_t>(input_shape, input_shape + input_dims_count);
+    } else {
+      batchn_shape =
+          std::vector<int64_t>(input_shape, input_shape + input_dims_count);
       if (supports_batching_) {
         batchn_shape[0] = total_batch_size;
       }
@@ -1772,8 +1814,8 @@ ModelInstanceState::SetInputTensors(
     // The input must be in contiguous CPU/GPU memory.
     std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> alloc_perference;
     if (device_.is_cpu()) {
-      alloc_perference = {{TRITONSERVER_MEMORY_CPU_PINNED, 0},
-                          {TRITONSERVER_MEMORY_CPU, 0}};
+      alloc_perference = {
+          {TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};
     } else {
       alloc_perference = {{TRITONSERVER_MEMORY_GPU, device_.index()}};
     }
@@ -1887,9 +1929,11 @@ ModelInstanceState::ReadOutputTensors(
 
       // Output tensors may not reside on the same device as model
       torch::Device tensor_device = output_flat.device();
-      const auto memory_type = (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU
-                                                                     : TRITONSERVER_MEMORY_GPU;
-      const auto memory_id = (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index();
+      const auto memory_type = (tensor_device.type() == torch::kCPU)
+                                   ? TRITONSERVER_MEMORY_CPU
+                                   : TRITONSERVER_MEMORY_GPU;
+      const auto memory_id =
+          (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index();
 
       // Batch output doesn't support string data type yet, as it is not trivial
       // to parse string output
@@ -1906,16 +1950,16 @@ ModelInstanceState::ReadOutputTensors(
           return TRITONSERVER_ErrorNew(
               TRITONSERVER_ERROR_INVALID_ARG,
               (std::string("output '") + name +
-               "' is a scalar which is not supported.")
+               "' is a scalar which is not supported.")
                   .c_str());
         }
 
         responder.ProcessTensor(
-            name, output_dtype, batchn_shape, output_buffer,
-            memory_type, memory_id);
+            name, output_dtype, batchn_shape, output_buffer, memory_type,
+            memory_id);
       } else {
         responder.ProcessBatchOutput(
-            name, *batch_output, output_buffer, memory_type, memory_id);
+            name, *batch_output, output_buffer, memory_type, memory_id);
       }
     } else if (output_tensors[op_index].isList()) {
       // Custom handling for string/bytes tensor...
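
For reference, the net effect of this change: the backend reads an optional `MODULE_METHOD_NAME` entry from the model configuration's `parameters` section (typically something like `parameters: { key: "MODULE_METHOD_NAME" value: { string_value: "predict" } }` in config.pbtxt) and invokes that method on the TorchScript module via `get_method()` instead of always calling `forward()`. The snippet below is a minimal standalone LibTorch sketch of that call pattern, not part of this diff; the file name `model.pt` and the method name `predict` are hypothetical, and such a method would have to be exported when the module is scripted (e.g. with `@torch.jit.export`).

```cpp
#include <torch/script.h>

#include <iostream>
#include <string>
#include <vector>

int main()
{
  // Load a serialized TorchScript module; "model.pt" is a placeholder path.
  torch::jit::script::Module module = torch::jit::load("model.pt");

  // Counterpart of ModelState::ModuleMethodName(): default to "forward"
  // unless a different method name was configured.
  const std::string method_name = "predict";

  // get_method() fails if the module does not define the method, which is
  // why the backend resolves the configured name in ValidateInputs().
  torch::jit::Method method = module.get_method(method_name);

  // Collect positional inputs as IValues, as Execute() does with 'inputs'.
  std::vector<c10::IValue> inputs;
  inputs.push_back(torch::ones({1, 3}));

  // Invoke the configured method; assumes it returns a single tensor.
  c10::IValue output = method(inputs);
  std::cout << output.toTensor() << std::endl;
  return 0;
}
```

As in `Execute()` above, a dictionary-style model would instead receive its inputs as one `c10::Dict<std::string, at::Tensor>` IValue, while positional inputs are pushed onto the vector one tensor at a time.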