// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/core/tritonbackend.h"
namespace triton { namespace backend { namespace recommended {
//
// Backend that demonstrates the TRITONBACKEND API. This backend works
// for any model that has 1 input with any datatype and any shape and
// 1 output with the same shape and datatype as the input. The backend
// supports both batching and non-batching models.
//
// For each batch of requests, the backend returns the input tensor
// value in the output tensor.
//
/////////////
extern "C" {
// Triton calls TRITONBACKEND_Initialize when a backend is loaded into
// Triton to allow the backend to create and initialize any state that
// is intended to be shared across all models and model instances that
// use the backend. The backend should also verify version
// compatibility with Triton in this function.
//
TRITONSERVER_Error*
TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
std::string name(cname);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_Initialize: ") + name).c_str());
// Check the backend API version that Triton supports vs. what this
// backend was compiled against. Make sure that the Triton major
// version is the same and the minor version is >= what this backend
// uses.
uint32_t api_version_major, api_version_minor;
RETURN_IF_ERROR(
TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("Triton TRITONBACKEND API version: ") +
std::to_string(api_version_major) + "." +
std::to_string(api_version_minor))
.c_str());
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("'") + name + "' TRITONBACKEND API version: " +
std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." +
std::to_string(TRITONBACKEND_API_VERSION_MINOR))
.c_str());
if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||
(api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
"triton backend API version does not support this backend");
}
// The backend configuration may contain information needed by the
// backend, such as tritonserver command-line arguments. This
// backend doesn't use any such configuration but for this example
// print whatever is available.
TRITONSERVER_Message* backend_config_message;
RETURN_IF_ERROR(
TRITONBACKEND_BackendConfig(backend, &backend_config_message));
const char* buffer;
size_t byte_size;
RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(
backend_config_message, &buffer, &byte_size));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("backend configuration:\n") + buffer).c_str());
// This backend does not require any "global" state but as an
// example create a string to demonstrate.
std::string* state = new std::string("backend state");
RETURN_IF_ERROR(
TRITONBACKEND_BackendSetState(backend, reinterpret_cast<void*>(state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_Finalize when a backend is no longer
// needed.
//
TRITONSERVER_Error*
TRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)
{
// Delete the "global" state associated with the backend.
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate));
std::string* state = reinterpret_cast<std::string*>(vstate);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_Finalize: state is '") + *state + "'")
.c_str());
delete state;
return nullptr; // success
}
} // extern "C"
/////////////
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model. ModelState is derived from the BackendModel
// class provided in the backend utilities, which provides many common
// functions.
//
class ModelState : public BackendModel {
public:
static TRITONSERVER_Error* Create(
TRITONBACKEND_Model* triton_model, ModelState** state);
virtual ~ModelState() = default;
// Name of the input and output tensor
const std::string& InputTensorName() const { return input_name_; }
const std::string& OutputTensorName() const { return output_name_; }
// Datatype of the input and output tensor
TRITONSERVER_DataType TensorDataType() const { return datatype_; }
// Shape of the input and output tensor as given in the model
// configuration file. This shape will not include the batch
// dimension (if the model has one).
const std::vector<int64_t>& TensorNonBatchShape() const { return nb_shape_; }
// Shape of the input and output tensor, including the batch
// dimension (if the model has one). This method cannot be called
// until the model is completely loaded and initialized, including
// all instances of the model. In practice, this means that the
// backend should only call it in TRITONBACKEND_ModelInstanceExecute.
TRITONSERVER_Error* TensorShape(std::vector<int64_t>& shape);
// Validate that this model is supported by this backend.
TRITONSERVER_Error* ValidateModelConfig();
private:
ModelState(TRITONBACKEND_Model* triton_model);
std::string input_name_;
std::string output_name_;
TRITONSERVER_DataType datatype_;
bool shape_initialized_;
std::vector<int64_t> nb_shape_;
std::vector<int64_t> shape_;
};
ModelState::ModelState(TRITONBACKEND_Model* triton_model)
: BackendModel(triton_model), shape_initialized_(false)
{
// Validate that the model's configuration matches what is supported
// by this backend.
THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig());
}
TRITONSERVER_Error*
ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
try {
*state = new ModelState(triton_model);
}
catch (const BackendModelException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
TRITONSERVER_Error*
ModelState::TensorShape(std::vector<int64_t>& shape)
{
// This backend supports models that batch along the first dimension
// and those that don't batch. For non-batch models the output shape
// will be the shape from the model configuration. For batch models
// the output shape will be the shape from the model configuration
// prepended with [ -1 ] to represent the batch dimension. The
// backend "responder" utility used below will set the appropriate
// batch dimension value for each response. The shape needs to be
// initialized lazily because the SupportsFirstDimBatching function
// cannot be used until the model is completely loaded.
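// For example (using a hypothetical configuration), a non-batching
// model configured with dims [ 4 ] reports a shape of [ 4 ] here,
// while a model that batches along the first dimension reports
// [ -1, 4 ].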
if (!shape_initialized_) {
bool supports_first_dim_batching;
RETURN_IF_ERROR(SupportsFirstDimBatching(&supports_first_dim_batching));
if (supports_first_dim_batching) {
shape_.push_back(-1);
}
shape_.insert(shape_.end(), nb_shape_.begin(), nb_shape_.end());
shape_initialized_ = true;
}
shape = shape_;
return nullptr; // success
}
TRITONSERVER_Error*
ModelState::ValidateModelConfig()
{
// If verbose logging is enabled, dump the model's configuration as
// JSON into the console output.
if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
common::TritonJson::WriteBuffer buffer;
RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer));
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("model configuration:\n") + buffer.Contents()).c_str());
}
// ModelConfig is the model configuration as a TritonJson
// object. Use the TritonJson utilities to parse the JSON and
// determine if the configuration is supported by this backend.
common::TritonJson::Value inputs, outputs;
RETURN_IF_ERROR(ModelConfig().MemberAsArray("input", &inputs));
RETURN_IF_ERROR(ModelConfig().MemberAsArray("output", &outputs));
// The model must have exactly 1 input and 1 output.
RETURN_ERROR_IF_FALSE(
inputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("model configuration must have 1 input"));
RETURN_ERROR_IF_FALSE(
outputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,
std::string("model configuration must have 1 output"));
common::TritonJson::Value input, output;
RETURN_IF_ERROR(inputs.IndexAsObject(0, &input));
RETURN_IF_ERROR(outputs.IndexAsObject(0, &output));
// Record the input and output name in the model state.
const char* input_name;
size_t input_name_len;
RETURN_IF_ERROR(input.MemberAsString("name", &input_name, &input_name_len));
input_name_ = std::string(input_name);
const char* output_name;
size_t output_name_len;
RETURN_IF_ERROR(
output.MemberAsString("name", &output_name, &output_name_len));
output_name_ = std::string(output_name);
// Input and output must have the same datatype
std::string input_dtype, output_dtype;
RETURN_IF_ERROR(input.MemberAsString("data_type", &input_dtype));
RETURN_IF_ERROR(output.MemberAsString("data_type", &output_dtype));
RETURN_ERROR_IF_FALSE(
input_dtype == output_dtype, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected input and output datatype to match, got ") +
input_dtype + " and " + output_dtype);
datatype_ = ModelConfigDataTypeToTritonServerDataType(input_dtype);
// Input and output must have the same shape. Reshape is not supported
// on either input or output, so flag an error if the model
// configuration uses it.
triton::common::TritonJson::Value reshape;
RETURN_ERROR_IF_TRUE(
input.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,
std::string("reshape not supported for input tensor"));
RETURN_ERROR_IF_TRUE(
output.Find("reshape", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,
std::string("reshape not supported for output tensor"));
std::vector<int64_t> input_shape, output_shape;
RETURN_IF_ERROR(backend::ParseShape(input, "dims", &input_shape));
RETURN_IF_ERROR(backend::ParseShape(output, "dims", &output_shape));
RETURN_ERROR_IF_FALSE(
input_shape == output_shape, TRITONSERVER_ERROR_INVALID_ARG,
std::string("expected input and output shape to match, got ") +
backend::ShapeToString(input_shape) + " and " +
backend::ShapeToString(output_shape));
nb_shape_ = input_shape;
return nullptr; // success
}
extern "C" {
// Triton calls TRITONBACKEND_ModelInitialize when a model is loaded
// to allow the backend to create any state associated with the model,
// and to also examine the model configuration to determine if the
// configuration is suitable for the backend. Any errors reported by
// this function will prevent the model from loading.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
// Create a ModelState object and associate it with the
// TRITONBACKEND_Model. If anything goes wrong with initialization
// of the model state then an error is returned and Triton will fail
// to load the model.
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_ModelFinalize when a model is no longer
// needed. The backend should cleanup any state associated with the
// model. This function will not be called until all model instances
// of the model have been finalized.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
delete model_state;
return nullptr; // success
}
} // extern "C"
/////////////
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each
// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from the
// BackendModelInstance class provided in the backend utilities, which
// provides many common functions.
//
class ModelInstanceState : public BackendModelInstance {
public:
static TRITONSERVER_Error* Create(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state);
virtual ~ModelInstanceState() = default;
// Get the state of the model that corresponds to this instance.
ModelState* StateForModel() const { return model_state_; }
private:
ModelInstanceState(
ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance)
: BackendModelInstance(model_state, triton_model_instance),
model_state_(model_state)
{
}
ModelState* model_state_;
};
TRITONSERVER_Error*
ModelInstanceState::Create(
ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state)
{
try {
*state = new ModelInstanceState(model_state, triton_model_instance);
}
catch (const BackendModelInstanceException& ex) {
RETURN_ERROR_IF_TRUE(
ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelInstanceException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
extern "C" {
// Triton calls TRITONBACKEND_ModelInstanceInitialize when a model
// instance is created to allow the backend to initialize any state
// associated with the instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
// Get the model state associated with this instance's model.
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
// Create a ModelInstanceState object and associate it with the
// TRITONBACKEND_ModelInstance.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(
ModelInstanceState::Create(model_state, instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(
instance, reinterpret_cast<void*>(instance_state)));
return nullptr; // success
}
// Triton calls TRITONBACKEND_ModelInstanceFinalize when a model
// instance is no longer needed. The backend should cleanup any state
// associated with the model instance.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
reinterpret_cast<ModelInstanceState*>(vstate);
delete instance_state;
return nullptr; // success
}
} // extern "C"
/////////////
extern "C" {
// When Triton calls TRITONBACKEND_ModelInstanceExecute it is required
// that a backend create a response for each request in the batch. A
// response may be the output tensors required for that request or may
// be an error that is returned in the response.
//
TRITONSERVER_Error*
TRITONBACKEND_ModelInstanceExecute(
TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
// Collect various timestamps during the execution of this batch of
// requests. These values are reported below before returning from
// the function.
uint64_t exec_start_ns = 0;
SET_TIMESTAMP(exec_start_ns);
// Triton will not call this function simultaneously for the same
// 'instance'. But since this backend could be used by multiple
// instances from multiple models the implementation needs to handle
// multiple calls to this function at the same time (with different
// 'instance' objects). Best practice for a high-performance
// implementation is to avoid introducing mutex/lock and instead use
// only function-local and model-instance-specific state.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(
instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();
// 'responses' is initialized as a parallel array to 'requests',
// with one TRITONBACKEND_Response object for each
// TRITONBACKEND_Request object. If something goes wrong while
// creating these response objects, the backend simply returns an
// error from TRITONBACKEND_ModelInstanceExecute, indicating to
// Triton that this backend did not create or send any responses and
// so it is up to Triton to create and send an appropriate error
// response for each request. RETURN_IF_ERROR is one of several
// useful macros for error handling that can be found in
// backend_common.h.
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
for (uint32_t r = 0; r < request_count; ++r) {
TRITONBACKEND_Request* request = requests[r];
TRITONBACKEND_Response* response;
RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));
responses.push_back(response);
}
// At this point, the backend takes ownership of 'requests', which
// means that it is responsible for sending a response for every
// request. From here, even if something goes wrong in processing,
// the backend must return 'nullptr' from this function to indicate
// success. Any errors and failures must be communicated via the
// response objects.
//
// To simplify error handling, the backend utilities manage
// 'responses' in a specific way and it is recommended that backends
// follow this same pattern. When an error is detected in the
// processing of a request, an appropriate error response is sent
// and the corresponding TRITONBACKEND_Response object within
// 'responses' is set to nullptr to indicate that the
// request/response has already been handled and no further processing
// should be performed for that request. Even if all responses fail,
// the backend still allows execution to flow to the end of the
// function so that statistics are correctly reported by the calls
// to TRITONBACKEND_ModelInstanceReportStatistics and
// TRITONBACKEND_ModelInstanceReportBatchStatistics.
// RESPOND_AND_SET_NULL_IF_ERROR and
// RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from
// backend_common.h that assist in this management of response
// objects.
// The backend could iterate over the 'requests' and process each
// one separately. But for performance reasons it is usually
// preferred to create batched input tensors that are processed
// simultaneously. This is especially true for devices like GPUs
// that are capable of exploiting the large amount of parallelism
// exposed by larger data sets.
//
// The backend utilities provide a "collector" to facilitate this
// batching process. The 'collector's ProcessTensor function will
// combine a tensor's value from each request in the batch into a
// single contiguous buffer. The buffer can be provided by the
// backend or 'collector' can create and manage it. In this backend,
// there is not a specific buffer into which the batch should be
// created, so use ProcessTensor arguments that cause the collector to
// manage it. ProcessTensor does NOT support TRITONSERVER_TYPE_BYTES
// data type.
BackendInputCollector collector(
requests, request_count, &responses, model_state->TritonMemoryManager(),
false /* pinned_enabled */, nullptr /* stream*/);
// To instruct ProcessTensor to "gather" the entire batch of input
// tensors into a single contiguous buffer in CPU memory, set the
// "allowed input types" to be the CPU ones (see tritonserver.h in
// the triton-inference-server/core repo for allowed memory types).
std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed_input_types =
{{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};
const char* input_buffer;
size_t input_buffer_byte_size;
TRITONSERVER_MemoryType input_buffer_memory_type;
int64_t input_buffer_memory_type_id;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count,
collector.ProcessTensor(
model_state->InputTensorName().c_str(), nullptr /* existing_buffer */,
0 /* existing_buffer_byte_size */, allowed_input_types, &input_buffer,
&input_buffer_byte_size, &input_buffer_memory_type,
&input_buffer_memory_type_id));
// Finalize the collector. If 'true' is returned, 'input_buffer'
// will not be valid until the backend synchronizes the CUDA
// stream or event that was used when creating the collector. For
// this backend, GPU is not supported and so no CUDA sync should
// be needed; if 'true' is returned, simply log an error.
const bool need_cuda_input_sync = collector.Finalize();
if (need_cuda_input_sync) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
"'recommended' backend: unexpected CUDA sync required by collector");
}
// 'input_buffer' contains the batched input tensor. The backend can
// implement whatever logic is necessary to produce the output
// tensor. This backend simply logs the input tensor value and then
// returns the input tensor value in the output tensor so no actual
// computation is needed.
uint64_t compute_start_ns = 0;
SET_TIMESTAMP(compute_start_ns);
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("model ") + model_state->Name() + ": requests in batch " +
std::to_string(request_count))
.c_str());
std::string tstr;
IGNORE_ERROR(BufferAsTypedString(
tstr, input_buffer, input_buffer_byte_size,
model_state->TensorDataType()));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("batched " + model_state->InputTensorName() + " value: ") +
tstr)
.c_str());
const char* output_buffer = input_buffer;
TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type;
int64_t output_buffer_memory_type_id = input_buffer_memory_type_id;
uint64_t compute_end_ns = 0;
SET_TIMESTAMP(compute_end_ns);
bool supports_first_dim_batching;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count,
model_state->SupportsFirstDimBatching(&supports_first_dim_batching));
std::vector<int64_t> tensor_shape;
RESPOND_ALL_AND_SET_NULL_IF_ERROR(
responses, request_count, model_state->TensorShape(tensor_shape));
// Because the output tensor values are concatenated into a single
// contiguous 'output_buffer', the backend must "scatter" them out
// to the individual response output tensors. The backend utilities
// provide a "responder" to facilitate this scattering process.
// BackendOutputResponder does NOT support TRITONSERVER_TYPE_BYTES
// data type.
// The 'responder's ProcessTensor function will copy the portion of
// 'output_buffer' corresponding to each request's output into the
// response for that request.
BackendOutputResponder responder(
requests, request_count, &responses, model_state->TritonMemoryManager(),
supports_first_dim_batching, false /* pinned_enabled */,
nullptr /* stream*/);
responder.ProcessTensor(
model_state->OutputTensorName().c_str(), model_state->TensorDataType(),
tensor_shape, output_buffer, output_buffer_memory_type,
output_buffer_memory_type_id);
// Finalize the responder. If 'true' is returned, the output
// tensors' data will not be valid until the backend synchronizes
// the CUDA stream or event that was used when creating the
// responder. For this backend, GPU is not supported and so no CUDA
// sync should be needed; if 'true' is returned, simply log an
// error.
const bool need_cuda_output_sync = responder.Finalize();
if (need_cuda_output_sync) {
LOG_MESSAGE(
TRITONSERVER_LOG_ERROR,
"'recommended' backend: unexpected CUDA sync required by responder");
}
// Send all the responses that haven't already been sent because of
// an earlier error.
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(
TRITONBACKEND_ResponseSend(
response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send response");
}
}
uint64_t exec_end_ns = 0;
SET_TIMESTAMP(exec_end_ns);
#ifdef TRITON_ENABLE_STATS
// For batch statistics we need to know the total batch size of the
// requests. This is not necessarily just the number of requests,
// because if the model supports batching then any request can be a
// batched request itself.
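// For example, if the model batches along the first dimension and the
// current batch holds two requests whose inputs have shapes [ 3, 4 ]
// and [ 5, 4 ], the total batch size is 3 + 5 = 8; without batching it
// would simply be the request count, 2.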
size_t total_batch_size = 0;
if (!supports_first_dim_batching) {
total_batch_size = request_count;
} else {
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
TRITONBACKEND_Input* input = nullptr;
LOG_IF_ERROR(
TRITONBACKEND_RequestInputByIndex(request, 0 /* index */, &input),
"failed getting request input");
if (input != nullptr) {
const int64_t* shape = nullptr;
LOG_IF_ERROR(
TRITONBACKEND_InputProperties(
input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr),
"failed getting input properties");
if (shape != nullptr) {
total_batch_size += shape[0];
}
}
}
}
#else
(void)exec_start_ns;
(void)exec_end_ns;
(void)compute_start_ns;
(void)compute_end_ns;
#endif // TRITON_ENABLE_STATS
// Report statistics for each request, and then release the request.
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
#ifdef TRITON_ENABLE_STATS
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportStatistics(
instance_state->TritonModelInstance(), request,
(responses[r] != nullptr) /* success */, exec_start_ns,
compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting request statistics");
#endif // TRITON_ENABLE_STATS
LOG_IF_ERROR(
TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request");
}
#ifdef TRITON_ENABLE_STATS
// Report batch statistics.
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportBatchStatistics(
instance_state->TritonModelInstance(), total_batch_size,
exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting batch request statistics");
#endif // TRITON_ENABLE_STATS
return nullptr; // success
}
} // extern "C"
}}} // namespace triton::backend::recommended