Skip to content

Commit 2f46967

Browse files
authored
YQ kqprun supported retries (#16031)
1 parent 5f13bfb commit 2f46967

File tree

4 files changed

+86
-20
lines changed

4 files changed

+86
-20
lines changed

ydb/tests/tools/kqprun/configuration/app_config.conf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,12 +188,12 @@ ResourceBrokerConfig {
188188
Weight: 30
189189

190190
Limit {
191-
Memory: 6442450944
191+
Memory: 64424509440
192192
}
193193
}
194194

195195
ResourceLimit {
196-
Memory: 6442450944
196+
Memory: 64424509440
197197
}
198198
}
199199

ydb/tests/tools/kqprun/kqprun.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -690,6 +690,19 @@ class TMain : public TMainBase {
690690
.NoArgument()
691691
.SetFlag(&RunnerOptions.YdbSettings.SameSession);
692692

693+
options.AddLongOption("retry", "Retry queries which failed with specific status")
694+
.RequiredArgument("status")
695+
.Handler1([this](const NLastGetopt::TOptsParser* option) {
696+
const TString statusName(option->CurValOrDef());
697+
Ydb::StatusIds::StatusCode status;
698+
if (!Ydb::StatusIds::StatusCode_Parse(statusName, &status)) {
699+
ythrow yexception() << "Invalid status to retry: " << statusName << ", should be one of Ydb::StatusIds::StatusCode";
700+
}
701+
if (!RunnerOptions.RetryableStatuses.emplace(status).second) {
702+
ythrow yexception() << "Got duplicated status to retry: " << statusName;
703+
}
704+
});
705+
693706
// Cluster settings
694707

695708
options.AddLongOption('N', "node-count", "Number of nodes to create")

ydb/tests/tools/kqprun/src/common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ struct TRunnerOptions {
8383
std::optional<size_t> TraceOptScriptId;
8484

8585
TDuration ScriptCancelAfter;
86+
std::unordered_set<Ydb::StatusIds::StatusCode> RetryableStatuses;
8687

8788
TYdbSetupSettings YdbSettings;
8889
};

ydb/tests/tools/kqprun/src/kqp_runner.cpp

Lines changed: 70 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ namespace NKqpRun {
1111

1212
class TKqpRunner::TImpl {
1313
using EVerbose = TYdbSetupSettings::EVerbose;
14+
using IRetryPolicy = IRetryPolicy<Ydb::StatusIds::StatusCode>;
15+
16+
static constexpr TDuration RETRY_PERIOD = TDuration::MilliSeconds(100);
1417

1518
public:
1619
enum class EQueryType {
@@ -28,7 +31,28 @@ class TKqpRunner::TImpl {
2831
, CoutColors_(NColorizer::AutoColors(Cout))
2932
{}
3033

31-
bool ExecuteSchemeQuery(const TRequestOptions& query) const {
34+
// Runs queryRunner repeatedly until it returns Ydb::StatusIds::SUCCESS (result: true)
// or RetryState_ yields no further retry delay for the returned status (result: false).
bool ExecuteWithRetries(std::function<Ydb::StatusIds::StatusCode()> queryRunner) {
    // Drop the state left over from a previous call; it is recreated lazily on the first failure.
    RetryState_ = nullptr;

    for (;;) {
        const Ydb::StatusIds::StatusCode attemptStatus = queryRunner();
        if (attemptStatus == Ydb::StatusIds::SUCCESS) {
            return true;
        }

        if (!RetryState_) {
            SetupRetryState();
        }

        const auto retryDelay = RetryState_->GetNextRetryDelay(attemptStatus);
        if (!retryDelay) {
            // No delay was produced for this status, so the failure is final.
            return false;
        }

        Cout << CoutColors_.Yellow() << TInstant::Now().ToIsoStringLocal() << " Retrying query execution in " << *retryDelay << "..." << CoutColors_.Default() << Endl;
        Sleep(*retryDelay);
    }
}
54+
55+
Ydb::StatusIds::StatusCode ExecuteSchemeQuery(const TRequestOptions& query) const {
3256
StartSchemeTraceOpt();
3357

3458
if (VerboseLevel_ >= EVerbose::QueriesText) {
@@ -43,13 +67,13 @@ class TKqpRunner::TImpl {
4367

4468
if (!status.IsSuccess()) {
4569
Cerr << CerrColors_.Red() << "Failed to execute scheme query, reason:" << CerrColors_.Default() << Endl << status.ToString() << Endl;
46-
return false;
70+
return status.Status;
4771
}
4872

49-
return true;
73+
return Ydb::StatusIds::SUCCESS;
5074
}
5175

52-
bool ExecuteScript(const TRequestOptions& script) {
76+
Ydb::StatusIds::StatusCode ExecuteScript(const TRequestOptions& script) {
5377
StartScriptTraceOpt(script.QueryId);
5478

5579
if (VerboseLevel_ >= EVerbose::QueriesText) {
@@ -60,7 +84,7 @@ class TKqpRunner::TImpl {
6084

6185
if (!status.IsSuccess()) {
6286
Cerr << CerrColors_.Red() << "Failed to start script execution, reason:" << CerrColors_.Default() << Endl << status.ToString() << Endl;
63-
return false;
87+
return status.Status;
6488
}
6589

6690
ExecutionMeta_ = TExecutionMeta();
@@ -69,7 +93,7 @@ class TKqpRunner::TImpl {
6993
return WaitScriptExecutionOperation(script.QueryId);
7094
}
7195

72-
bool ExecuteQuery(const TRequestOptions& query, EQueryType queryType) {
96+
Ydb::StatusIds::StatusCode ExecuteQuery(const TRequestOptions& query, EQueryType queryType) {
7397
StartScriptTraceOpt(query.QueryId);
7498
StartTime_ = TInstant::Now();
7599

@@ -93,7 +117,7 @@ class TKqpRunner::TImpl {
93117

94118
case EQueryType::AsyncQuery:
95119
YdbSetup_.QueryRequestAsync(query);
96-
return true;
120+
return Ydb::StatusIds::SUCCESS;
97121
}
98122

99123
TYdbSetup::StopTraceOpt();
@@ -109,14 +133,14 @@ class TKqpRunner::TImpl {
109133

110134
if (!status.IsSuccess()) {
111135
Cerr << CerrColors_.Red() << "Failed to execute query, reason:" << CerrColors_.Default() << Endl << status.ToString() << Endl;
112-
return false;
136+
return status.Status;
113137
}
114138

115139
if (!status.Issues.Empty()) {
116140
Cerr << CerrColors_.Red() << "Request finished with issues:" << CerrColors_.Default() << Endl << status.Issues.ToString() << Endl;
117141
}
118142

119-
return true;
143+
return Ydb::StatusIds::SUCCESS;
120144
}
121145

122146
void FinalizeRunner() const {
@@ -171,7 +195,7 @@ class TKqpRunner::TImpl {
171195
}
172196

173197
private:
174-
bool WaitScriptExecutionOperation(ui64 queryId) {
198+
Ydb::StatusIds::StatusCode WaitScriptExecutionOperation(ui64 queryId) {
175199
StartTime_ = TInstant::Now();
176200
Y_DEFER {
177201
TYdbSetup::StopTraceOpt();
@@ -193,15 +217,15 @@ class TKqpRunner::TImpl {
193217

194218
if (!status.IsSuccess()) {
195219
Cerr << CerrColors_.Red() << "Failed to get script execution operation, reason:" << CerrColors_.Default() << Endl << status.ToString() << Endl;
196-
return false;
220+
return status.Status;
197221
}
198222

199223
if (Options_.ScriptCancelAfter && TInstant::Now() - StartTime_ > Options_.ScriptCancelAfter) {
200224
Cout << CoutColors_.Yellow() << TInstant::Now().ToIsoStringLocal() << " Cancelling script execution..." << CoutColors_.Default() << Endl;
201225
TRequestResult cancelStatus = YdbSetup_.CancelScriptExecutionOperationRequest(ExecutionMeta_.Database, ExecutionOperation_);
202226
if (!cancelStatus.IsSuccess()) {
203227
Cerr << CerrColors_.Red() << "Failed to cancel script execution operation, reason:" << CerrColors_.Default() << Endl << cancelStatus.ToString() << Endl;
204-
return false;
228+
return cancelStatus.Status;
205229
}
206230
}
207231

@@ -215,14 +239,14 @@ class TKqpRunner::TImpl {
215239

216240
if (!status.IsSuccess() || ExecutionMeta_.ExecutionStatus != NYdb::NQuery::EExecStatus::Completed) {
217241
Cerr << CerrColors_.Red() << "Failed to execute script, invalid final status, reason:" << CerrColors_.Default() << Endl << status.ToString() << Endl;
218-
return false;
242+
return status.Status;
219243
}
220244

221245
if (!status.Issues.Empty()) {
222246
Cerr << CerrColors_.Red() << "Request finished with issues:" << CerrColors_.Default() << Endl << status.Issues.ToString() << Endl;
223247
}
224248

225-
return true;
249+
return Ydb::StatusIds::SUCCESS;
226250
}
227251

228252
void StartSchemeTraceOpt() const {
@@ -304,9 +328,29 @@ class TKqpRunner::TImpl {
304328
Cout << CoutColors_.Default() << Endl;
305329
}
306330

331+
void SetupRetryState() {
332+
if (!RetryPolicy_) {
333+
const auto retryFunc = [this](Ydb::StatusIds::StatusCode status) {
334+
if (Options_.RetryableStatuses.contains(status)) {
335+
return ERetryErrorClass::ShortRetry;
336+
}
337+
return ERetryErrorClass::NoRetry;
338+
};
339+
RetryPolicy_ = IRetryPolicy::GetExponentialBackoffPolicy(
340+
retryFunc,
341+
RETRY_PERIOD,
342+
RETRY_PERIOD,
343+
TDuration::Seconds(1)
344+
);
345+
}
346+
RetryState_ = RetryPolicy_->CreateRetryState();
347+
}
348+
307349
private:
308350
TRunnerOptions Options_;
309351
EVerbose VerboseLevel_;
352+
IRetryPolicy::TPtr RetryPolicy_;
353+
IRetryPolicy::IRetryState::TPtr RetryState_;
310354

311355
TYdbSetup YdbSetup_;
312356
TStatsPrinter StatsPrinter_;
@@ -327,19 +371,27 @@ TKqpRunner::TKqpRunner(const TRunnerOptions& options)
327371
{}
328372

329373
bool TKqpRunner::ExecuteSchemeQuery(const TRequestOptions& query) const {
330-
return Impl_->ExecuteSchemeQuery(query);
374+
return Impl_->ExecuteWithRetries([this, query]() {
375+
return Impl_->ExecuteSchemeQuery(query);
376+
});
331377
}
332378

333379
bool TKqpRunner::ExecuteScript(const TRequestOptions& script) const {
334-
return Impl_->ExecuteScript(script);
380+
return Impl_->ExecuteWithRetries([this, script]() {
381+
return Impl_->ExecuteScript(script);
382+
});
335383
}
336384

337385
bool TKqpRunner::ExecuteQuery(const TRequestOptions& query) const {
338-
return Impl_->ExecuteQuery(query, TImpl::EQueryType::ScriptQuery);
386+
return Impl_->ExecuteWithRetries([this, query]() {
387+
return Impl_->ExecuteQuery(query, TImpl::EQueryType::ScriptQuery);
388+
});
339389
}
340390

341391
bool TKqpRunner::ExecuteYqlScript(const TRequestOptions& query) const {
342-
return Impl_->ExecuteQuery(query, TImpl::EQueryType::YqlScriptQuery);
392+
return Impl_->ExecuteWithRetries([this, query]() {
393+
return Impl_->ExecuteQuery(query, TImpl::EQueryType::YqlScriptQuery);
394+
});
343395
}
344396

345397
void TKqpRunner::ExecuteQueryAsync(const TRequestOptions& query) const {

0 commit comments

Comments
 (0)