Skip to content

Commit

Permalink
Add health check API endpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
oneonestar authored Feb 5, 2025
1 parent e308c93 commit 3a82b83
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ Standard Helm options such as `replicaCount`, `image`, `imagePullSecrets`,
More detail about the chart are available in the [values
reference documentation](https://github.com/trinodb/charts/blob/main/charts/gateway/README.md)

### Health Checks
### Health checks on Trino clusters

The Trino Gateway periodically performs health checks and maintains
an in-memory TrinoStatus for each backend. If a backend fails a health check,
Expand Down
13 changes: 13 additions & 0 deletions docs/operation.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ documentation](https://trino.io/docs/current/admin/graceful-shutdown.html) for
more details.

## Query routing options

- The default router selects the backend randomly to route the queries.
- If you want to route the queries to the least loaded backend for a user
i.e. backend with the least number of queries running or queued from a particular user,
Expand Down Expand Up @@ -65,3 +66,15 @@ scrape_configs:
- targets:
- gateway1.example.com:8080
```

## Trino Gateway health endpoints

Trino Gateway provides two API endpoints to indicate the current status of the server:

* `/trino-gateway/livez` always returns status code 200, indicating the server is
alive. However, it might not respond if the Trino Gateway is too busy, stuck, or
taking a long time for garbage collection.
* `/trino-gateway/readyz` returns status code 200, indicating the server has
completed initialization and is ready to serve requests. This means the initial
connection to the database and the first round of health check on Trino clusters
are completed. Otherwise, status code 503 is returned.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import io.trino.gateway.ha.module.RouterBaseModule;
import io.trino.gateway.ha.module.StochasticRoutingManagerProvider;
import io.trino.gateway.ha.resource.EntityEditorResource;
import io.trino.gateway.ha.resource.GatewayHealthCheckResource;
import io.trino.gateway.ha.resource.GatewayResource;
import io.trino.gateway.ha.resource.GatewayViewResource;
import io.trino.gateway.ha.resource.GatewayWebAppResource;
Expand Down Expand Up @@ -179,6 +180,7 @@ private static void registerResources(Binder binder)
jaxrsBinder(binder).bind(PublicResource.class);
jaxrsBinder(binder).bind(TrinoResource.class);
jaxrsBinder(binder).bind(WebUIStaticResource.class);
jaxrsBinder(binder).bind(GatewayHealthCheckResource.class);
}

private static void registerAuthFilters(Binder binder)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ public class ActiveClusterMonitor
public static final int DEFAULT_THREAD_POOL_SIZE = 20;
private static final Logger log = Logger.get(ActiveClusterMonitor.class);

private volatile boolean isInitialized;
private final List<TrinoClusterStatsObserver> clusterStatsObservers;
private final GatewayBackendManager gatewayBackendManager;

Expand Down Expand Up @@ -83,6 +84,7 @@ public void start()
observer.observe(stats);
}
}
isInitialized = true;
}
catch (Exception e) {
log.error(e, "Error performing backend monitor tasks");
Expand All @@ -96,4 +98,9 @@ public void stop()
executorService.shutdownNow();
scheduledExecutor.shutdownNow();
}

public boolean isInitialized()
{
return isInitialized;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.gateway.ha.resource;

import com.google.inject.Inject;
import io.trino.gateway.ha.clustermonitor.ActiveClusterMonitor;
import jakarta.ws.rs.GET;
import jakarta.ws.rs.Path;
import jakarta.ws.rs.core.Response;

import static java.util.Objects.requireNonNull;

@Path("/trino-gateway")
public class GatewayHealthCheckResource
{
private final ActiveClusterMonitor activeClusterMonitor;

@Inject
public GatewayHealthCheckResource(ActiveClusterMonitor activeClusterMonitor)
{
this.activeClusterMonitor = requireNonNull(activeClusterMonitor, "activeClusterMonitor is null");
}

@GET
@Path("/livez")
public Response liveness()
{
return Response.ok("ok").build();
}

@GET
@Path("/readyz")
public Response readiness()
{
if (!activeClusterMonitor.isInitialized()) {
return Response
.status(Response.Status.SERVICE_UNAVAILABLE)
.entity("Trino Gateway is still initializing")
.build();
}
return Response.ok("ok").build();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import static com.google.common.collect.MoreCollectors.onlyElement;
import static com.google.common.net.HttpHeaders.CONTENT_TYPE;
import static com.google.common.net.MediaType.JSON_UTF_8;
import static com.google.common.util.concurrent.Uninterruptibles.sleepUninterruptibly;
import static org.assertj.core.api.Assertions.assertThat;
import static org.testcontainers.utility.MountableFile.forClasspathResource;

Expand Down Expand Up @@ -362,6 +363,30 @@ void testCookieSigning()
assertThat(callbackResponse.code()).isEqualTo(500);
}

@Test
void testHealthCheckEndpoints()
throws IOException
{
Request livenessCheck = new Request.Builder()
.url("http://localhost:" + routerPort + "/trino-gateway/livez")
.build();
Response livenessResponse = httpClient.newCall(livenessCheck).execute();
assertThat(livenessResponse.code()).isEqualTo(200);

Request readinessCheck = new Request.Builder()
.url("http://localhost:" + routerPort + "/trino-gateway/readyz")
.build();
for (int i = 0; i < 100; i++) {
try (Response readinessResponse = httpClient.newCall(readinessCheck).execute()) {
if (readinessResponse.code() == 200) {
return;
}
}
sleepUninterruptibly(100, TimeUnit.MILLISECONDS);
}
throw new IllegalStateException("Trino Gateway health check failed");
}

@AfterAll
void cleanup()
{
Expand Down

0 comments on commit 3a82b83

Please sign in to comment.