Commit ae4948f

feat(compute): Add perf test for compute startup time breakdown

1 parent 99639c2 commit ae4948f

3 files changed: +240 −194 lines changed
@@ -0,0 +1,240 @@
from __future__ import annotations

import pytest
import requests
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin


# Just start and measure duration.
#
# This test runs pretty quickly and can be informative when used in combination
# with emulated network delay. Some useful delay commands:
#
# 1. Add 2msec delay to all localhost traffic
#    `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
#
# 2. Test that it works (you should see 4ms ping)
#    `ping localhost`
#
# 3. Revert back to normal
#    `sudo tc qdisc del dev lo root netem`
#
# NOTE: this test might not represent the real startup time because the basebackup
# for a large database might be larger if there's a lot of transaction metadata,
# or safekeepers might need more syncing, or there might be more operations to
# apply during the config step, like more users, databases, or extensions. By
# default we load the extensions 'neon,pg_stat_statements,timescaledb,pg_cron',
# but in this test we only load neon.
def test_compute_startup_simple(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    env.create_branch("test_startup")

    endpoint = None

    # We do two iterations so we can see if the second startup is faster. It
    # should be, because the compute node should already be configured with
    # roles, databases, extensions, etc. from the first run.
    for i in range(2):
        # Start
        with zenbenchmark.record_duration(f"{i}_start_and_select"):
            if endpoint:
                endpoint.start()
            else:
                endpoint = env.endpoints.create(
                    "test_startup",
                    # Shared buffers need to be allocated during startup, so they
                    # impact startup time. This is the default value we use for
                    # 1-CPU pods (maybe different for VMs).
                    #
                    # TODO: extensions also contribute to shared memory allocation,
                    # and this test doesn't include all default extensions we
                    # load.
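                    #
                    # (For reference: 262144 buffers at PostgreSQL's default
                    # 8 kB block size is 2 GiB of shared buffers.)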
                    config_lines=["shared_buffers=262144"],
                )
                # Do not skip pg_catalog updates at first start, i.e.
                # imitate 'the first start after project creation'.
                endpoint.respec(skip_pg_catalog_updates=False)
                endpoint.start()
            endpoint.safe_psql("select 1;")

        # Get metrics
        metrics = requests.get(
            f"http://localhost:{endpoint.external_http_port}/metrics.json"
        ).json()
        durations = {
            "wait_for_spec_ms": f"{i}_wait_for_spec",
            "sync_safekeepers_ms": f"{i}_sync_safekeepers",
            "sync_sk_check_ms": f"{i}_sync_sk_check",
            "basebackup_ms": f"{i}_basebackup",
            "start_postgres_ms": f"{i}_start_postgres",
            "config_ms": f"{i}_config",
            "total_startup_ms": f"{i}_total_startup",
        }
        for key, name in durations.items():
            value = metrics[key]
            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)

        # Check basebackup size makes sense
        basebackup_bytes = metrics["basebackup_bytes"]
        if i > 0:
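            # 100 * 1024 bytes = 100 KiB; the database is still nearly empty,
            # so the second basebackup should be tiny.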
            assert basebackup_bytes < 100 * 1024

        # Stop so we can restart
        endpoint.stop()

        # Imitate optimizations that the console would do for the second start
        endpoint.respec(skip_pg_catalog_updates=True)

# Start and measure duration with huge SLRU segments.
# This test is similar to test_compute_startup_simple, but it creates a huge
# number of transactions and records containing these XIDs. Autovacuum is
# disabled for the table to prevent CLOG truncation.
# TODO: this is a very suspicious test; I doubt that it does what it's supposed
# to do, e.g. these two starts do not make much sense. Looks like it's just
# copy-paste. To be fixed within https://github.com/neondatabase/cloud/issues/8673
@pytest.mark.timeout(1800)
@pytest.mark.parametrize("slru", ["lazy", "eager"])
def test_compute_ondemand_slru_startup(
    slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker
):
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

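    # With lazy_slru_download enabled, SLRU segments (e.g. the CLOG) are
    # fetched from the pageserver on demand instead of being shipped in the
    # basebackup.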
    lazy_slru_download = "true" if slru == "lazy" else "false"
    tenant, _ = env.create_tenant(
        conf={
            "lazy_slru_download": lazy_slru_download,
        }
    )

    endpoint = env.endpoints.create_start("main", tenant_id=tenant)
    with endpoint.cursor() as cur:
        cur.execute("CREATE TABLE t (pk integer PRIMARY KEY, x integer)")
        cur.execute("ALTER TABLE t SET (autovacuum_enabled = false)")
        cur.execute("INSERT INTO t VALUES (1, 0)")
        cur.execute(
            """
            CREATE PROCEDURE updating() as
            $$
            DECLARE
                i integer;
            BEGIN
                FOR i IN 1..1000000 LOOP
                    UPDATE t SET x = x + 1 WHERE pk=1;
                    COMMIT;
                END LOOP;
            END
            $$ LANGUAGE plpgsql
            """
        )
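        # Each committed UPDATE consumes one XID, so calling updating() burns
        # ~1,000,000 XIDs and inflates the CLOG.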
cur.execute("SET statement_timeout=0")
137+
cur.execute("call updating()")
138+
139+
endpoint.stop()
140+
141+
# We do two iterations so we can see if the second startup is faster. It should
142+
# be because the compute node should already be configured with roles, databases,
143+
# extensions, etc from the first run.
144+
for i in range(2):
145+
# Start
146+
with zenbenchmark.record_duration(f"{slru}_{i}_start"):
147+
endpoint.start()
148+
149+
with zenbenchmark.record_duration(f"{slru}_{i}_select"):
150+
sum = endpoint.safe_psql("select sum(x) from t")[0][0]
151+
assert sum == 1000000
152+
153+
# Get metrics
154+
metrics = requests.get(
155+
f"http://localhost:{endpoint.external_http_port}/metrics.json"
156+
).json()
157+
durations = {
158+
"wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
159+
"sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",
160+
"sync_sk_check_ms": f"{slru}_{i}_sync_sk_check",
161+
"basebackup_ms": f"{slru}_{i}_basebackup",
162+
"start_postgres_ms": f"{slru}_{i}_start_postgres",
163+
"config_ms": f"{slru}_{i}_config",
164+
"total_startup_ms": f"{slru}_{i}_total_startup",
165+
}
166+
for key, name in durations.items():
167+
value = metrics[key]
168+
zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)
169+
170+
basebackup_bytes = metrics["basebackup_bytes"]
171+
zenbenchmark.record(
172+
f"{slru}_{i}_basebackup_bytes",
173+
basebackup_bytes,
174+
"bytes",
175+
report=MetricReport.LOWER_IS_BETTER,
176+
)
177+
178+
# Stop so we can restart
179+
endpoint.stop()
180+
181+
# Imitate optimizations that console would do for the second start
182+
endpoint.respec(skip_pg_catalog_updates=True)
183+
184+
185+
@pytest.mark.timeout(240)
def test_compute_startup_latency(
    neon_env_builder: NeonEnvBuilder,
    pg_bin: PgBin,
    zenbenchmark: NeonBenchmarker,
):
    """
    Do NUM_STARTS 'optimized' starts, i.e. with pg_catalog updates skipped,
    and measure the duration of each step. Report p50, p90, p99 latencies.
    """
    neon_env_builder.num_safekeepers = 3
    env = neon_env_builder.init_start()

    endpoint = env.endpoints.create_start("main")
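    # pgbench init steps: (d)rop, create (t)ables, (G)enerate data server-side,
    # (v)acuum, create (p)rimary keys; scale factor 4 is 400,000 account rows.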
    pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s4", endpoint.connstr()])
    endpoint.stop()

    NUM_STARTS = 100

    durations: dict[str, list[int]] = {
        "sync_sk_check_ms": [],
        "sync_safekeepers_ms": [],
        "basebackup_ms": [],
        "start_postgres_ms": [],
        "total_startup_ms": [],
    }

    for _i in range(NUM_STARTS):
        endpoint.start()
        client = endpoint.http_client()
        metrics = client.metrics_json()
        for key in durations.keys():
            value = metrics[key]
            durations[key].append(value)
        endpoint.stop()

    for key in durations.keys():
        durations[key] = sorted(durations[key])
        zenbenchmark.record(
            f"{key}_p50",
            durations[key][len(durations[key]) // 2],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
        zenbenchmark.record(
            f"{key}_p90",
            durations[key][len(durations[key]) * 9 // 10],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
        zenbenchmark.record(
            f"{key}_p99",
            durations[key][len(durations[key]) * 99 // 100],
            "ms",
            report=MetricReport.LOWER_IS_BETTER,
        )
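The percentile reporting above picks the nearest rank from the sorted samples
inline. As a minimal sketch (the `percentile` helper is hypothetical, not part
of this commit), the three record calls could be collapsed like this; for
NUM_STARTS = 100 it selects the same indices as the inline arithmetic:

    def percentile(sorted_values: list[int], fraction: float) -> int:
        # Nearest-rank selection from a list that is already sorted ascending.
        index = min(int(len(sorted_values) * fraction), len(sorted_values) - 1)
        return sorted_values[index]

    for key in durations.keys():
        values = sorted(durations[key])
        for fraction, label in [(0.50, "p50"), (0.90, "p90"), (0.99, "p99")]:
            zenbenchmark.record(
                f"{key}_{label}",
                percentile(values, fraction),
                "ms",
                report=MetricReport.LOWER_IS_BETTER,
            )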

test_runner/performance/test_lazy_startup.py

-110 lines changed. This file was deleted.
