Skip to content

Commit a9906f0

Browse files
pdillingermeta-codesync[bot]
authored andcommitted
A better approach to clearing DBs for crash test (#14254)
Summary: Clearing DB dir for crash test is currently a hodgepodge of 1. Caller of db_crashtest.py maybe tries to clear the dir 2. db_crashtest.py tries to clear the dir in get_dbname() (but ignoring failure) 3. db_crashtest.py passes --destroy_db_initially to some db_stress calls as needed 4. db_crashtest.py tries to clear the dir between some db_stress calls 5. db_crashtest.py tries to clear the dir after everything is done and successful (no artifacts to investigate or save) (but ignoring failure) 6. Try to add more uniqueness to the directory from #14249 This change reverts or replaces 2, 4, 5, and 6 by doubling-down on (expanding) 3 and a small variant of it: * crash_test.mk passes --destroy_db_initially=1 so that the first run of db_stress clears the db dir. * After each db_stress invocation, db_crashtest.py resets destroy_db_initially=0 so that the next invocation reuses the same DB, except in cases where there is an incompatibility that requires a fresh DB (from cases 3 and 4 above). * On success, uses new `db_stress --destroy_db_and_exit` option to clean up the DB dir without needing a custom cleanup_cmd (now ignored) Note that although case 1 is likely obsolete, it is out of control of an open source PR. Pull Request resolved: #14254 Test Plan: some manual runs Reviewed By: xingbowang Differential Revision: D91164731 Pulled By: pdillinger fbshipit-source-id: 0a66c8c0e130c9eeacc55af411a18a09bc9debdf
1 parent b89d290 commit a9906f0

File tree

7 files changed

+72
-45
lines changed

7 files changed

+72
-45
lines changed

crash_test.mk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ DB_STRESS_CMD?=./db_stress
88
include common.mk
99

1010
CRASHTEST_MAKE=$(MAKE) -f crash_test.mk
11-
CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)'
11+
CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)' --destroy_db_initially=1
1212

1313
.PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \
1414
crash_test_with_wc_txn crash_test_with_wp_txn crash_test_with_wup_txn \

db_stress_tool/db_stress_common.cc

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,5 +877,24 @@ Status DestroyUnverifiedSubdir(const std::string& dirname) {
877877
return s;
878878
}
879879

880+
Status DbStressDestroyDb(const std::string& db_path) {
881+
Status s;
882+
Options options;
883+
// NOTE: using db_stress_listener_env in order to see obsolete MANIFEST files
884+
options.env = db_stress_listener_env;
885+
// Remove DB files in a principled way to avoid issues
886+
if (FLAGS_use_blob_db) {
887+
s = blob_db::DestroyBlobDB(db_path, options, blob_db::BlobDBOptions());
888+
} else {
889+
s = DestroyDB(db_path, options);
890+
}
891+
if (!s.ok()) {
892+
return s;
893+
}
894+
// Remove everything else recursively, only reporting success if able to
895+
// delete everything
896+
return DestroyDir(db_stress_listener_env, db_path);
897+
}
898+
880899
} // namespace ROCKSDB_NAMESPACE
881900
#endif // GFLAGS

db_stress_tool/db_stress_common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ DECLARE_bool(enable_pipelined_write);
100100
DECLARE_bool(verify_before_write);
101101
DECLARE_bool(histogram);
102102
DECLARE_bool(destroy_db_initially);
103+
DECLARE_bool(destroy_db_and_exit);
103104
DECLARE_bool(verbose);
104105
DECLARE_bool(progress_reports);
105106
DECLARE_uint64(db_write_buffer_size);
@@ -820,5 +821,10 @@ Status SaveFilesInDirectory(const std::string& src_dirname,
820821
const std::string& dst_dirname);
821822
Status DestroyUnverifiedSubdir(const std::string& dirname);
822823
Status InitUnverifiedSubdir(const std::string& dirname);
824+
825+
// Destroy the DB at the given path under the env configured for db_stress.
826+
// Handles both regular DB and BlobDB, and cleans and removes the entire dir.
827+
Status DbStressDestroyDb(const std::string& db_path);
828+
823829
} // namespace ROCKSDB_NAMESPACE
824830
#endif // GFLAGS

db_stress_tool/db_stress_gflags.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,10 @@ DEFINE_bool(histogram, false, "Print histogram of operation timings");
135135
DEFINE_bool(destroy_db_initially, true,
136136
"Destroys the database dir before start if this is true");
137137

138+
DEFINE_bool(destroy_db_and_exit, false,
139+
"Destroys the database dir and exits. Useful for cleanup without "
140+
"running stress test. Other options are mostly ignored.");
141+
138142
DEFINE_bool(verbose, false, "Verbose");
139143

140144
DEFINE_bool(progress_reports, true,

db_stress_tool/db_stress_test_base.cc

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -77,22 +77,7 @@ StressTest::StressTest()
7777
secondary_db_(nullptr),
7878
is_db_stopped_(false) {
7979
if (FLAGS_destroy_db_initially) {
80-
std::vector<std::string> files;
81-
db_stress_env->GetChildren(FLAGS_db, &files);
82-
for (unsigned int i = 0; i < files.size(); i++) {
83-
if (Slice(files[i]).starts_with("heap-")) {
84-
db_stress_env->DeleteFile(FLAGS_db + "/" + files[i]);
85-
}
86-
}
87-
88-
Options options;
89-
options.env = db_stress_env;
90-
// Remove files without preserving manfiest files
91-
const Status s = !FLAGS_use_blob_db
92-
? DestroyDB(FLAGS_db, options)
93-
: blob_db::DestroyBlobDB(FLAGS_db, options,
94-
blob_db::BlobDBOptions());
95-
80+
const Status s = DbStressDestroyDb(FLAGS_db);
9681
if (!s.ok()) {
9782
fprintf(stderr, "Cannot destroy original db: %s\n", s.ToString().c_str());
9883
exit(1);

db_stress_tool/db_stress_tool.cc

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,19 @@ int db_stress_tool(int argc, char** argv) {
9898
raw_env, std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem()));
9999
db_stress_env = env_wrapper_guard.get();
100100

101+
// Handle --destroy_db_and_exit early, before other option validation
102+
if (FLAGS_destroy_db_and_exit) {
103+
s = DbStressDestroyDb(FLAGS_db);
104+
if (s.ok()) {
105+
fprintf(stdout, "Successfully destroyed db at %s\n", FLAGS_db.c_str());
106+
return 0;
107+
} else {
108+
fprintf(stderr, "Failed to destroy db at %s: %s\n", FLAGS_db.c_str(),
109+
s.ToString().c_str());
110+
return 1;
111+
}
112+
}
113+
101114
FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str());
102115

103116
// The number of background threads should be at least as much the

tools/db_crashtest.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,10 @@ def early_argument_parsing_before_main():
6060
global per_iteration_random_seed_override
6161
per_iteration_random_seed_override = args.per_iteration_random_seed_override
6262
global is_remote_db
63-
# Set is_remote_db if remain_args has a non-empty --env_uri= argument
63+
# Set is_remote_db if remain_args has a non-empty --env_uri= or --fs_uri= argument
6464
for arg in remain_args:
6565
parts = arg.split("=", 1)
66-
if parts[0] == "--env_uri" and len(parts) > 1 and parts[1]:
66+
if parts[0] in ["--env_uri", "--fs_uri"] and len(parts) > 1 and parts[1]:
6767
is_remote_db = True
6868
break
6969

@@ -454,32 +454,20 @@ def apply_random_seed_per_iteration():
454454
_DEBUG_LEVEL_ENV_VAR = "DEBUG_LEVEL"
455455

456456
stress_cmd = "./db_stress"
457-
cleanup_cmd = None
458457

459458

460459
def is_release_mode():
461460
return os.environ.get(_DEBUG_LEVEL_ENV_VAR) == "0"
462461

463462

464-
# Generate a unique run ID for this script execution. This ensures each run
465-
# gets a unique database directory when TEST_TMPDIR is set, avoiding issues
466-
# with parameter changes (like use_put_entity_one_in) between runs.
467-
run_id = str(random.randint(0, 2**63))
468-
469-
470463
def get_dbname(test_name):
471464
test_dir_name = "rocksdb_crashtest_" + test_name
472465
test_tmpdir = os.environ.get(_TEST_DIR_ENV_VAR)
473466
if test_tmpdir is None or test_tmpdir == "":
474467
dbname = tempfile.mkdtemp(prefix=test_dir_name)
475468
else:
476-
dbname = test_tmpdir + "/" + test_dir_name + "_" + run_id
469+
dbname = test_tmpdir + "/" + test_dir_name
477470
if not is_remote_db:
478-
shutil.rmtree(dbname, True)
479-
if cleanup_cmd is not None:
480-
print("Running DB cleanup command - %s\n" % cleanup_cmd)
481-
# Ignore failure
482-
os.system(cleanup_cmd)
483471
os.makedirs(dbname, exist_ok=True)
484472
return dbname
485473

@@ -1387,13 +1375,18 @@ def print_output_and_exit_on_error(stdout, stderr, print_stderr_separately=False
13871375

13881376

13891377
def cleanup_after_success(dbname):
1390-
if not is_remote_db:
1391-
shutil.rmtree(dbname, True)
1392-
if cleanup_cmd is not None:
1393-
print("Running DB cleanup command - %s\n" % cleanup_cmd)
1394-
ret = os.system(cleanup_cmd)
1395-
if ret != 0:
1396-
print("WARNING: DB cleanup returned error %d\n" % ret)
1378+
# Use db_stress --destroy_db_and_exit, which simplifies remote DB cleanup
1379+
cleanup_cmd_parts = [stress_cmd, "--destroy_db_and_exit=1", "--db=" + dbname]
1380+
# Pass through relevant arguments for remote DB access
1381+
for arg in remain_args:
1382+
parts = arg.split("=", 1)
1383+
if parts[0] in ["--env_uri", "--fs_uri"]:
1384+
cleanup_cmd_parts.append(arg)
1385+
print("Running DB cleanup command - %s\n" % " ".join(cleanup_cmd_parts))
1386+
ret = subprocess.call(cleanup_cmd_parts)
1387+
if ret != 0:
1388+
print("ERROR: DB cleanup returned error %d\n" % ret)
1389+
sys.exit(2)
13971390

13981391

13991392
# This script runs and kills db_stress multiple times. It checks consistency
@@ -1421,6 +1414,10 @@ def blackbox_crash_main(args, unknown_args):
14211414

14221415
hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["interval"])
14231416

1417+
# Reset destroy_db_initially after each run (it may have been set by
1418+
# command line for first run only)
1419+
cmd_params["destroy_db_initially"] = 0
1420+
14241421
if not hit_timeout:
14251422
print("Exit Before Killing")
14261423
print_output_and_exit_on_error(outs, errs, args.print_stderr_separately)
@@ -1563,7 +1560,7 @@ def whitebox_crash_main(args, unknown_args):
15631560
"`compaction_style` is changed in current run so `destroy_db_initially` is set to 1 as a short-term solution to avoid cycling through previous db of different compaction style."
15641561
+ "\n"
15651562
)
1566-
additional_opts["destroy_db_initially"] = 1
1563+
cmd_params["destroy_db_initially"] = 1
15671564
prev_compaction_style = cur_compaction_style
15681565

15691566
cmd = gen_cmd(
@@ -1588,6 +1585,11 @@ def whitebox_crash_main(args, unknown_args):
15881585
hit_timeout, retncode, stdoutdata, stderrdata = execute_cmd(
15891586
cmd, exit_time - time.time() + 900
15901587
)
1588+
1589+
# Reset destroy_db_initially after each run (it may have been set by
1590+
# command line for first run, or set for various reasons for a step)
1591+
cmd_params["destroy_db_initially"] = 0
1592+
15911593
msg = "check_mode={}, kill option={}, exitcode={}\n".format(
15921594
check_mode, additional_opts["kill_random_test"], retncode
15931595
)
@@ -1617,7 +1619,8 @@ def whitebox_crash_main(args, unknown_args):
16171619
# First half of the duration, keep doing kill test. For the next half,
16181620
# try different modes.
16191621
if time.time() > half_time:
1620-
cleanup_after_success(dbname)
1622+
# Set next iteration to destroy DB (works for remote DB)
1623+
cmd_params["destroy_db_initially"] = 1
16211624
if expected_values_dir is not None:
16221625
shutil.rmtree(expected_values_dir, True)
16231626
os.mkdir(expected_values_dir)
@@ -1633,7 +1636,6 @@ def whitebox_crash_main(args, unknown_args):
16331636

16341637
def main():
16351638
global stress_cmd
1636-
global cleanup_cmd
16371639

16381640
parser = argparse.ArgumentParser(
16391641
description="This script runs and kills \
@@ -1649,7 +1651,7 @@ def main():
16491651
parser.add_argument("--test_multiops_txn", action="store_true")
16501652
parser.add_argument("--stress_cmd")
16511653
parser.add_argument("--test_tiered_storage", action="store_true")
1652-
parser.add_argument("--cleanup_cmd")
1654+
parser.add_argument("--cleanup_cmd") # ignore old option for now
16531655
parser.add_argument("--print_stderr_separately", action="store_true", default=False)
16541656

16551657
all_params = dict(
@@ -1690,8 +1692,6 @@ def main():
16901692

16911693
if args.stress_cmd:
16921694
stress_cmd = args.stress_cmd
1693-
if args.cleanup_cmd:
1694-
cleanup_cmd = args.cleanup_cmd
16951695
if args.test_type == "blackbox":
16961696
blackbox_crash_main(args, unknown_args)
16971697
if args.test_type == "whitebox":

0 commit comments

Comments
 (0)