From d23dde5aa2bec01b7788af8455e49e81f50e6cc7 Mon Sep 17 00:00:00 2001 From: Wei He Date: Fri, 16 Aug 2024 08:32:30 -0700 Subject: [PATCH] Avoid generating unsupported data types for JoinFuzzer with PQR (#10662) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10662 When running JoinFuzzerTest with PrestoQueryRunner, currently only about 16-20% iterations are verified against Presto. The rest iterations are unverified due to unsupported data types. This diff makes JoinFuzzer to avoid generating unsupported types when running with PrestoQueryRunner. After this change, over 85% of iterations are verified against Presto. Reviewed By: xiaoxmeng Differential Revision: D60768414 fbshipit-source-id: acbcc17e11c65009222fccbcf6e74e3a06e631da --- velox/exec/fuzzer/CMakeLists.txt | 1 + velox/exec/fuzzer/DuckQueryRunner.cpp | 15 ++++++ velox/exec/fuzzer/DuckQueryRunner.h | 11 ++++ velox/exec/fuzzer/JoinFuzzer.cpp | 65 +++++++++++++++++++----- velox/exec/fuzzer/PrestoQueryRunner.cpp | 16 ++++++ velox/exec/fuzzer/PrestoQueryRunner.h | 6 +++ velox/exec/fuzzer/ReferenceQueryRunner.h | 11 ++++ velox/vector/fuzzer/VectorFuzzer.cpp | 5 +- velox/vector/fuzzer/VectorFuzzer.h | 3 ++ 9 files changed, 115 insertions(+), 18 deletions(-) diff --git a/velox/exec/fuzzer/CMakeLists.txt b/velox/exec/fuzzer/CMakeLists.txt index fbc6321efb66..a58ef6d79a20 100644 --- a/velox/exec/fuzzer/CMakeLists.txt +++ b/velox/exec/fuzzer/CMakeLists.txt @@ -17,6 +17,7 @@ add_library(velox_fuzzer_util DuckQueryRunner.cpp PrestoQueryRunner.cpp target_link_libraries( velox_fuzzer_util + velox_vector_fuzzer velox_core velox_exec_test_lib cpr::cpr diff --git a/velox/exec/fuzzer/DuckQueryRunner.cpp b/velox/exec/fuzzer/DuckQueryRunner.cpp index acdc68ea205d..b4e3a369d402 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.cpp +++ b/velox/exec/fuzzer/DuckQueryRunner.cpp @@ -63,6 +63,21 @@ void DuckQueryRunner::disableAggregateFunctions( } } +const std::vector& DuckQueryRunner::supportedScalarTypes() const { + static const std::vector kScalarTypes{ + BOOLEAN(), + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + REAL(), + DOUBLE(), + VARCHAR(), + DATE(), + }; + return kScalarTypes; +} + std::multiset> DuckQueryRunner::execute( const std::string& sql, const std::vector& input, diff --git a/velox/exec/fuzzer/DuckQueryRunner.h b/velox/exec/fuzzer/DuckQueryRunner.h index bab0cc5a60d1..328343d06f9e 100644 --- a/velox/exec/fuzzer/DuckQueryRunner.h +++ b/velox/exec/fuzzer/DuckQueryRunner.h @@ -23,6 +23,17 @@ class DuckQueryRunner : public ReferenceQueryRunner { public: DuckQueryRunner(); + RunnerType runnerType() const override { + return RunnerType::kDuckQueryRunner; + } + + /// Skip Timestamp, Varbinary, Unknown, and IntervalDayTime types. DuckDB + /// doesn't support nanosecond precision for timestamps or casting from Bigint + /// to Interval. + /// + /// TODO Investigate mismatches reported when comparing Varbinary. + const std::vector& supportedScalarTypes() const override; + /// Specify names of aggregate function to exclude from the list of supported /// functions. Used to exclude functions that are non-determonistic, have bugs /// or whose semantics differ from Velox. diff --git a/velox/exec/fuzzer/JoinFuzzer.cpp b/velox/exec/fuzzer/JoinFuzzer.cpp index 50261592c363..049670da8374 100644 --- a/velox/exec/fuzzer/JoinFuzzer.cpp +++ b/velox/exec/fuzzer/JoinFuzzer.cpp @@ -63,6 +63,10 @@ namespace facebook::velox::exec::test { namespace { +std::string makePercentageString(size_t value, size_t total) { + return fmt::format("{} ({:.2f}%)", value, (double)value / total * 100); +} + class JoinFuzzer { public: JoinFuzzer( @@ -299,6 +303,30 @@ class JoinFuzzer { VectorFuzzer vectorFuzzer_; std::unique_ptr referenceQueryRunner_; + + struct Stats { + // The total number of iterations tested. + size_t numIterations{0}; + + // The number of iterations verified against reference DB. + size_t numVerified{0}; + + // The number of iterations that test cross product. + size_t numCrossProduct{0}; + + std::string toString() const { + std::stringstream out; + out << "\nTotal iterations tested: " << numIterations << std::endl; + out << "Total iterations verified against reference DB: " + << makePercentageString(numVerified, numIterations) << std::endl; + out << "Total iterations testing cross product: " + << makePercentageString(numCrossProduct, numIterations) << std::endl; + + return out.str(); + } + }; + + Stats stats_; }; JoinFuzzer::JoinFuzzer( @@ -351,7 +379,8 @@ std::vector JoinFuzzer::generateJoinKeyTypes(int32_t numKeys) { types.reserve(numKeys); for (auto i = 0; i < numKeys; ++i) { // Pick random scalar type. - types.push_back(vectorFuzzer_.randType(0 /*maxDepth*/)); + types.push_back(vectorFuzzer_.randType( + referenceQueryRunner_->supportedScalarTypes(), /*maxDepth=*/0)); } return types; } @@ -374,7 +403,8 @@ std::vector JoinFuzzer::generateProbeInput( const auto numPayload = randInt(0, 3); for (auto i = 0; i < numPayload; ++i) { names.push_back(fmt::format("tp{}", i + keyNames.size())); - types.push_back(vectorFuzzer_.randType(2 /*maxDepth*/)); + types.push_back(vectorFuzzer_.randType( + referenceQueryRunner_->supportedScalarTypes(), /*maxDepth=*/2)); } const auto inputType = ROW(std::move(names), std::move(types)); @@ -407,7 +437,8 @@ std::vector JoinFuzzer::generateBuildInput( const auto numPayload = randInt(0, 3); for (auto i = 0; i < numPayload; ++i) { names.push_back(fmt::format("bp{}", i + buildKeys.size())); - types.push_back(vectorFuzzer_.randType(2 /*maxDepth*/)); + types.push_back(vectorFuzzer_.randType( + referenceQueryRunner_->supportedScalarTypes(), /*maxDepth=*/2)); } const auto rowType = ROW(std::move(names), std::move(types)); @@ -619,12 +650,10 @@ std::optional JoinFuzzer::computeReferenceResults( const core::PlanNodePtr& plan, const std::vector& probeInput, const std::vector& buildInput) { - if (containsUnsupportedTypes(probeInput[0]->type())) { - return std::nullopt; - } - - if (containsUnsupportedTypes(buildInput[0]->type())) { - return std::nullopt; + if (referenceQueryRunner_->runnerType() == + ReferenceQueryRunner::RunnerType::kDuckQueryRunner) { + VELOX_CHECK(!containsUnsupportedTypes(probeInput[0]->type())); + VELOX_CHECK(!containsUnsupportedTypes(buildInput[0]->type())); } if (auto sql = referenceQueryRunner_->toSql(plan)) { @@ -934,6 +963,9 @@ RowVectorPtr JoinFuzzer::testCrossProduct( assertEqualResults( referenceResult.value(), plan.plan->outputType(), {expected}), "Velox and DuckDB results don't match"); + + LOG(INFO) << "Result matches with referenc DB."; + stats_.numVerified++; } } @@ -1007,6 +1039,8 @@ void JoinFuzzer::verify(core::JoinType joinType) { core::isFullJoin(joinType)) && FLAGS_batch_size * FLAGS_num_batches <= 500) { if (vectorFuzzer_.coinToss(0.1)) { + stats_.numCrossProduct++; + auto result = testCrossProduct( tableScanDir->getPath(), joinType, @@ -1069,6 +1103,9 @@ void JoinFuzzer::verify(core::JoinType joinType) { defaultPlan.plan->outputType(), {expected}), "Velox and Reference results don't match"); + + LOG(INFO) << "Result matches with referenc DB."; + stats_.numVerified++; } } @@ -1434,11 +1471,10 @@ void JoinFuzzer::go() { VELOX_USER_CHECK_GE(FLAGS_batch_size, 10, "Batch size must be at least 10."); const auto startTime = std::chrono::system_clock::now(); - size_t iteration = 0; - while (!isDone(iteration, startTime)) { + while (!isDone(stats_.numIterations, startTime)) { LOG(WARNING) << "==============================> Started iteration " - << iteration << " (seed: " << currentSeed_ << ")"; + << stats_.numIterations << " (seed: " << currentSeed_ << ")"; // Pick join type. const auto joinType = pickJoinType(); @@ -1446,11 +1482,12 @@ void JoinFuzzer::go() { verify(joinType); LOG(WARNING) << "==============================> Done with iteration " - << iteration; + << stats_.numIterations; reSeed(); - ++iteration; + ++stats_.numIterations; } + LOG(INFO) << stats_.toString(); } } // namespace diff --git a/velox/exec/fuzzer/PrestoQueryRunner.cpp b/velox/exec/fuzzer/PrestoQueryRunner.cpp index 89740c83f3e8..599552b7ab81 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.cpp +++ b/velox/exec/fuzzer/PrestoQueryRunner.cpp @@ -251,6 +251,22 @@ bool isSupportedDwrfType(const TypePtr& type) { } // namespace +const std::vector& PrestoQueryRunner::supportedScalarTypes() const { + static const std::vector kScalarTypes{ + BOOLEAN(), + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + REAL(), + DOUBLE(), + VARCHAR(), + VARBINARY(), + TIMESTAMP(), + }; + return kScalarTypes; +} + std::optional PrestoQueryRunner::toSql( const std::shared_ptr& aggregationNode) { // Assume plan is Aggregation over Values. diff --git a/velox/exec/fuzzer/PrestoQueryRunner.h b/velox/exec/fuzzer/PrestoQueryRunner.h index c890b7838ed2..4aa5119af9e1 100644 --- a/velox/exec/fuzzer/PrestoQueryRunner.h +++ b/velox/exec/fuzzer/PrestoQueryRunner.h @@ -41,6 +41,12 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner { std::string user, std::chrono::milliseconds timeout); + RunnerType runnerType() const override { + return RunnerType::kPrestoQueryRunner; + } + + const std::vector& supportedScalarTypes() const override; + /// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or /// Window with an optional Project on top. /// diff --git a/velox/exec/fuzzer/ReferenceQueryRunner.h b/velox/exec/fuzzer/ReferenceQueryRunner.h index bad9cf51df8c..b34d209d459b 100644 --- a/velox/exec/fuzzer/ReferenceQueryRunner.h +++ b/velox/exec/fuzzer/ReferenceQueryRunner.h @@ -17,14 +17,25 @@ #include #include "velox/core/PlanNode.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" namespace facebook::velox::exec::test { /// Query runner that uses reference database, i.e. DuckDB, Presto, Spark. class ReferenceQueryRunner { public: + enum class RunnerType { kPrestoQueryRunner, kDuckQueryRunner }; + virtual ~ReferenceQueryRunner() = default; + virtual RunnerType runnerType() const = 0; + + // Scalar types supported by the reference database, to be used to restrict + // candidates when generating random types for fuzzers. + virtual const std::vector& supportedScalarTypes() const { + return defaultScalarTypes(); + } + /// Converts Velox plan into SQL accepted by the reference database. /// @return std::nullopt if the plan uses features not supported by the /// reference database. diff --git a/velox/vector/fuzzer/VectorFuzzer.cpp b/velox/vector/fuzzer/VectorFuzzer.cpp index df7457ea3787..7708b56b6001 100644 --- a/velox/vector/fuzzer/VectorFuzzer.cpp +++ b/velox/vector/fuzzer/VectorFuzzer.cpp @@ -1025,9 +1025,7 @@ VectorPtr VectorLoaderWrap::makeEncodingPreservedCopy( std::move(nulls), std::move(indices), vectorSize, baseResult); } -namespace { - -const std::vector defaultScalarTypes() { +const std::vector& defaultScalarTypes() { // @TODO Add decimal TypeKinds to randType. // Refer https://github.com/facebookincubator/velox/issues/3942 static std::vector kScalarTypes{ @@ -1046,7 +1044,6 @@ const std::vector defaultScalarTypes() { }; return kScalarTypes; } -} // namespace TypePtr randType(FuzzerGenerator& rng, int maxDepth) { return randType(rng, defaultScalarTypes(), maxDepth); diff --git a/velox/vector/fuzzer/VectorFuzzer.h b/velox/vector/fuzzer/VectorFuzzer.h index 289491784144..e4a6ec21581f 100644 --- a/velox/vector/fuzzer/VectorFuzzer.h +++ b/velox/vector/fuzzer/VectorFuzzer.h @@ -389,4 +389,7 @@ RowTypePtr randRowType( const std::vector& scalarTypes, int maxDepth = 5); +/// Default set of scalar types to be chosen from when generating random types. +const std::vector& defaultScalarTypes(); + } // namespace facebook::velox