Skip to content

Commit

Permalink
Avoid generating unsupported data types for JoinFuzzer with PQR (face…
Browse files Browse the repository at this point in the history
…bookincubator#10662)

Summary:
Pull Request resolved: facebookincubator#10662

When running JoinFuzzerTest with PrestoQueryRunner, currently only about
16-20% of iterations are verified against Presto. The remaining iterations are
unverified due to unsupported data types. This diff makes JoinFuzzer
avoid generating unsupported types when running with PrestoQueryRunner.
After this change, over 85% of iterations are verified against Presto.

Reviewed By: xiaoxmeng

Differential Revision: D60768414

fbshipit-source-id: acbcc17e11c65009222fccbcf6e74e3a06e631da
  • Loading branch information
kagamiori authored and facebook-github-bot committed Aug 16, 2024
1 parent d805d31 commit d23dde5
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 18 deletions.
1 change: 1 addition & 0 deletions velox/exec/fuzzer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ add_library(velox_fuzzer_util DuckQueryRunner.cpp PrestoQueryRunner.cpp

target_link_libraries(
velox_fuzzer_util
velox_vector_fuzzer
velox_core
velox_exec_test_lib
cpr::cpr
Expand Down
15 changes: 15 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,21 @@ void DuckQueryRunner::disableAggregateFunctions(
}
}

const std::vector<TypePtr>& DuckQueryRunner::supportedScalarTypes() const {
static const std::vector<TypePtr> kScalarTypes{
BOOLEAN(),
TINYINT(),
SMALLINT(),
INTEGER(),
BIGINT(),
REAL(),
DOUBLE(),
VARCHAR(),
DATE(),
};
return kScalarTypes;
}

std::multiset<std::vector<velox::variant>> DuckQueryRunner::execute(
const std::string& sql,
const std::vector<RowVectorPtr>& input,
Expand Down
11 changes: 11 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,17 @@ class DuckQueryRunner : public ReferenceQueryRunner {
public:
DuckQueryRunner();

/// Identifies this runner as RunnerType::kDuckQueryRunner.
RunnerType runnerType() const override {
  constexpr RunnerType kType{RunnerType::kDuckQueryRunner};
  return kType;
}

/// Skip Timestamp, Varbinary, Unknown, and IntervalDayTime types. DuckDB
/// doesn't support nanosecond precision for timestamps or casting from Bigint
/// to Interval.
///
/// TODO Investigate mismatches reported when comparing Varbinary.
const std::vector<TypePtr>& supportedScalarTypes() const override;

/// Specify names of aggregate functions to exclude from the list of supported
/// functions. Used to exclude functions that are non-deterministic, have bugs
/// or whose semantics differ from Velox.
Expand Down
65 changes: 51 additions & 14 deletions velox/exec/fuzzer/JoinFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,10 @@ namespace facebook::velox::exec::test {

namespace {

/// Formats 'value' together with its share of 'total' as
/// "<value> (<percentage>%)", with the percentage printed to two decimal
/// places.
///
/// @param value The count to report.
/// @param total The count that 'value' is a part of. May be zero.
/// @return The formatted string. When 'total' is zero the percentage is
/// reported as 0.00 rather than the NaN/inf a division by zero would produce.
std::string makePercentageString(size_t value, size_t total) {
  // Stats may be printed before any iteration has run, in which case 'total'
  // is 0 and a plain division would yield "nan%".
  const double percentage =
      total == 0 ? 0.0 : static_cast<double>(value) / total * 100;
  return fmt::format("{} ({:.2f}%)", value, percentage);
}

class JoinFuzzer {
public:
JoinFuzzer(
Expand Down Expand Up @@ -299,6 +303,30 @@ class JoinFuzzer {

VectorFuzzer vectorFuzzer_;
std::unique_ptr<ReferenceQueryRunner> referenceQueryRunner_;

// Counters describing one fuzzer run.
struct Stats {
  // How many iterations have been tested in total.
  size_t numIterations{0};

  // How many of those iterations were verified against the reference DB.
  size_t numVerified{0};

  // How many of those iterations tested a cross product.
  size_t numCrossProduct{0};

  // Renders the counters as a multi-line, human-readable report. The verified
  // and cross-product counts are shown with their percentage of
  // 'numIterations'.
  std::string toString() const {
    std::stringstream report;
    report << "\nTotal iterations tested: " << numIterations << '\n'
           << "Total iterations verified against reference DB: "
           << makePercentageString(numVerified, numIterations) << '\n'
           << "Total iterations testing cross product: "
           << makePercentageString(numCrossProduct, numIterations) << '\n';
    return report.str();
  }
};

Stats stats_;
};

JoinFuzzer::JoinFuzzer(
Expand Down Expand Up @@ -351,7 +379,8 @@ std::vector<TypePtr> JoinFuzzer::generateJoinKeyTypes(int32_t numKeys) {
types.reserve(numKeys);
for (auto i = 0; i < numKeys; ++i) {
// Pick random scalar type.
types.push_back(vectorFuzzer_.randType(0 /*maxDepth*/));
types.push_back(vectorFuzzer_.randType(
referenceQueryRunner_->supportedScalarTypes(), /*maxDepth=*/0));
}
return types;
}
Expand All @@ -374,7 +403,8 @@ std::vector<RowVectorPtr> JoinFuzzer::generateProbeInput(
const auto numPayload = randInt(0, 3);
for (auto i = 0; i < numPayload; ++i) {
names.push_back(fmt::format("tp{}", i + keyNames.size()));
types.push_back(vectorFuzzer_.randType(2 /*maxDepth*/));
types.push_back(vectorFuzzer_.randType(
referenceQueryRunner_->supportedScalarTypes(), /*maxDepth=*/2));
}

const auto inputType = ROW(std::move(names), std::move(types));
Expand Down Expand Up @@ -407,7 +437,8 @@ std::vector<RowVectorPtr> JoinFuzzer::generateBuildInput(
const auto numPayload = randInt(0, 3);
for (auto i = 0; i < numPayload; ++i) {
names.push_back(fmt::format("bp{}", i + buildKeys.size()));
types.push_back(vectorFuzzer_.randType(2 /*maxDepth*/));
types.push_back(vectorFuzzer_.randType(
referenceQueryRunner_->supportedScalarTypes(), /*maxDepth=*/2));
}

const auto rowType = ROW(std::move(names), std::move(types));
Expand Down Expand Up @@ -619,12 +650,10 @@ std::optional<MaterializedRowMultiset> JoinFuzzer::computeReferenceResults(
const core::PlanNodePtr& plan,
const std::vector<RowVectorPtr>& probeInput,
const std::vector<RowVectorPtr>& buildInput) {
if (containsUnsupportedTypes(probeInput[0]->type())) {
return std::nullopt;
}

if (containsUnsupportedTypes(buildInput[0]->type())) {
return std::nullopt;
if (referenceQueryRunner_->runnerType() ==
ReferenceQueryRunner::RunnerType::kDuckQueryRunner) {
VELOX_CHECK(!containsUnsupportedTypes(probeInput[0]->type()));
VELOX_CHECK(!containsUnsupportedTypes(buildInput[0]->type()));
}

if (auto sql = referenceQueryRunner_->toSql(plan)) {
Expand Down Expand Up @@ -934,6 +963,9 @@ RowVectorPtr JoinFuzzer::testCrossProduct(
assertEqualResults(
referenceResult.value(), plan.plan->outputType(), {expected}),
"Velox and DuckDB results don't match");

// The reference results matched; count this iteration as verified.
LOG(INFO) << "Result matches with reference DB.";
stats_.numVerified++;
}
}

Expand Down Expand Up @@ -1007,6 +1039,8 @@ void JoinFuzzer::verify(core::JoinType joinType) {
core::isFullJoin(joinType)) &&
FLAGS_batch_size * FLAGS_num_batches <= 500) {
if (vectorFuzzer_.coinToss(0.1)) {
stats_.numCrossProduct++;

auto result = testCrossProduct(
tableScanDir->getPath(),
joinType,
Expand Down Expand Up @@ -1069,6 +1103,9 @@ void JoinFuzzer::verify(core::JoinType joinType) {
defaultPlan.plan->outputType(),
{expected}),
"Velox and Reference results don't match");

// The reference results matched; count this iteration as verified.
LOG(INFO) << "Result matches with reference DB.";
stats_.numVerified++;
}
}

Expand Down Expand Up @@ -1434,23 +1471,23 @@ void JoinFuzzer::go() {
VELOX_USER_CHECK_GE(FLAGS_batch_size, 10, "Batch size must be at least 10.");

const auto startTime = std::chrono::system_clock::now();
size_t iteration = 0;

while (!isDone(iteration, startTime)) {
while (!isDone(stats_.numIterations, startTime)) {
LOG(WARNING) << "==============================> Started iteration "
<< iteration << " (seed: " << currentSeed_ << ")";
<< stats_.numIterations << " (seed: " << currentSeed_ << ")";

// Pick join type.
const auto joinType = pickJoinType();

verify(joinType);

LOG(WARNING) << "==============================> Done with iteration "
<< iteration;
<< stats_.numIterations;

reSeed();
++iteration;
++stats_.numIterations;
}
LOG(INFO) << stats_.toString();
}

} // namespace
Expand Down
16 changes: 16 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,22 @@ bool isSupportedDwrfType(const TypePtr& type) {

} // namespace

const std::vector<TypePtr>& PrestoQueryRunner::supportedScalarTypes() const {
static const std::vector<TypePtr> kScalarTypes{
BOOLEAN(),
TINYINT(),
SMALLINT(),
INTEGER(),
BIGINT(),
REAL(),
DOUBLE(),
VARCHAR(),
VARBINARY(),
TIMESTAMP(),
};
return kScalarTypes;
}

std::optional<std::string> PrestoQueryRunner::toSql(
const std::shared_ptr<const core::AggregationNode>& aggregationNode) {
// Assume plan is Aggregation over Values.
Expand Down
6 changes: 6 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner {
std::string user,
std::chrono::milliseconds timeout);

/// Identifies this runner as RunnerType::kPrestoQueryRunner.
RunnerType runnerType() const override {
  constexpr RunnerType kType{RunnerType::kPrestoQueryRunner};
  return kType;
}

const std::vector<TypePtr>& supportedScalarTypes() const override;

/// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or
/// Window with an optional Project on top.
///
Expand Down
11 changes: 11 additions & 0 deletions velox/exec/fuzzer/ReferenceQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,25 @@

#include <set>
#include "velox/core/PlanNode.h"
#include "velox/vector/fuzzer/VectorFuzzer.h"

namespace facebook::velox::exec::test {

/// Query runner that uses reference database, i.e. DuckDB, Presto, Spark.
class ReferenceQueryRunner {
public:
enum class RunnerType { kPrestoQueryRunner, kDuckQueryRunner };

virtual ~ReferenceQueryRunner() = default;

/// Returns the RunnerType value that identifies the concrete runner
/// implementation. Must be provided by every subclass.
virtual RunnerType runnerType() const = 0;

/// Returns the scalar types supported by the reference database. Fuzzers use
/// this list to restrict the candidates when generating random types. Unless a
/// subclass overrides it, the result is the list from defaultScalarTypes().
virtual const std::vector<TypePtr>& supportedScalarTypes() const {
  const std::vector<TypePtr>& defaults = defaultScalarTypes();
  return defaults;
}

/// Converts Velox plan into SQL accepted by the reference database.
/// @return std::nullopt if the plan uses features not supported by the
/// reference database.
Expand Down
5 changes: 1 addition & 4 deletions velox/vector/fuzzer/VectorFuzzer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1025,9 +1025,7 @@ VectorPtr VectorLoaderWrap::makeEncodingPreservedCopy(
std::move(nulls), std::move(indices), vectorSize, baseResult);
}

namespace {

const std::vector<TypePtr> defaultScalarTypes() {
const std::vector<TypePtr>& defaultScalarTypes() {
// @TODO Add decimal TypeKinds to randType.
// Refer https://github.com/facebookincubator/velox/issues/3942
static std::vector<TypePtr> kScalarTypes{
Expand All @@ -1046,7 +1044,6 @@ const std::vector<TypePtr> defaultScalarTypes() {
};
return kScalarTypes;
}
} // namespace

TypePtr randType(FuzzerGenerator& rng, int maxDepth) {
return randType(rng, defaultScalarTypes(), maxDepth);
Expand Down
3 changes: 3 additions & 0 deletions velox/vector/fuzzer/VectorFuzzer.h
Original file line number Diff line number Diff line change
Expand Up @@ -389,4 +389,7 @@ RowTypePtr randRowType(
const std::vector<TypePtr>& scalarTypes,
int maxDepth = 5);

/// Default set of scalar types to be chosen from when generating random types.
const std::vector<TypePtr>& defaultScalarTypes();

} // namespace facebook::velox

0 comments on commit d23dde5

Please sign in to comment.