Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions velox/connectors/lakehouse/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sub-projects of the lakehouse connector. Each directory listed here must
# provide its own CMakeLists.txt.
add_subdirectory(storage_adapters)
add_subdirectory(common)
add_subdirectory(iceberg)
53 changes: 53 additions & 0 deletions velox/connectors/lakehouse/common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Stand-alone object library that holds only the connector configuration.
velox_add_library(velox_hive_config OBJECT HiveConfig.cpp)
velox_link_libraries(velox_hive_config velox_core velox_exception)

# NOTE(review): the parent lakehouse/CMakeLists.txt also adds an `iceberg`
# directory at its own level - confirm that common/iceberg exists and that this
# is intended.
add_subdirectory(iceberg)

# Connector object library. HiveConfig.cpp is listed here as well as in
# velox_hive_config above, so it is compiled into both targets.
velox_add_library(
velox_hive_connector
OBJECT
FileHandle.cpp
HiveConfig.cpp
HiveConnector.cpp
HiveConnectorUtil.cpp
HiveConnectorSplit.cpp
HiveDataSink.cpp
HiveDataSource.cpp
HivePartitionUtil.cpp
PartitionIdGenerator.cpp
SplitReader.cpp
TableHandle.cpp)

# velox_hive_partition_function is referenced here and defined further down in
# this file.
velox_link_libraries(
velox_hive_connector
PUBLIC velox_hive_iceberg_splitreader
PRIVATE velox_common_io velox_connector velox_dwio_catalog_fbhive
velox_hive_partition_function)

# Partition function library built from HivePartitionFunction.cpp.
velox_add_library(velox_hive_partition_function HivePartitionFunction.cpp)

velox_link_libraries(velox_hive_partition_function velox_core velox_exec)

# NOTE(review): the parent lakehouse/CMakeLists.txt also adds a
# `storage_adapters` directory at its own level - confirm that
# common/storage_adapters exists and that this is intended.
add_subdirectory(storage_adapters)

# Tests are built only when VELOX_BUILD_TESTING is set.
if(${VELOX_BUILD_TESTING})
add_subdirectory(tests)
endif()

# Benchmarks are built only when VELOX_ENABLE_BENCHMARKS is set.
if(${VELOX_ENABLE_BENCHMARKS})
add_subdirectory(benchmarks)
endif()
78 changes: 78 additions & 0 deletions velox/connectors/lakehouse/common/FileHandle.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/connectors/hive/FileHandle.h"
#include "velox/common/base/Counters.h"
#include "velox/common/base/StatsReporter.h"
#include "velox/common/file/FileSystems.h"
#include "velox/common/time/Timer.h"

#include <atomic>

namespace facebook::velox {

// Returns the size charged for a file handle. Every handle currently counts
// as exactly one unit, so the argument is not inspected.
// TODO: support variable per-file cache sizes once the underlying file system
// supports them.
uint64_t FileHandleSizer::operator()(const FileHandle& /*fileHandle*/) {
  constexpr uint64_t kUnitSize = 1;
  return kUnitSize;
}

namespace {
// Returns the name of the group a file belongs to. Group tracking is at the
// level of the directory, i.e. the Hive partition, so the group is everything
// in 'filename' before its last '/'. A name that contains no '/' is returned
// unchanged and forms its own group.
std::string groupName(const std::string& filename) {
  // Search the std::string itself rather than its C-string view: this needs
  // no <cstring>, and the search covers the full length of the string even if
  // it contains an embedded NUL.
  const auto lastSlash = filename.rfind('/');
  return lastSlash == std::string::npos ? filename
                                        : filename.substr(0, lastSlash);
}
} // namespace

// Opens the file named by 'key' for reading and wraps it in a new FileHandle.
//
// 'key' supplies the file name and the token provider used to open it.
// 'properties' may be null; when set, its file size, read range hint and extra
// file info are forwarded to the file system through the open options.
// 'stats' is passed to the file system through the open options as well.
//
// The time spent building the handle is reported to the
// kMetricHiveFileHandleGenerateLatencyMs histogram.
std::unique_ptr<FileHandle> FileHandleGenerator::operator()(
    const FileHandleKey& key,
    const FileProperties* properties,
    filesystems::File::IoStats* stats) {
  // Drivers have been seen stuck while creating file handles. The trace makes
  // such cases easier to spot.
  process::TraceContext trace("FileHandleGenerator::operator()");

  uint64_t generateTimeUs{0};
  std::unique_ptr<FileHandle> handle;
  {
    // The timer is scoped to this block so that 'generateTimeUs' covers only
    // the handle construction below.
    MicrosecondTimer timer(&generateTimeUs);
    handle = std::make_unique<FileHandle>();

    filesystems::FileOptions openOptions;
    openOptions.stats = stats;
    openOptions.tokenProvider = key.tokenProvider;
    if (properties != nullptr) {
      openOptions.fileSize = properties->fileSize;
      openOptions.readRangeHint = properties->readRangeHint;
      openOptions.extraFileInfo = properties->extraFileInfo;
    }

    const auto& path = key.filename;
    handle->file = filesystems::getFileSystem(path, properties_)
                       ->openFileForRead(path, openOptions);
    // One id lease for the file itself and one for its directory group.
    handle->uuid = StringIdLease(fileIds(), path);
    handle->groupId = StringIdLease(fileIds(), groupName(path));
    VLOG(1) << "Generating file handle for: " << path
            << " uuid: " << handle->uuid.id();
  }
  // The timer reports microseconds while the metric is in milliseconds.
  RECORD_HISTOGRAM_METRIC_VALUE(
      kMetricHiveFileHandleGenerateLatencyMs, generateTimeUs / 1000);
  // TODO: build the hash map/etc per file type -- presumably after reading
  // the appropriate magic number from the file, or perhaps we include the file
  // type in the file handle key.
  return handle;
}

} // namespace facebook::velox
130 changes: 130 additions & 0 deletions velox/connectors/lakehouse/common/FileHandle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// A FileHandle is a File pointer plus some (optional, file-type-dependent)
// extra information for speeding up loading columnar data. For example, when
// we open a file we might build a hash map saying what region(s) on disk
// correspond to a given column in a given stripe.
//
// The FileHandle will normally be used in conjunction with a CachedFactory
// to speed up queries that hit the same files repeatedly; see the
// FileHandleCache and FileHandleFactory.

#pragma once

#include "velox/common/base/BitUtil.h"
#include "velox/common/caching/CachedFactory.h"
#include "velox/common/caching/FileIds.h"
#include "velox/common/config/Config.h"
#include "velox/common/file/File.h"
#include "velox/common/file/TokenProvider.h"
#include "velox/connectors/hive/FileProperties.h"

namespace facebook::velox {

// An open, readable file plus the ids under which it is tracked. See the
// comment at the top of this file for the overall design.
struct FileHandle {
// The file opened for reading.
std::shared_ptr<ReadFile> file;

// Each time we make a new FileHandle we assign it a uuid and use that id as
// the identifier in downstream data caching structures. This saves a lot of
// memory compared to using the filename as the identifier.
StringIdLease uuid;

// Id for the group of files this file belongs to, e.g. its directory. Used
// for coarse-granularity access tracking, for example to decide whether to
// place data on SSD.
StringIdLease groupId;

// We'll want to have a hash map here to record the identifier->byte range
// mappings. Different formats may have different identifiers, so we may need
// a union of maps. For example, in ORC you need 3 integers (I think, to be
// confirmed with xldb): the row bundle, the node, and the sequence. For the
// first diff we'll not include the map.
};

/// Sizing functor for FileHandle objects; it is the sizer type plugged into
/// FileHandleFactory. It is meant to estimate the memory usage of a handle.
struct FileHandleSizer {
  uint64_t operator()(const FileHandle& fileHandle);
};

/// Key identifying a file handle: the file name plus the optional token
/// provider used when opening the file.
struct FileHandleKey {
  std::string filename;
  std::shared_ptr<filesystems::TokenProvider> tokenProvider{nullptr};

  /// Two keys are equal when their file names match and their token providers
  /// are either the same pointer (including both null) or both non-null and
  /// equal according to TokenProvider::equals().
  bool operator==(const FileHandleKey& other) const {
    if (filename != other.filename) {
      return false;
    }
    // Same provider object, or no provider on either side.
    if (tokenProvider == other.tokenProvider) {
      return true;
    }
    // Different pointers: both must be set before their contents can be
    // compared.
    return tokenProvider && other.tokenProvider &&
        tokenProvider->equals(*other.tokenProvider);
  }
};

} // namespace facebook::velox

namespace std {
/// Hash for FileHandleKey. A key without a token provider hashes to the hash
/// of its file name; otherwise the provider's hash is mixed in.
template <>
struct hash<facebook::velox::FileHandleKey> {
  size_t operator()(const facebook::velox::FileHandleKey& key) const noexcept {
    const size_t nameHash = std::hash<std::string>{}(key.filename);
    if (!key.tokenProvider) {
      return nameHash;
    }
    return facebook::velox::bits::hashMix(
        nameHash, key.tokenProvider->hash());
  }
};
} // namespace std

namespace facebook::velox {
// SimpleLRUCache specialization that maps a FileHandleKey to its FileHandle.
using FileHandleCache =
SimpleLRUCache<facebook::velox::FileHandleKey, FileHandle>;

// Creates FileHandles via the Generator interface the CachedFactory requires.
class FileHandleGenerator {
public:
FileHandleGenerator() {}
FileHandleGenerator(std::shared_ptr<const config::ConfigBase> properties)
: properties_(std::move(properties)) {}
std::unique_ptr<FileHandle> operator()(
const FileHandleKey& filename,
const FileProperties* properties,
filesystems::File::IoStats* stats);

private:
const std::shared_ptr<const config::ConfigBase> properties_;
};

// CachedFactory that produces FileHandles for FileHandleKeys using
// FileHandleGenerator. FileProperties and File::IoStats are the extra types
// handed to the generator, and FileHandleSizer sizes the resulting handles.
using FileHandleFactory = CachedFactory<
FileHandleKey,
FileHandle,
FileHandleGenerator,
FileProperties,
filesystems::File::IoStats,
FileHandleSizer>;

// CachedPtr specialization for FileHandleKey / FileHandle.
using FileHandleCachedPtr = CachedPtr<FileHandleKey, FileHandle>;

// Statistics type of the underlying SimpleLRUCache.
using FileHandleCacheStats = SimpleLRUCacheStats;

} // namespace facebook::velox
39 changes: 39 additions & 0 deletions velox/connectors/lakehouse/common/FileProperties.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// FileProperties carries optional, caller-supplied metadata about a file
// (size, modification time, read range hint and extra file info) that can be
// passed along when a FileHandle is created for that file.
//
// A FileHandle is a File pointer plus some (optional, file-type-dependent)
// extra information for speeding up loading columnar data. For example, when
// we open a file we might build a hash map saying what region(s) on disk
// correspond to a given column in a given stripe.
//
// The FileHandle will normally be used in conjunction with a CachedFactory
// to speed up queries that hit the same files repeatedly; see the
// FileHandleCache and FileHandleFactory.

#pragma once

#include <cstdint>
#include <memory>
#include <optional>
#include <string>

namespace facebook::velox {

/// Optional metadata about a file, supplied by the caller. Every field is
/// unset by default, so a default-constructed FileProperties says nothing
/// about the file.
struct FileProperties {
  /// Size of the file, if known.
  std::optional<int64_t> fileSize{std::nullopt};

  /// Modification time of the file, if known.
  std::optional<int64_t> modificationTime{std::nullopt};

  /// Read range hint for the file, if any.
  std::optional<int64_t> readRangeHint{std::nullopt};

  /// Extra file information, or nullptr when there is none.
  std::shared_ptr<std::string> extraFileInfo{nullptr};
};

} // namespace facebook::velox
Loading