|
18 | 18 | package com.mongodb.spark.sql.connector.read.partitioner;
|
19 | 19 |
|
20 | 20 | import static com.mongodb.spark.sql.connector.read.partitioner.Partitioner.LOGGER;
|
21 |
| -import static java.util.Collections.singletonList; |
| 21 | +import static java.lang.String.format; |
| 22 | +import static java.util.Arrays.asList; |
22 | 23 |
|
23 | 24 | import com.mongodb.MongoCommandException;
|
24 | 25 | import com.mongodb.client.MongoDatabase;
|
|
37 | 38 | /** Partitioner helper class, contains various utility methods used by the partitioner instances. */
|
38 | 39 | public final class PartitionerHelper {
|
39 | 40 |
|
40 |
| - private static final List<BsonDocument> COLL_STATS_AGGREGATION_PIPELINE = |
41 |
| - singletonList(BsonDocument.parse("{'$collStats': {'storageStats': { } } }")); |
| 41 | + private static final List<BsonDocument> COLL_STATS_AGGREGATION_PIPELINE = asList( |
| 42 | + BsonDocument.parse("{'$collStats': {'storageStats': { } } }"), |
| 43 | + BsonDocument.parse( |
| 44 | + "{'$project': {'size': '$storageStats.size', 'count': '$storageStats.count' } }")); |
42 | 45 | private static final BsonDocument PING_COMMAND = BsonDocument.parse("{ping: 1}");
|
| 46 | + private static final BsonDocument BUILD_INFO_COMMAND = BsonDocument.parse("{buildInfo: 1}"); |
43 | 47 | public static final Partitioner SINGLE_PARTITIONER = new SinglePartitionPartitioner();
|
44 | 48 |
|
45 | 49 | /**
|
@@ -101,14 +105,34 @@ public static List<BsonDocument> createPartitionPipeline(
|
101 | 105 | public static BsonDocument storageStats(final ReadConfig readConfig) {
|
102 | 106 | LOGGER.info("Getting collection stats for: {}", readConfig.getNamespace().getFullName());
|
103 | 107 | try {
|
104 |
| - return readConfig |
105 |
| - .withCollection( |
106 |
| - coll -> Optional.ofNullable(coll.aggregate(COLL_STATS_AGGREGATION_PIPELINE) |
107 |
| - .allowDiskUse(readConfig.getAggregationAllowDiskUse()) |
108 |
| - .comment(readConfig.getComment()) |
109 |
| - .first()) |
110 |
| - .orElseGet(BsonDocument::new)) |
111 |
| - .getDocument("storageStats", new BsonDocument()); |
| 108 | + BsonDocument buildInfo = readConfig.withClient(c -> { |
| 109 | + MongoDatabase db = c.getDatabase(readConfig.getDatabaseName()); |
| 110 | + return db.runCommand(BUILD_INFO_COMMAND).toBsonDocument(); |
| 111 | + }); |
| 112 | + |
| 113 | + // Atlas Data Federation does not support the storageStats property and requires |
| 114 | + // special handling to return the federated collection stats. |
| 115 | + if (!buildInfo.containsKey("dataLake")) { |
| 116 | + return readConfig.withClient(c -> { |
| 117 | + MongoDatabase db = c.getDatabase(readConfig.getDatabaseName()); |
| 118 | + BsonDocument command = |
| 119 | + BsonDocument.parse(format("{ collStats: '%s' }", readConfig.getCollectionName())); |
| 120 | + BsonDocument result = db.runCommand(command).toBsonDocument(); |
| 121 | + |
| 122 | + BsonDocument formattedResult = new BsonDocument(); |
| 123 | + formattedResult.append("count", result.get("count")); |
| 124 | + formattedResult.append("size", result.get("size")); |
| 125 | + |
| 126 | + return formattedResult; |
| 127 | + }); |
| 128 | + } |
| 129 | + |
| 130 | + return readConfig.withCollection( |
| 131 | + coll -> Optional.ofNullable(coll.aggregate(COLL_STATS_AGGREGATION_PIPELINE) |
| 132 | + .allowDiskUse(readConfig.getAggregationAllowDiskUse()) |
| 133 | + .comment(readConfig.getComment()) |
| 134 | + .first()) |
| 135 | + .orElseGet(BsonDocument::new)); |
112 | 136 | } catch (RuntimeException ex) {
|
113 | 137 | if (ex instanceof MongoCommandException
|
114 | 138 | && (ex.getMessage().contains("not found.")
|
@@ -138,5 +162,24 @@ public static List<String> getPreferredLocations(final ReadConfig readConfig) {
|
138 | 162 | .collect(Collectors.toList());
|
139 | 163 | }
|
140 | 164 |
|
| 165 | + /** |
| 166 | + * Returns the average document size in a collection, either using {@code avgObjSize} |
| 167 | + * or calculated from document count and collection size. |
| 168 | + * |
| 169 | + * @param storageStats the storage stats of a collection |
| 170 | + * @param documentCount the number of documents in a collection |
| 171 | + * @return the average document size in a collection |
| 172 | + */ |
| 173 | + public static double averageDocumentSize(final BsonDocument storageStats, final long documentCount) { |
| 174 | + if (storageStats.containsKey("avgObjSize")) { |
| 175 | + return storageStats.get("avgObjSize", new BsonInt32(0)).asNumber().doubleValue(); |
| 176 | + } |
| 177 | + |
| 178 | + long size = storageStats.getNumber("size").longValue(); |
| 179 | + double avgObjSizeInBytes = Math.floor(size / documentCount); |
| 180 | + |
| 181 | + return avgObjSizeInBytes; |
| 182 | + } |
| 183 | + |
// Private no-op constructor: this final helper class is not meant to be instantiated.
141 | 184 | private PartitionerHelper() {}
|
142 | 185 | }
|
0 commit comments