/*
 * Copyright 2011-2013 10gen Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.mongodb.hadoop.output;

import com.mongodb.BasicDBObject;
import com.mongodb.BulkUpdateRequestBuilder;
import com.mongodb.BulkWriteOperation;
import com.mongodb.BulkWriteRequestBuilder;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.MongoException;
import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.io.MongoUpdateWritable;
import com.mongodb.hadoop.io.MongoWritableTypes;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

/**
 * An {@link OutputCommitter} that commits task output to MongoDB. Output
 * records are staged in a temporary file while the task runs, then read back
 * and applied to the output collection as bulk operations when the task
 * commits.
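 *
 * <p>A minimal job wiring sketch (assuming this committer is supplied by
 * {@code MongoOutputFormat}; the URI and job name below are illustrative
 * only):
 *
 * <pre>
 * Configuration conf = new Configuration();
 * // Hypothetical target collection; replace with a real URI.
 * MongoConfigUtil.setOutputURI(
 *     conf, "mongodb://localhost:27017/mydb.mycollection");
 * Job job = Job.getInstance(conf, "mongo-output-example");
 * job.setOutputFormatClass(MongoOutputFormat.class);
 * </pre>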
 */
public class MongoOutputCommitter extends OutputCommitter {

    // Name of the scratch directory segment under which a task attempt's
    // serialized output is staged before being committed to MongoDB.
    public static final String TEMP_DIR_NAME = "_MONGO_OUT_TEMP";
    private static final Log LOG = LogFactory.getLog(MongoOutputCommitter.class);
    private DBCollection collection;

    public MongoOutputCommitter() {}

    @Override
    public void setupJob(final JobContext jobContext) {
        LOG.info("Setting up job.");
    }

    @Override
    public void setupTask(final TaskAttemptContext taskContext) {
        LOG.info("Setting up task.");
    }

    @Override
    public boolean needsTaskCommit(
      final TaskAttemptContext taskContext) throws IOException {
        try {
            FileSystem fs = FileSystem.get(taskContext.getConfiguration());
            // Commit is only necessary if there was any output.
            return fs.exists(getTaskAttemptPath(taskContext));
        } catch (IOException e) {
            LOG.error("Could not open filesystem", e);
            throw e;
        }
    }

    @Override
    public void commitTask(
      final TaskAttemptContext taskContext) throws IOException {
        LOG.info("Committing task.");

        collection =
          MongoConfigUtil.getOutputCollection(taskContext.getConfiguration());

        // Get temporary file.
        Path tempFilePath = getTaskAttemptPath(taskContext);
        LOG.info("Committing from temporary file: " + tempFilePath.toString());
        long filePos = 0, fileLen;
        FSDataInputStream inputStream = null;
        try {
            FileSystem fs = FileSystem.get(taskContext.getConfiguration());
            inputStream = fs.open(tempFilePath);
            fileLen = fs.getFileStatus(tempFilePath).getLen();
        } catch (IOException e) {
            LOG.error("Could not open temporary file for committing", e);
            cleanupAfterCommit(inputStream, taskContext);
            throw e;
        }

        int maxDocs = MongoConfigUtil.getBatchSize(
          taskContext.getConfiguration());
        int curBatchSize = 0;

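        // With a batch size of, say, 1000, a task that staged 2500 operations
        // commits them in three batches of 1000, 1000, and 500 documents.
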
        // Remember the ordered/unordered setting so that batches
        // re-initialized below use the same semantics.
        boolean orderedBulkOp =
          MongoConfigUtil.isBulkOrdered(taskContext.getConfiguration());
        BulkWriteOperation bulkOp;
        if (orderedBulkOp) {
            bulkOp = collection.initializeOrderedBulkOperation();
        } else {
            bulkOp = collection.initializeUnorderedBulkOperation();
        }
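
        // Note: an ordered bulk operation stops applying writes at the first
        // error, whereas an unordered one attempts every queued write and
        // reports failures only after the whole batch has run.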

        // Read Writables out of the temporary file.
        BSONWritable bw = new BSONWritable();
        MongoUpdateWritable muw = new MongoUpdateWritable();
        while (filePos < fileLen) {
            try {
                // Determine writable type, and perform corresponding operation
                // on MongoDB.
                int mwType = inputStream.readInt();
                if (MongoWritableTypes.BSON_WRITABLE == mwType) {
                    bw.readFields(inputStream);
                    bulkOp.insert(new BasicDBObject(bw.getDoc().toMap()));
                } else if (MongoWritableTypes.MONGO_UPDATE_WRITABLE == mwType) {
                    muw.readFields(inputStream);
                    DBObject query = new BasicDBObject(muw.getQuery().toMap());
                    DBObject modifiers =
                        new BasicDBObject(muw.getModifiers().toMap());
                    BulkWriteRequestBuilder writeBuilder = bulkOp.find(query);
                    if (muw.isReplace()) {
                        writeBuilder.replaceOne(modifiers);
                    } else if (muw.isUpsert()) {
                        BulkUpdateRequestBuilder updateBuilder =
                          writeBuilder.upsert();
                        if (muw.isMultiUpdate()) {
                            updateBuilder.update(modifiers);
                        } else {
                            updateBuilder.updateOne(modifiers);
                        }
                    } else {
                        // Non-upsert update.
                        if (muw.isMultiUpdate()) {
                            writeBuilder.update(modifiers);
                        } else {
                            writeBuilder.updateOne(modifiers);
                        }
                    }
                } else {
                    throw new IOException("Unrecognized type: " + mwType);
                }
                filePos = inputStream.getPos();
                // Write to MongoDB if the batch is full, or if this is the
                // last operation to be performed for the task.
                if (++curBatchSize >= maxDocs || filePos >= fileLen) {
                    try {
                        bulkOp.execute();
                    } catch (MongoException e) {
                        LOG.error("Could not write to MongoDB", e);
                        throw e;
                    }
                    // Start a new batch with the same ordered/unordered
                    // semantics as before.
                    bulkOp = orderedBulkOp
                      ? collection.initializeOrderedBulkOperation()
                      : collection.initializeUnorderedBulkOperation();
                    curBatchSize = 0;

                    // Signal progress back to Hadoop framework so that we
                    // don't time out.
                    taskContext.progress();
                }
            } catch (IOException e) {
                LOG.error("Error reading from temporary file", e);
                throw e;
            }
        }

        cleanupAfterCommit(inputStream, taskContext);
    }

    @Override
    public void abortTask(final TaskAttemptContext taskContext)
      throws IOException {
        LOG.info("Aborting task.");
        cleanupResources(taskContext);
    }

    /**
     * Helper method to close the FSDataInputStream, clean up any files
     * still left around from map/reduce tasks, and close the MongoClient.
     *
     * @param inputStream the FSDataInputStream to close; may be null.
     * @param context the TaskAttemptContext for the committing task.
     * @throws IOException if the stream cannot be closed or the temporary
     * files cannot be deleted.
     */
    private void cleanupAfterCommit(
        final FSDataInputStream inputStream,
        final TaskAttemptContext context)
        throws IOException {
        if (inputStream != null) {
            try {
                inputStream.close();
            } catch (IOException e) {
                LOG.error("Could not close input stream", e);
                throw e;
            }
        }
        cleanupResources(context);
    }

    /**
     * Delete the task attempt's temporary output files, walking up from the
     * attempt path to (but not including) the base temporary directory, then
     * close the connection to MongoDB.
     *
     * @param taskContext the TaskAttemptContext whose files should be removed.
     * @throws IOException if a temporary file cannot be deleted.
     */
    private void cleanupResources(
      final TaskAttemptContext taskContext)
        throws IOException {
        Path currentPath = getTaskAttemptPath(taskContext);
        Path tempDirectory = getTempDirectory(taskContext.getConfiguration());
        FileSystem fs = FileSystem.get(taskContext.getConfiguration());
        while (!currentPath.equals(tempDirectory)) {
            try {
                fs.delete(currentPath, true);
            } catch (IOException e) {
                LOG.error("Could not delete temporary file: " + currentPath, e);
                throw e;
            }
            currentPath = currentPath.getParent();
        }

        if (collection != null) {
            MongoConfigUtil.close(collection.getDB().getMongo());
        }
    }

    private static Path getTempDirectory(final Configuration config) {
        String basePath = config.get(
          "mapreduce.task.tmp.dir",
          config.get(
            "mapred.child.tmp",
            config.get("hadoop.tmp.dir", "/tmp")));
        return new Path(basePath);
    }
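
    // Illustrative resolution: if neither mapreduce.task.tmp.dir nor
    // mapred.child.tmp is set but hadoop.tmp.dir=/tmp/hadoop, the base
    // directory resolves to /tmp/hadoop; with none of the keys set, it
    // falls back to /tmp.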

    /**
     * Get the Path to where temporary files should be stored for a
     * TaskAttempt, whose TaskAttemptContext is provided.
     *
     * @param context the TaskAttemptContext.
     * @return the Path to the temporary file for the TaskAttempt.
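     *
     * <p>For example (illustrative values only): with {@code hadoop.tmp.dir}
     * set to {@code /tmp/hadoop} and a task attempt id of
     * {@code attempt_201301011200_0001_m_000000_0}, the returned path is
     * {@code /tmp/hadoop/attempt_201301011200_0001_m_000000_0/_MONGO_OUT_TEMP/_out}.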
     */
    public static Path getTaskAttemptPath(
      final TaskAttemptContext context) {
        Configuration config = context.getConfiguration();
        // Try to use the following base temporary directories, in this order:
        // 1. New-style option for task tmp dir
        // 2. Old-style option for task tmp dir
        // 3. Hadoop system-wide tmp dir
        // 4. /tmp
        // Hadoop Paths always use "/" as a directory separator.
        return new Path(
          String.format("%s/%s/%s/_out",
            getTempDirectory(config),
            context.getTaskAttemptID().toString(), TEMP_DIR_NAME));
    }

}