Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions config.js
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,9 @@ config.NSFS_GLACIER_DMAPI_PMIG_DAYS = config.S3_RESTORE_REQUEST_MAX_DAYS;
// accidental blocking reads from happening.
config.NSFS_GLACIER_DMAPI_FINALIZE_RESTORE_ENABLE = false;

config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM = false;
config.NSFS_GLACIER_RECLAIM_INTERVAL = 15 * 60 * 1000;

config.NSFS_STATFS_CACHE_SIZE = 10000;
config.NSFS_STATFS_CACHE_EXPIRY_MS = 1 * 1000;

Expand Down Expand Up @@ -979,7 +982,7 @@ config.NSFS_GLACIER_MIGRATE_LOG_THRESHOLD = 50 * 1024;
config.NSFS_GLACIER_METRICS_STATFS_PATHS = [];
config.NSFS_GLACIER_METRICS_STATFS_INTERVAL = 60 * 1000; // Refresh statfs value every minute

/**
/**
* NSFS_GLACIER_RESERVED_BUCKET_TAGS defines an object of bucket tags which will be reserved
* by the system and PUT operations for them via S3 API would be limited - as in they would be
* mutable only if specified and only under certain conditions.
Expand All @@ -990,7 +993,7 @@ config.NSFS_GLACIER_METRICS_STATFS_INTERVAL = 60 * 1000; // Refresh statfs value
* default: any,
* event: boolean
* }>}
*
*
* @example
* {
'deep-archive-copies': {
Expand Down
3 changes: 3 additions & 0 deletions src/cmd/manage_nsfs.js
Original file line number Diff line number Diff line change
Expand Up @@ -879,6 +879,9 @@ async function manage_glacier_operations(action, argv) {
case GLACIER_ACTIONS.EXPIRY:
await manage_nsfs_glacier.process_expiry();
break;
case GLACIER_ACTIONS.RECLAIM:
await manage_nsfs_glacier.process_reclaim();
break;
default:
throw_cli_error(ManageCLIError.InvalidGlacierOperation);
}
Expand Down
2 changes: 2 additions & 0 deletions src/manage_nsfs/manage_nsfs_constants.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ const GLACIER_ACTIONS = Object.freeze({
MIGRATE: 'migrate',
RESTORE: 'restore',
EXPIRY: 'expiry',
RECLAIM: 'reclaim',
});

const DIAGNOSE_ACTIONS = Object.freeze({
Expand Down Expand Up @@ -72,6 +73,7 @@ const VALID_OPTIONS_GLACIER = {
'migrate': new Set([ CONFIG_ROOT_FLAG]),
'restore': new Set([ CONFIG_ROOT_FLAG]),
'expiry': new Set([ CONFIG_ROOT_FLAG]),
'reclaim': new Set([ CONFIG_ROOT_FLAG]),
};

const VALID_OPTIONS_DIAGNOSE = {
Expand Down
14 changes: 14 additions & 0 deletions src/manage_nsfs/manage_nsfs_glacier.js
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ async function process_expiry() {
}
}

/**
 * process_reclaim drives one iteration of the glacier RECLAIM flow.
 *
 * It skips the run entirely when the backend reports low free space or
 * when the configured reclaim interval has not yet elapsed (as recorded
 * in the reclaim timestamp file). After a successful run it stamps the
 * current time so the next invocation honours the interval again.
 */
async function process_reclaim() {
    const fs_context = native_fs_utils.get_process_fs_context();
    const backend = Glacier.getBackend();

    // low_free_space takes priority - avoid reclaiming under pressure
    if (await backend.low_free_space()) return;

    const interval_elapsed = await time_exceeded(
        fs_context, config.NSFS_GLACIER_RECLAIM_INTERVAL, Glacier.RECLAIM_TIMESTAMP_FILE);
    if (!interval_elapsed) return;

    await backend.perform(prepare_galcier_fs_context(fs_context), "RECLAIM");

    const timestamp_file_path = path.join(config.NSFS_GLACIER_LOGS_DIR, Glacier.RECLAIM_TIMESTAMP_FILE);
    await record_current_time(fs_context, timestamp_file_path);
}

/**
* time_exceeded returns true if the time between last run recorded in the given
Expand Down Expand Up @@ -129,3 +142,4 @@ function prepare_galcier_fs_context(fs_context) {
exports.process_migrations = process_migrations;
exports.process_restores = process_restores;
exports.process_expiry = process_expiry;
exports.process_reclaim = process_reclaim;
3 changes: 3 additions & 0 deletions src/native/fs/fs_napi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,11 @@
#define GPFS_DMAPI_DOT_IBMOBJ_EA "IBMObj"
#define GPFS_DMAPI_DOT_IBMPMIG_EA "IBMPMig"
#define GPFS_DMAPI_DOT_IBMTPS_EA "IBMTPS"
#define GPFS_DMAPI_DOT_IBMUID_EA "IBMUID"
#define GPFS_DMAPI_XATTR_TAPE_INDICATOR GPFS_DMAPI_XATTR_PREFIX "." GPFS_DMAPI_DOT_IBMOBJ_EA
#define GPFS_DMAPI_XATTR_TAPE_PREMIG GPFS_DMAPI_XATTR_PREFIX "." GPFS_DMAPI_DOT_IBMPMIG_EA
#define GPFS_DMAPI_XATTR_TAPE_TPS GPFS_DMAPI_XATTR_PREFIX "." GPFS_DMAPI_DOT_IBMTPS_EA
#define GPFS_DMAPI_XATTR_TAPE_UID GPFS_DMAPI_XATTR_PREFIX "." GPFS_DMAPI_DOT_IBMUID_EA

// This macro should be used after openning a file
// it will autoclose the file using AutoCloser and will throw an error in case of failures
Expand Down Expand Up @@ -255,6 +257,7 @@ const static std::vector<std::string> GPFS_DMAPI_XATTRS{
GPFS_DMAPI_XATTR_TAPE_INDICATOR,
GPFS_DMAPI_XATTR_TAPE_PREMIG,
GPFS_DMAPI_XATTR_TAPE_TPS,
GPFS_DMAPI_XATTR_TAPE_UID,
};
const static std::vector<std::string> USER_XATTRS{
"user.content_type",
Expand Down
37 changes: 34 additions & 3 deletions src/sdk/glacier.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class Glacier {
static MIGRATE_TIMESTAMP_FILE = 'migrate.timestamp';
static RESTORE_TIMESTAMP_FILE = 'restore.timestamp';
static EXPIRY_TIMESTAMP_FILE = 'expiry.timestamp';
static RECLAIM_TIMESTAMP_FILE = 'reclaim.timestamp';

/**
* XATTR_RESTORE_REQUEST is set to a NUMBER (expiry days) by `restore_object` when
Expand Down Expand Up @@ -71,10 +72,21 @@ class Glacier {
*/
static GPFS_DMAPI_XATTR_TAPE_TPS = 'dmapi.IBMTPS';

/**
 * GPFS_DMAPI_XATTR_TAPE_UID xattr contains the unique ID (UID) assigned to the file
*
* Example: `1284427297506873931-5499940123615166566-1799306066-279655-0` (here 279655 is
* the inode number)
*
* NOTE: If IBMUID EA exists, that means the file is either migrated or premigrated.
*/
static GPFS_DMAPI_XATTR_TAPE_UID = 'dmapi.IBMUID';

static MIGRATE_WAL_NAME = 'migrate';
static MIGRATE_STAGE_WAL_NAME = 'stage.migrate';
static RESTORE_WAL_NAME = 'restore';
static RESTORE_STAGE_WAL_NAME = 'stage.restore';
static RECLAIM_WAL_NAME = 'reclaim';

/** @type {nb.RestoreState} */
static RESTORE_STATUS_CAN_RESTORE = 'CAN_RESTORE';
Expand All @@ -86,6 +98,7 @@ class Glacier {
static GLACIER_CLUSTER_LOCK = 'glacier.cluster.lock';
static GLACIER_MIGRATE_CLUSTER_LOCK = 'glacier.cluster.migrate.lock';
static GLACIER_RESTORE_CLUSTER_LOCK = 'glacier.cluster.restore.lock';
static GLACIER_RECLAIM_CLUSTER_LOCK = 'glacier.cluster.reclaim.lock';
static GLACIER_SCAN_LOCK = 'glacier.scan.lock';

/**
Expand Down Expand Up @@ -181,6 +194,20 @@ class Glacier {
throw new Error('Unimplementented');
}

/**
 * reclaim cleans up unindexed items in the underlying
 * glacier storage
 *
 * NOTE: This needs to be implemented by each backend; this base
 * implementation always throws.
 * @param {nb.NativeFSContext} fs_context
 * @param {LogFile} log_file the reclaim WAL file to process
 * @param {(entry: string) => Promise<void>} failure_recorder records entries that failed so they can be retried
 * @returns {Promise<boolean>} true if processing succeeded and the WAL file may be deleted
 */
async reclaim(fs_context, log_file, failure_recorder) {
    throw new Error('Unimplementented');
}

/**
* low_free_space must return true if the backend has
* low free space.
Expand All @@ -199,7 +226,7 @@ class Glacier {

/**
* @param {nb.NativeFSContext} fs_context
* @param {"MIGRATION" | "RESTORE" | "EXPIRY"} type
* @param {"MIGRATION" | "RESTORE" | "EXPIRY" | "RECLAIM"} type
*/
async perform(fs_context, type) {
const lock_path = lock_file => path.join(config.NSFS_GLACIER_LOGS_DIR, lock_file);
Expand All @@ -217,8 +244,8 @@ class Glacier {
* ) => Promise<boolean>} log_cb */

/**
* @param {string} namespace
* @param {log_cb} cb
* @param {string} namespace
* @param {log_cb} cb
*/
const process_glacier_logs = async (namespace, cb) => {
const logs = new PersistentLogger(
Expand Down Expand Up @@ -266,6 +293,10 @@ class Glacier {
this.restore.bind(this),
Glacier.GLACIER_RESTORE_CLUSTER_LOCK,
);
} else if (type === 'RECLAIM') {
await native_fs_utils.lock_and_run(fs_context, lock_path(Glacier.GLACIER_RECLAIM_CLUSTER_LOCK), async () => {
await process_glacier_logs(Glacier.RECLAIM_WAL_NAME, this.reclaim.bind(this));
});
}
}

Expand Down
50 changes: 50 additions & 0 deletions src/sdk/glacier_tapecloud.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ function get_bin_path(bin_name) {
class TapeCloudUtils {
static MIGRATE_SCRIPT = 'migrate';
static RECALL_SCRIPT = 'recall';
static RECLAIM_SCRIPT = 'reclaim';
static TASK_SHOW_SCRIPT = 'task_show';
static PROCESS_EXPIRED_SCRIPT = 'process_expired';
static LOW_FREE_SPACE_SCRIPT = 'low_free_space';
Expand Down Expand Up @@ -182,6 +183,29 @@ class TapeCloudUtils {
}
}

/**
* reclaim takes name of a file which contains the list
* of the files to be reclaimed.
*
* reclaim doesn't perform any failure handling and expects the
* underlying scripts to take care of retries.
*
* @param {string} file filename
* @returns {Promise<boolean>} Indicates success if true
*/
static async reclaim(file) {
try {
dbg.log1("Starting reclaim for file", file);
const out = await exec(`${get_bin_path(TapeCloudUtils.RECLAIM_SCRIPT)} ${file}`, { return_stdout: true });
dbg.log4("reclaim finished with:", out);
dbg.log1("Finished reclaim for file", file);
} catch (error) {
dbg.error("Failed to run TapeCloudUtils.reclaim for file:", file, "due to error:", error);
}

return true;
}
Comment on lines +196 to +207
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Propagate reclaim failures so WAL entries aren’t lost.

TapeCloudUtils.reclaim() always resolves to true, even when exec() fails. That tells PersistentLogger.process() the reclaim batch succeeded, so the WAL file is deleted and the entries are never retried. Please return false (or rethrow) on failure so the WAL stays on disk for the next run.

     static async reclaim(file) {
         try {
             dbg.log1("Starting reclaim for file", file);
             const out = await exec(`${get_bin_path(TapeCloudUtils.RECLAIM_SCRIPT)} ${file}`, { return_stdout: true });
             dbg.log4("reclaim finished with:", out);
             dbg.log1("Finished reclaim for file", file);
-        } catch (error) {
-            dbg.error("Failed to run TapeCloudUtils.reclaim for file:", file, "due to error:", error);
+            return true;
+        } catch (error) {
+            dbg.error("Failed to run TapeCloudUtils.reclaim for file:", file, "due to error:", error);
+            return false;
         }
-
-        return true;
     }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
static async reclaim(file) {
try {
dbg.log1("Starting reclaim for file", file);
const out = await exec(`${get_bin_path(TapeCloudUtils.RECLAIM_SCRIPT)} ${file}`, { return_stdout: true });
dbg.log4("reclaim finished with:", out);
dbg.log1("Finished reclaim for file", file);
} catch (error) {
dbg.error("Failed to run TapeCloudUtils.reclaim for file:", file, "due to error:", error);
}
return true;
}
static async reclaim(file) {
try {
dbg.log1("Starting reclaim for file", file);
const out = await exec(`${get_bin_path(TapeCloudUtils.RECLAIM_SCRIPT)} ${file}`, { return_stdout: true });
dbg.log4("reclaim finished with:", out);
dbg.log1("Finished reclaim for file", file);
return true;
} catch (error) {
dbg.error("Failed to run TapeCloudUtils.reclaim for file:", file, "due to error:", error);
return false;
}
}
🤖 Prompt for AI Agents
In src/sdk/glacier_tapecloud.js around lines 196-207 the async reclaim() always
returns true even when exec() throws, causing PersistentLogger to treat failures
as successes and delete WAL entries; change the catch to return false (or
rethrow) so failures propagate — keep the existing error logging, and ensure the
function only returns true on successful exec and returns false inside the catch
block so the WAL is retained for retry.


static async process_expired() {
dbg.log1("Starting process_expired");
const out = await exec(`${get_bin_path(TapeCloudUtils.PROCESS_EXPIRED_SCRIPT)}`, { return_stdout: true });
Expand Down Expand Up @@ -444,6 +468,21 @@ class TapeCloudGlacier extends Glacier {
}
}

/**
*
* @param {nb.NativeFSContext} fs_context
* @param {LogFile} log_file log filename
* @param {(entry: string) => Promise<void>} failure_recorder
* @returns {Promise<boolean>}
*/
async reclaim(fs_context, log_file, failure_recorder) {
try {
return this._reclaim(log_file.log_path);
} catch (error) {
dbg.error('unexpected error occured while running tapecloud.reclaim:', error);
}
}

async low_free_space() {
const result = await exec(get_bin_path(TapeCloudUtils.LOW_FREE_SPACE_SCRIPT), { return_stdout: true });
return result.toLowerCase().trim() === 'true';
Expand Down Expand Up @@ -511,6 +550,17 @@ class TapeCloudGlacier extends Glacier {
return TapeCloudUtils.process_expired();
}

/**
* _reclaim should perform object reclaim from tape
*
* NOTE: Must be overwritten for tests
* @param {string} file
* @returns {Promise<boolean>}
*/
async _reclaim(file) {
return TapeCloudUtils.reclaim(file);
}

/**
* finalizes the restore by setting the required EAs
*
Expand Down
58 changes: 55 additions & 3 deletions src/sdk/namespace_fs.js
Original file line number Diff line number Diff line change
Expand Up @@ -1407,6 +1407,8 @@ class NamespaceFS {
const is_disabled_dir_content = this._is_directory_content(file_path, params.key) && this._is_versioning_disabled();

const stat = await target_file.stat(fs_context);
const file_path_stat = config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM &&
await nb_native().fs.stat(fs_context, file_path).catch(_.noop);
this._verify_encryption(params.encryption, this._get_encryption_info(stat));

const copy_xattr = params.copy_source && params.xattr_copy;
Expand Down Expand Up @@ -1455,6 +1457,10 @@ class NamespaceFS {
dbg.log1('NamespaceFS._finish_upload:', open_mode, file_path, upload_path, fs_xattr);

if (!same_inode && !part_upload) {
if (file_path_stat) {
await this.append_to_reclaim_wal(fs_context, file_path, file_path_stat);
}

await this._move_to_dest(fs_context, upload_path, file_path, target_file, open_mode, params.key);
}

Expand Down Expand Up @@ -2126,7 +2132,16 @@ class NamespaceFS {
if (files) await this._close_files(fs_context, files.delete_version, undefined, true);
}
} else {
await native_fs_utils.unlink_ignore_enoent(fs_context, file_path);
try {
const stat = config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM &&
await nb_native().fs.stat(fs_context, file_path).catch(dbg.warn.bind(this));
await nb_native().fs.unlink(fs_context, file_path);
if (stat) {
await this.append_to_reclaim_wal(fs_context, file_path, stat);
}
} catch (err) {
if (err.code !== 'ENOENT' && err.code !== 'EISDIR') throw err;
}
Comment on lines +2135 to +2144
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Fix error handling in non-lifecycle deletion reclaim logging.

The current error handling has several issues:

  1. .catch(dbg.warn.bind(this)) on the stat call (line 2137) returns undefined on error, silently suppressing stat failures while still attempting unlink
  2. If append_to_reclaim_wal throws (no try-catch around it), it will fail the entire deletion operation
  3. The outer try-catch only handles ENOENT/EISDIR from unlink, not from reclaim logging

Apply this pattern to ensure reclaim logging failures don't break deletions:

-        try {
-            const stat = config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM &&
-                await nb_native().fs.stat(fs_context, file_path).catch(dbg.warn.bind(this));
-            await nb_native().fs.unlink(fs_context, file_path);
-            if (stat) {
-                await this.append_to_reclaim_wal(fs_context, file_path, stat);
-            }
-        } catch (err) {
-            if (err.code !== 'ENOENT' && err.code !== 'EISDIR') throw err;
-        }
+        try {
+            let stat;
+            if (config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM) {
+                try {
+                    stat = await nb_native().fs.stat(fs_context, file_path);
+                } catch (err) {
+                    dbg.warn('_delete_single_object: failed to stat for reclaim', file_path, err);
+                }
+            }
+            await nb_native().fs.unlink(fs_context, file_path);
+            if (stat) {
+                await this.append_to_reclaim_wal(fs_context, file_path, stat).catch(err => {
+                    dbg.warn('_delete_single_object: failed to log reclaim', file_path, err);
+                });
+            }
+        } catch (err) {
+            if (err.code !== 'ENOENT' && err.code !== 'EISDIR') throw err;
+        }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
try {
const stat = config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM &&
await nb_native().fs.stat(fs_context, file_path).catch(dbg.warn.bind(this));
await nb_native().fs.unlink(fs_context, file_path);
if (stat) {
await this.append_to_reclaim_wal(fs_context, file_path, stat);
}
} catch (err) {
if (err.code !== 'ENOENT' && err.code !== 'EISDIR') throw err;
}
try {
let stat;
if (config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM) {
try {
stat = await nb_native().fs.stat(fs_context, file_path);
} catch (err) {
dbg.warn('_delete_single_object: failed to stat for reclaim', file_path, err);
}
}
await nb_native().fs.unlink(fs_context, file_path);
if (stat) {
await this.append_to_reclaim_wal(fs_context, file_path, stat).catch(err => {
dbg.warn('_delete_single_object: failed to log reclaim', file_path, err);
});
}
} catch (err) {
if (err.code !== 'ENOENT' && err.code !== 'EISDIR') throw err;
}

}

await this._delete_path_dirs(file_path, fs_context);
Expand Down Expand Up @@ -3715,6 +3730,28 @@ class NamespaceFS {
await NamespaceFS.restore_wal.append(Glacier.getBackend().encode_log(entry));
}

/**
*
* @param {nb.NativeFSContext} fs_context
* @param {string} file_path
* @param {nb.NativeFSStats} [stat]
* @returns
*/
async append_to_reclaim_wal(fs_context, file_path, stat) {
if (!config.NSFS_GLACIER_LOGS_ENABLED || !config.NSFS_GLACIER_DMAPI_ENABLE_TAPE_RECLAIM) return;

if (!stat) {
stat = await nb_native().fs.stat(fs_context, file_path);
}

const data = JSON.stringify({
full_path: file_path,
logical_size: stat.size,
ea: stat.xattr,
});
await NamespaceFS.reclaim_wal.append(data);
}

static get migrate_wal() {
if (!NamespaceFS._migrate_wal) {
NamespaceFS._migrate_wal = new PersistentLogger(config.NSFS_GLACIER_LOGS_DIR, Glacier.MIGRATE_WAL_NAME, {
Expand All @@ -3737,6 +3774,17 @@ class NamespaceFS {
return NamespaceFS._restore_wal;
}

static get reclaim_wal() {
if (!NamespaceFS._reclaim_wal) {
NamespaceFS._reclaim_wal = new PersistentLogger(config.NSFS_GLACIER_LOGS_DIR, Glacier.RECLAIM_WAL_NAME, {
poll_interval: config.NSFS_GLACIER_LOGS_POLL_INTERVAL,
locking: 'SHARED',
});
}

return NamespaceFS._reclaim_wal;
}

////////////////////////////
// LIFECYLE HELPERS //
////////////////////////////
Expand All @@ -3763,6 +3811,9 @@ class NamespaceFS {
this._check_lifecycle_filter_before_deletion(params, stat);
const bucket_tmp_dir_path = this.get_bucket_tmpdir_full_path();
await native_fs_utils.safe_unlink(fs_context, file_path, stat, { dir_file, src_file }, bucket_tmp_dir_path);
if (!is_dir_content) {
await this.append_to_reclaim_wal(fs_context, file_path, src_stat).catch(dbg.warn.bind(this));
}
} catch (err) {
dbg.log0('_verify_lifecycle_filter_and_unlink err', err.code, err, file_path);
if (err.code !== 'ENOENT' && err.code !== 'EISDIR') throw err;
Expand Down Expand Up @@ -3809,7 +3860,8 @@ NamespaceFS._migrate_wal = null;
/** @type {PersistentLogger} */
NamespaceFS._restore_wal = null;

/** @type {PersistentLogger} */
NamespaceFS._reclaim_wal = null;

module.exports = NamespaceFS;
module.exports.multi_buffer_pool = multi_buffer_pool;


Loading