wasm-split: Add fuzzer support (#7014)

kripken · web-flow · commit 679c26faec1a · 2024-10-18T12:36:08.000-07:00
The support is added but not yet enabled, as it is still finding bugs.

The first part here is to add a Split testcase handler to the fuzzer,
which runs a wasm, then runs it again after splitting it and linking
the two halves at runtime, and checks for different results.

The second part is support for linking two modules at runtime
in the fuzzer's JS code, that works in tandem with the first part.
New options are added to load and link a second wasm, and to
pick which exports to run.
diff --git a/scripts/fuzz_opt.py b/scripts/fuzz_opt.py
@@ -743,8 +743,8 @@ def run_d8_js(js, args=[], liftoff=True):
 FUZZ_SHELL_JS = in_binaryen('scripts', 'fuzz_shell.js')
 
 
-def run_d8_wasm(wasm, liftoff=True):
-    return run_d8_js(FUZZ_SHELL_JS, [wasm], liftoff=liftoff)
+def run_d8_wasm(wasm, liftoff=True, args=None):
+    """Run `wasm` in d8 through the fuzz shell JS and return run_d8_js's result.
+
+    `args` is an optional list of extra arguments that are passed to the
+    shell after the wasm file itself. When omitted, no extra arguments are
+    passed.
+    """
+    # Use a None sentinel rather than a mutable [] default, which would be a
+    # single list object shared by every call.
+    extra_args = [] if args is None else args
+    return run_d8_js(FUZZ_SHELL_JS, [wasm] + extra_args, liftoff=liftoff)
 
 
 def all_disallowed(features):
@@ -1391,6 +1391,111 @@ def handle(self, wasm):
         compare_between_vms(output, merged_output, 'Merge')
 
 
+# Matches a line that begins with a single space followed by "(func $", and
+# captures the run of non-whitespace characters after the "$" (the function
+# name).
+FUNC_NAMES_REGEX = re.compile(r'\n [(]func [$](\S+)')
+
+
+# Tests wasm-split
+class Split(TestCaseHandler):
+    """Fuzz wasm-split: compare output before and after splitting.
+
+    The wasm is first run as-is. It is then split into a primary and a
+    secondary module, which are run together (linked by the JS shell), and
+    the two outputs are compared.
+    """
+
+    frequency = 1  # TODO: adjust lower when we actually enable this
+
+    def handle(self, wasm):
+        """Split `wasm`, run the linked halves, and compare to the original.
+
+        Returns early, before splitting, if no functions were randomly picked
+        to be split out.
+        """
+        # get the list of function names, some of which we will decide to split
+        # out
+        wat = run([in_bin('wasm-dis'), wasm] + FEATURE_OPTS)
+        all_funcs = re.findall(FUNC_NAMES_REGEX, wat)
+
+        # get the original output before splitting
+        output = run_d8_wasm(wasm)
+        output = fix_output(output)
+
+        # find the names of the exports. we need this because when we split the
+        # module then new exports appear to connect the two halves of the
+        # original module. we do not want to call all the exports on the new
+        # primary module, but only the original ones.
+        exports = []
+        for line in output.splitlines():
+            if FUZZ_EXEC_CALL_PREFIX in line:
+                exports.append(get_export_from_call_line(line))
+
+        # pick which to split out, with a random rate of picking (biased towards
+        # 0.5).
+        rate = (random.random() + random.random()) / 2
+        split_funcs = []
+        for func in all_funcs:
+            if random.random() < rate:
+                split_funcs.append(func)
+
+        if not split_funcs:
+            # nothing to split out
+            return
+
+        # split the wasm into two
+        primary = wasm + '.primary.wasm'
+        secondary = wasm + '.secondary.wasm'
+
+        # we require reference types, because that allows us to create our own
+        # table. without that we use the existing table, and that may interact
+        # with user code in odd ways (it really only works with the particular
+        # form of table+segments that LLVM emits, and not with random fuzzer
+        # content).
+        split_feature_opts = FEATURE_OPTS + ['--enable-reference-types']
+
+        run([in_bin('wasm-split'), wasm, '--split',
+             '--split-funcs', ','.join(split_funcs),
+             '--primary-output', primary,
+             '--secondary-output', secondary] + split_feature_opts)
+
+        # sometimes also optimize the split modules
+        optimized = False
+
+        def optimize(name):
+            # returns the path of the module to run: |name| itself when we
+            # must not optimize, or a new '.opt.wasm' file otherwise. also
+            # sets |optimized| in the enclosing scope when it optimizes.
+
+            # do not optimize if it would change the ABI
+            if CLOSED_WORLD:
+                return name
+            # TODO: use other optimizations here, but we'd need to be careful of
+            #       anything that can alter the ABI, and also current
+            #       limitations of open-world optimizations (see discussion in
+            #       https://github.com/WebAssembly/binaryen/pull/6660)
+            opts = ['-O3']
+            new_name = name + '.opt.wasm'
+            run([in_bin('wasm-opt'), name, '-o', new_name, '-all'] + opts + split_feature_opts)
+            nonlocal optimized
+            optimized = True
+            return new_name
+
+        # each half is independently optimized with 50% probability
+        if random.random() < 0.5:
+            primary = optimize(primary)
+        if random.random() < 0.5:
+            secondary = optimize(secondary)
+
+        # prepare the list of exports to call. the format is
+        #
+        #  exports:A,B,C
+        #
+        exports_to_call = 'exports:' + ','.join(exports)
+
+        # get the output from the split modules, linking them using JS
+        # TODO run liftoff/turboshaft/etc.
+        linked_output = run_d8_wasm(primary, args=[secondary, exports_to_call])
+        linked_output = fix_output(linked_output)
+
+        # see D8.can_compare_to_self: we cannot compare optimized outputs if
+        # NaNs are allowed, as the optimizer can modify NaNs differently than
+        # the JS engine.
+        if not (NANS and optimized):
+            compare_between_vms(output, linked_output, 'Split')
+
+    def can_run_on_feature_opts(self, feature_opts):
+        """Return whether this handler can run in the current configuration.
+
+        Returns False when LEGALIZE is not set; otherwise returns the result
+        of all_disallowed(['shared-everything']).
+        """
+        # to run the split wasm we use JS, that is, JS links the exports of one
+        # to the imports of the other, etc. since we run in JS, the wasm must be
+        # valid for JS.
+        if not LEGALIZE:
+            return False
+
+        # see D8.can_run
+        return all_disallowed(['shared-everything'])
+
+
 # Check that the text format round-trips without error.
 class RoundtripText(TestCaseHandler):
     frequency = 0.05
@@ -1413,6 +1518,8 @@ def handle(self, wasm):
     TrapsNeverHappen(),
     CtorEval(),
     Merge(),
+    # TODO: enable when stable enough, and adjust |frequency| (see above)
+    # Split(),
     RoundtripText()
 ]
 
diff --git a/scripts/fuzz_shell.js b/scripts/fuzz_shell.js
@@ -1,43 +1,54 @@
-// Shell integration.
-if (typeof console === 'undefined') {
-  console = { log: print };
-}
-var tempRet0;
-var binary;
-if (typeof process === 'object' && typeof require === 'function' /* node.js detection */) {
-  var args = process.argv.slice(2);
-  binary = require('fs').readFileSync(args[0]);
-  if (!binary.buffer) binary = new Uint8Array(binary);
+// Shell integration: find argv and set up readBinary().
+var argv;
+var readBinary;
+if (typeof process === 'object' && typeof require === 'function') {
+  // Node.js.
+  argv = process.argv.slice(2);
+  readBinary = function(name) {
+    var data = require('fs').readFileSync(name);
+    if (!data.buffer) data = new Uint8Array(data);
+    return data;
+  };
 } else {
-  var args;
+  // A shell like D8.
   if (typeof scriptArgs != 'undefined') {
-    args = scriptArgs;
+    argv = scriptArgs;
   } else if (typeof arguments != 'undefined') {
-    args = arguments;
-  }
-  if (typeof readbuffer === 'function') {
-    binary = new Uint8Array(readbuffer(args[0]));
-  } else {
-    binary = read(args[0], 'binary');
+    argv = arguments;
   }
+  readBinary = function(name) {
+    if (typeof readbuffer === 'function') {
+      return new Uint8Array(readbuffer(name));
+    } else {
+      return read(name, 'binary');
+    }
+  };
+}
+
+// We are given the binary to run as a parameter.
+var binary = readBinary(argv[0]);
+
+// Normally we call all the exports of the given wasm file. But, if we are
+// passed a final parameter in the form of "exports:X,Y,Z" then we call
+// specifically the exports X, Y, and Z.
+var exportsToCall;
+if (argv[argv.length - 1].startsWith('exports:')) {
+  exportsToCall = argv[argv.length - 1].substr('exports:'.length).split(',');
+  argv.pop();
+}
+
+// If a second parameter is given, it is a second binary that we will link in
+// with the first.
+var secondBinary;
+if (argv[1]) {
+  secondBinary = readBinary(argv[1]);
 }
 
 // Utilities.
 function assert(x, y) {
   if (!x) throw (y || 'assertion failed');// + new Error().stack;
 }
 
-// Deterministic randomness.
-var detrand = (function() {
-  var hash = 5381; // TODO DET_RAND_SEED;
-  var x = 0;
-  return function() {
-    hash = (((hash << 5) + hash) ^ (x & 0xff)) >>> 0;
-    x = (x + 1) % 256;
-    return (hash % 256) / 256;
-  };
-})();
-
 // Print out a value in a way that works well for fuzzing.
 function printed(x, y) {
   if (typeof y !== 'undefined') {
@@ -124,6 +135,7 @@ function logValue(x, y) {
 }
 
 // Set up the imports.
+var tempRet0;
 var imports = {
   'fuzzing-support': {
     'log-i32': logValue,
@@ -151,6 +163,24 @@ if (typeof WebAssembly.Tag !== 'undefined') {
   };
 }
 
+// If a second binary will be linked in then set up the imports for
+// placeholders. Any import like  (import "placeholder" "0" (func ..  will be
+// provided by the secondary module, and must be called using an indirection.
+if (secondBinary) {
+  imports['placeholder'] = new Proxy({}, {
+    get(target, prop, receiver) {
+      // Return a function that throws. We could do an indirect call using the
+      // exported table, but as we immediately link in the secondary module,
+      // these stubs will not be called (they are written to the table, and the
+      // secondary module overwrites them). We do need to return something so
+      // the primary module links without erroring, though.
+      return () => {
+        throw 'proxy stub should not be called';
+      }
+    }
+  });
+}
+
 // Create the wasm.
 var module = new WebAssembly.Module(binary);
 
@@ -165,17 +195,32 @@ try {
 // Handle the exports.
 var exports = instance.exports;
 
-var view;
+// Link in a second module, if one was provided.
+if (secondBinary) {
+  var secondModule = new WebAssembly.Module(secondBinary);
 
-// Recreate the view. This is important both initially and after a growth.
-function refreshView() {
-  if (exports.memory) {
-    view = new Int32Array(exports.memory.buffer);
+  // The secondary module just needs to import the primary one: all original
+  // imports it might have needed were exported from there.
+  var secondImports = {'primary': exports};
+  var secondInstance;
+  try {
+    secondInstance = new WebAssembly.Instance(secondModule, secondImports);
+  } catch (e) {
+    console.log('exception thrown: failed to instantiate second module');
+    quit();
   }
 }
 
 // Run the wasm.
-for (var e in exports) {
+if (!exportsToCall) {
+  // We were not told specific exports, so call them all.
+  exportsToCall = [];
+  for (var e in exports) {
+    exportsToCall.push(e);
+  }
+}
+
+for (var e of exportsToCall) {
   if (typeof exports[e] !== 'function') {
     continue;
   }