From ac1af7402063fb704f2c343aa1cd912ad9bab282 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Fri, 8 Aug 2025 09:33:15 -0700
Subject: [PATCH 01/19] add supported set of instructions (ref spectec)

---
 theories/datatypes.v | 197 ++++++++++++++++++++++++++++++++++---------
 theories/simd.v      |  13 +++
 2 files changed, 172 insertions(+), 38 deletions(-)
 create mode 100644 theories/simd.v

diff --git a/theories/datatypes.v b/theories/datatypes.v
index dda9c7d5..aef8eb33 100644
--- a/theories/datatypes.v
+++ b/theories/datatypes.v
@@ -33,6 +33,8 @@ Definition u8: Set := N.
 (* 2^32 *)
 Definition u32_bound : N := 4294967296%N.
 
+Definition u8_bound : N := 256%N.
+
 Definition typeidx : Set := u32.
 Definition funcidx : Set := u32.
 Definition tableidx : Set := u32.
@@ -409,57 +411,175 @@ Inductive packed_type : Set := (* tp *)
   | Tp_i32
   .
 
-Inductive shape_vec_i: Set :=
-  | SVI_8_16
-  | SVI_16_8
-  | SVI_32_4
-  | SVI_64_2
+(** SIMD Definitions and Instructions **)
+  
+Inductive vshape_i: Set :=
+  | VSI_8_16
+  | VSI_16_8
+  | VSI_32_4
+  | VSI_64_2
   .
   
-Inductive shape_vec_f: Set :=
-  | SVF_32_4
-  | SVF_64_2
+Inductive vshape_f: Set :=
+  | VSF_32_4
+  | VSF_64_2
   .
   
-Inductive shape_vec : Set := (* shape *)
-  | SV_ishape: shape_vec_i -> shape_vec
-  | SV_fshape: shape_vec_f -> shape_vec
+Inductive vshape : Set := (* shape *)
+  | VS_i: vshape_i -> vshape
+  | VS_f: vshape_f -> vshape
+  .
+
+Inductive vec_half : Set :=
+  | VH_low
+  | VH_high
+  .
+
+Definition laneidx : Set := u8. 
+
+Inductive vvunop : Set :=
+  | VVU_not
+  .
+
+Inductive viunop : Set :=
+  | VUOI_abs
+  | VUOI_neg
+  | VUOI_popcnt (* has shape constraints: i8x16 *)
   .
 
-Inductive unop_vec : Set :=
-  | VUO_not
+Inductive vfunop : Set :=
+  | VUOF_abs
+  | VUOF_neg
+  | VUOF_sqrt
+  | VUOF_ceil
+  | VUOF_floor
+  | VUOF_trunc
+  | VUOF_nearest
+  .
+
+(* Indicate whether a cvtop has a zero flag. *)
+Inductive vec_zero : Set :=
+  | VZ_zero
   .
   
-Inductive binop_vec : Set :=
-  | VBO_and
+Inductive vcvtop : Set :=
+  | VCVT_extend : vec_half -> sx -> vcvtop
+  | VCVT_trunc_sat : sx -> option vec_zero -> vcvtop
+  | VCVT_convert : option vec_half -> sx -> vcvtop
+  | VCVT_demote : vec_zero -> vcvtop (* Superficial tag since zero has to be present for demote; only available on f64x2 *)
+  | VCVT_promote (* only available on LOW half and f32x4 *)
   .
   
-Inductive ternop_vec : Set :=
-  | VTO_bitselect
+Inductive vunop : Set :=
+  | VI_unop: viunop -> vunop
+  | VF_unop: vfunop -> vunop
+  | VV_unop: vvunop -> vunop
+  | V_cvtop: vcvtop -> vunop
+  | V_extadd_pairwise: sx -> vunop (* i16 and i32 only *)
   .
   
-Inductive test_vec : Set :=
-  | VT_any_true
+Inductive vvbinop : Set :=
+  | VVB_and
+  | VVB_andnot
+  | VVB_or
+  | VVB_xor
   .
   
-Inductive shift_vec : Set :=
-  | VSH_any_true
+Inductive vibinop : Set :=
+  | VBOI_add
+  | VBOI_sub
+  | VBOI_add_sat: sx -> vibinop (* i8 and i16 only *)
+  | VBOI_sub_sat: sx -> vibinop (* i8 and i16 only *)
+  | VBOI_mul (* i16/i32/i64 only *)
+  | VBOI_avgr (* unsigned only, i8/i16 only *)
+  | VBOI_q15mulr_sat (* signed only, i8/i16 only *)
+  | VBOI_min: sx -> vibinop (* i8/i16/i32 only *)
+  | VBOI_max: sx -> vibinop (* i8/i16/i32 only *)
+  .
+
+Inductive vfbinop : Set :=
+  | VBOF_add
+  | VBOF_sub
+  | VBOF_mul
+  | VBOF_div
+  | VBOF_min
+  | VBOF_max
+  | VBOF_pmin
+  | VBOF_pmax
   .
 
-Definition laneidx := u8.
+Inductive vextbinop: Set :=
+  | VBOE_extmul: vec_half -> sx -> vextbinop
+  | VBOE_dot (* signed only, i32 only *)
+.
+  
+Inductive vbinop : Set :=
+  | VI_binop: vibinop -> vbinop
+  | VF_binop: vfbinop -> vbinop
+  | VV_binop: vvbinop -> vbinop
+  | VE_binop: vextbinop -> vbinop
+  | V_narrow: vshape_i -> sx -> vbinop (* resulting width needs to be 2x original and at most i32 *)
+  .
 
-Inductive packed_type_vec :=
+(* Technically this is vvternop. But this is the only ternary operation. *)
+Inductive vternop : Set :=
+  | VT_bitselect
+  .
+
+Inductive vvtestop : Set :=
+  | VVT_any_true
+  .
+  
+Inductive vitestop : Set :=
+  | VIT_all_true
+  .
+
+Inductive vtestop : Set :=
+  | VI_testop: vitestop -> vtestop
+  | VV_testop: vvtestop -> vtestop
+  .
+
+Inductive virelop : Set :=
+  | VIR_eq
+  | VIR_ne
+  | VIR_lt: sx -> virelop (* not available on i64x2 unsigned *)
+  | VIR_gt: sx -> virelop (* not available on i64x2 unsigned *)
+  | VIR_le: sx -> virelop (* not available on i64x2 unsigned *)
+  | VIR_ge: sx -> virelop (* not available on i64x2 unsigned *)
+  .
+  
+Inductive vfrelop : Set :=
+  | VFR_eq
+  | VFR_ne
+  | VFR_lt: sx -> vfrelop 
+  | VFR_gt: sx -> vfrelop
+  | VFR_le: sx -> vfrelop
+  | VFR_ge: sx -> vfrelop
+  .
+
+Inductive vrelop : Set :=
+  | VI_relop: virelop -> vrelop
+  | VF_relop: vfrelop -> vrelop
+  .
+
+(* Technically vishiftop, but shifts are only available for integers *)
+Inductive vshiftop : Set :=
+  | VS_shl
+  | VS_shr : sx -> vshiftop
+  .
+
+Inductive vpacked_type :=
   | Tptv_8_8
   | Tptv_16_4
   | Tptv_32_2
 .
 
-Inductive zero_type_vec :=
+Inductive vzero_type :=
   | Tztv_32
   | Tztv_64
 .
 
-Inductive width_vec :=
+Inductive vwidth :=
   | Twv_8
   | Twv_16
   | Twv_32
@@ -467,9 +587,9 @@ Inductive width_vec :=
   .
 
 Inductive load_vec_arg :=
-  | LVA_packed: packed_type_vec -> sx -> load_vec_arg
-  | LVA_zero: zero_type_vec -> load_vec_arg
-  | LVA_splat: width_vec -> load_vec_arg
+  | LVA_packed: vpacked_type -> sx -> load_vec_arg
+  | LVA_zero: vzero_type -> load_vec_arg
+  | LVA_splat: vwidth -> load_vec_arg
   .
 
 Record memarg : Set :=
@@ -502,14 +622,14 @@ Extract lanes: consume a v128 operand and return the numeric value in a given la
 Replace lanes: consume a v128 operand and a numeric value for a given lane, and produce a v128 result.
 **)
   | BI_const_vec : value_vec -> basic_instruction
-  | BI_unop_vec: unop_vec -> basic_instruction
-  | BI_binop_vec: binop_vec -> basic_instruction
-  | BI_ternop_vec: ternop_vec -> basic_instruction
-  | BI_test_vec: test_vec -> basic_instruction
-  | BI_shift_vec: shift_vec -> basic_instruction
-  | BI_splat_vec: shape_vec -> basic_instruction
-  | BI_extract_vec: shape_vec -> option sx -> laneidx -> basic_instruction
-  | BI_replace_vec: shape_vec -> laneidx -> basic_instruction
+  | BI_unop_vec: vshape -> vunop -> basic_instruction
+  | BI_binop_vec: vshape -> vbinop -> basic_instruction
+  | BI_ternop_vec: vshape -> vternop -> basic_instruction
+  | BI_test_vec: vshape -> vtestop -> basic_instruction
+  | BI_shift_vec: vshape -> vshiftop -> basic_instruction
+  | BI_splat_vec: vshape -> basic_instruction
+  | BI_extract_vec: vshape -> option sx -> laneidx -> basic_instruction (* sx only available for i8/i16 *)
+  | BI_replace_vec: vshape -> laneidx -> basic_instruction
 (** std-doc:
 Instructions in this group are concerned with accessing references.
 **)
@@ -546,9 +666,9 @@ Instructions in this group are concerned with linear memory.
   | BI_load : number_type -> option (packed_type * sx) -> memarg -> basic_instruction
   | BI_load_vec : load_vec_arg -> memarg -> basic_instruction
   (* the lane version has a different type signature *)
-  | BI_load_vec_lane : width_vec -> memarg -> laneidx -> basic_instruction
+  | BI_load_vec_lane : vwidth -> memarg -> laneidx -> basic_instruction
   | BI_store : number_type -> option packed_type -> memarg -> basic_instruction
-  | BI_store_vec_lane : width_vec -> memarg -> laneidx -> basic_instruction
+  | BI_store_vec_lane : vwidth -> memarg -> laneidx -> basic_instruction
   | BI_memory_size
   | BI_memory_grow
   | BI_memory_fill
@@ -1069,6 +1189,7 @@ the current function originates from.
 Definition thread : Type := frame * list administrative_instruction.
 
 Definition config_tuple : Type := store_record * thread.
+
 End Host.
 End Memory.
 
diff --git a/theories/simd.v b/theories/simd.v
new file mode 100644
index 00000000..de23690c
--- /dev/null
+++ b/theories/simd.v
@@ -0,0 +1,13 @@
+From mathcomp Require Import ssreflect ssrbool eqtype seq ssrnat.
+From Coq Require Import BinInt BinNat NArith Lia Uint63.
+From Wasm Require Import numerics bytes memory common.
+
+Set Implicit Arguments.
+Unset Strict Implicit.
+Unset Printing Implicit Defensive.
+
+Module Type SIMD.
+
+Parameter v128: Type.
+
+End SIMD.

From 0d38b4bc5af9d6dea91dac3b568f4aedfe59bafc Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Mon, 11 Aug 2025 02:48:29 -0700
Subject: [PATCH 02/19] restructure simd for adding implementation

---
 theories/binary_format_printer.v | 30 +++++++-------
 theories/datatypes.v             | 10 ++---
 theories/interpreter_ctx.v       | 56 ++++++++++++-------------
 theories/operations.v            | 70 ++++++++++++++++----------------
 theories/opsem.v                 | 46 ++++++++++-----------
 theories/pp.v                    | 40 +++++++++---------
 theories/type_checker.v          | 10 ++---
 theories/type_preservation.v     | 20 ++++-----
 theories/typing.v                | 52 ++++++++++++------------
 theories/typing_inversion.v      | 10 ++---
 10 files changed, 172 insertions(+), 172 deletions(-)

diff --git a/theories/binary_format_printer.v b/theories/binary_format_printer.v
index cb5367f3..2af736eb 100644
--- a/theories/binary_format_printer.v
+++ b/theories/binary_format_printer.v
@@ -107,38 +107,38 @@ Definition binary_of_valvec (v: value_vec) :=
 (* placeholder for vector operations added in 2.0, to be filled in a future update
 https://webassembly.github.io/spec/core/binary/instructions.html#vector-instructions
 *)
-Definition binary_of_unop_vec (op: unop_vec) :=
+Definition binary_of_vunop (sh: vshape) (op: vunop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_binop_vec (op: binop_vec) :=
+Definition binary_of_vbinop (sh: vshape) (op: vbinop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_ternop_vec (op: ternop_vec) :=
+Definition binary_of_vternop (sh: vshape) (op: vternop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_test_vec (op: test_vec) :=
+Definition binary_of_vtestop (sh: vshape) (op: vtestop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_shift_vec (op: shift_vec) :=
+Definition binary_of_vshiftop (sh: vshape) (op: vshiftop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_splat_vec (sh: shape_vec) :=
+Definition binary_of_splat_vec (sh: vshape) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_extract_vec (sh: shape_vec) (s: option sx) (x: laneidx) :=
+Definition binary_of_extract_vec (sh: vshape) (s: option sx) (x: laneidx) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_replace_vec (sh: shape_vec) (x: laneidx) :=
+Definition binary_of_replace_vec (sh: vshape) (x: laneidx) :=
   xfd :: x0c :: (List.repeat x00 16).
   
 Definition binary_of_load_vec (lvarg: load_vec_arg) (marg: memarg) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_load_vec_lane (w: width_vec) (marg: memarg) (x: laneidx) :=
+Definition binary_of_load_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   xfd :: x0c :: (List.repeat x00 16).
 
 (* store_vec_lane and load_vec uses the same args. Maybe it's better to find a new name *)
-Definition binary_of_store_vec_lane (w: width_vec) (marg: memarg) (x: laneidx) :=
+Definition binary_of_store_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   xfd :: x0c :: (List.repeat x00 16).
 
 Fixpoint binary_of_be (be : basic_instruction) : list byte :=
@@ -394,11 +394,11 @@ Fixpoint binary_of_be (be : basic_instruction) : list byte :=
 
   | BI_const_vec v => binary_of_valvec v
 
-  | BI_unop_vec op => binary_of_unop_vec op
-  | BI_binop_vec op => binary_of_binop_vec op
-  | BI_ternop_vec op => binary_of_ternop_vec op
-  | BI_test_vec op => binary_of_test_vec op
-  | BI_shift_vec op => binary_of_shift_vec op
+  | BI_vunop sh op => binary_of_vunop sh op
+  | BI_vbinop sh op => binary_of_vbinop sh op
+  | BI_vternop sh op => binary_of_vternop sh op
+  | BI_vtestop sh op => binary_of_vtestop sh op
+  | BI_vshiftop sh op => binary_of_vshiftop sh op
   | BI_splat_vec sh => binary_of_splat_vec sh
   | BI_extract_vec sh s lanex => binary_of_extract_vec sh s lanex
   | BI_replace_vec sh lanex => binary_of_replace_vec sh lanex
diff --git a/theories/datatypes.v b/theories/datatypes.v
index aef8eb33..a7fa4cee 100644
--- a/theories/datatypes.v
+++ b/theories/datatypes.v
@@ -622,11 +622,11 @@ Extract lanes: consume a v128 operand and return the numeric value in a given la
 Replace lanes: consume a v128 operand and a numeric value for a given lane, and produce a v128 result.
 **)
   | BI_const_vec : value_vec -> basic_instruction
-  | BI_unop_vec: vshape -> vunop -> basic_instruction
-  | BI_binop_vec: vshape -> vbinop -> basic_instruction
-  | BI_ternop_vec: vshape -> vternop -> basic_instruction
-  | BI_test_vec: vshape -> vtestop -> basic_instruction
-  | BI_shift_vec: vshape -> vshiftop -> basic_instruction
+  | BI_vunop: vshape -> vunop -> basic_instruction
+  | BI_vbinop: vshape -> vbinop -> basic_instruction
+  | BI_vternop: vshape -> vternop -> basic_instruction
+  | BI_vtestop: vshape -> vtestop -> basic_instruction
+  | BI_vshiftop: vshape -> vshiftop -> basic_instruction
   | BI_splat_vec: vshape -> basic_instruction
   | BI_extract_vec: vshape -> option sx -> laneidx -> basic_instruction (* sx only available for i8/i16 *)
   | BI_replace_vec: vshape -> laneidx -> basic_instruction
diff --git a/theories/interpreter_ctx.v b/theories/interpreter_ctx.v
index ab5f2777..3386fedb 100644
--- a/theories/interpreter_ctx.v
+++ b/theories/interpreter_ctx.v
@@ -534,14 +534,14 @@ Proof.
       (* BI_relop t op *) t op |
       (* BI_cvtop t2 cvtop t1 sx *) t2 cvtop t1 sx |
       (* BI_const_vec _ *) |
-      (* BI_unop_vec _ *) op |
-      (* BI_binop_vec _ *) op |
-      (* BI_ternop_vec _ *) op |
-      (* BI_test_vec _ *) tv |
-      (* BI_shift_vec _ *) sv |
-      (* BI_splat_vec _ *) shape |
-      (* BI_extract_vec shape_vec [Some sx | None] laneidx *) shape sx x |
-      (* BI_replace_vec shape_vec laneidx *) shape x |
+      (* BI_vunop sh op *) sh op |
+      (* BI_vbinop sh op *) sh op |
+      (* BI_vternop sh op *) sh op |
+      (* BI_vtestop sh op *) sh op |
+      (* BI_vshiftop sh op*) sh op |
+      (* BI_splat_vec sh *) sh |
+      (* BI_extract_vec shape_vec [Some sx | None] laneidx *) sh sx x |
+      (* BI_replace_vec shape_vec laneidx *) sh x |
       (* BI_ref_null t *) t |
       (* BI_ref_is_null *) |
       (* BI_ref_func x *) x |
@@ -705,58 +705,58 @@ the condition that all values should live in the operand stack. *)
     (* AI_basic BI_const_vec v *)
     - apply RSC_invalid => /=; by move => [??].
       
-    (* AI_basic BI_unop_vec op *)
+    (* AI_basic BI_vunop sh op *)
     - destruct vs0 as [|v vs0]; first by no_args.
       assert_value_vec v.
-      apply <<hs, (s, (fc, lcs) :: ccs', (VAL_vec (app_unop_vec op v) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc, lcs) :: ccs', (VAL_vec (app_vunop sh op v) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
-      by apply r_simple, rs_unop_vec.
+      by apply r_simple, rs_vunop.
       
-    (* AI_basic BI_binop_vec op *)
+    (* AI_basic BI_vbinop sh op *)
     - destruct vs0 as [|v2 [|v1 vs0]]; try by no_args.
       assert_value_vec v1.
       assert_value_vec v2.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_binop_vec op v1 v2) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vbinop sh op v1 v2) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
-      by apply r_simple, rs_binop_vec.
+      by apply r_simple, rs_vbinop.
       
-    (* AI_basic BI_ternop_vec op *)
+    (* AI_basic BI_vternop sh op *)
     - destruct vs0 as [|v3 [|v2 [|v1 vs0]]]; try by no_args.
       assert_value_vec v1.
       assert_value_vec v2.
       assert_value_vec v3.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_ternop_vec op v1 v2 v3) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vternop sh op v1 v2 v3) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
-      by apply r_simple, rs_ternop_vec.
+      by apply r_simple, rs_vternop.
       
-    (* AI_basic BI_test_vec tv *)
+    (* AI_basic BI_vtestop sh tv *)
     - destruct vs0 as [|v vs0]; first by no_args.
       assert_value_vec v.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (VAL_int32 (wasm_bool (app_test_vec tv v))) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (VAL_int32 (wasm_bool (app_vtestop sh op v))) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
-      by apply r_simple, rs_test_vec.
+      by apply r_simple, rs_vtestop.
       
-    (* AI_basic BI_shift_vec sv *)
+    (* AI_basic BI_vshiftop sh sv *)
     - destruct vs0 as [|v2 [|v1 vs0]]; try by no_args.
       assert_value_vec v1.
       assert_i32 v2.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_shift_vec sv v1 v2) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vshiftop sh op v1 v2) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
-      by apply r_simple, rs_shift_vec.
+      by apply r_simple, rs_vshiftop.
       
     (* AI_basic BI_splat_vec shape *)
     - destruct vs0 as [|v vs0]; first by no_args.
       assert_value_num v.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_splat_vec shape v) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_splat_vec sh v) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
       by apply r_simple, rs_splat_vec.
       
     (* AI_basic (BI_extract_vec shape sx x) *)
     - destruct vs0 as [|v vs0]; first by no_args.
       assert_value_vec v.
-      destruct (N.ltb x (shape_dim shape)) eqn:Hlanebound.
+      destruct (N.ltb x (shape_dim sh)) eqn:Hlanebound.
       (* in bound *)
-      + apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (app_extract_vec shape sx x v) :: vs0, es0), None), d>> => //.
+      + apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (app_extract_vec sh sx x v) :: vs0, es0), None), d>> => //.
         resolve_reduce_ctx vs0 es0.
         by apply r_simple, rs_extract_vec.
       (* out of bound *)
@@ -766,9 +766,9 @@ the condition that all values should live in the operand stack. *)
     - destruct vs0 as [|v2 [|v1 vs0]]; try by no_args.
       assert_value_vec v1.
       assert_value_num v2.
-      destruct (N.ltb x (shape_dim shape)) eqn:Hlanebound.
+      destruct (N.ltb x (shape_dim sh)) eqn:Hlanebound.
       (* in bound *)
-      + apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_replace_vec shape x v1 v2) :: vs0, es0), None), d>> => //.
+      + apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_replace_vec sh x v1 v2) :: vs0, es0), None), d>> => //.
         resolve_reduce_ctx vs0 es0.
         by apply r_simple, rs_replace_vec.
       (* out of bound *)
diff --git a/theories/operations.v b/theories/operations.v
index 9b8a40bb..e5ce9591 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -91,20 +91,20 @@ Definition mem_grow (m : meminst) (len_delta : N) : option meminst :=
     end
   else None.
 
-Definition ptv_to_nm (ptv: packed_type_vec) : N * N :=
+Definition ptv_to_nm (ptv: vpacked_type) : N * N :=
   match ptv with
   | Tptv_8_8 => (8%N, 8%N)
   | Tptv_16_4 => (16%N, 4%N)
   | Tptv_32_2 => (32%N, 2%N)
   end.
 
-Definition ztv_to_n (ztv: zero_type_vec) : N :=
+Definition ztv_to_n (ztv: vzero_type) : N :=
   match ztv with
   | Tztv_32 => 32%N
   | Tztv_64 => 64%N
   end.
 
-Definition width_to_n (ww: width_vec) : N :=
+Definition width_to_n (ww: vwidth) : N :=
   match ww with
   | Twv_8 => 8%N
   | Twv_16 => 16%N
@@ -123,7 +123,7 @@ Definition load_vec_bounds (lv_arg: load_vec_arg) (m_arg: memarg) : bool :=
       N.leb (N.pow 2 m_arg.(memarg_align)) (N.div (width_to_n width) 8)
   end.
 
-Definition load_vec_lane_bounds (width: width_vec) (m_arg: memarg) (x: laneidx) : bool :=
+Definition load_vec_lane_bounds (width: vwidth) (m_arg: memarg) (x: laneidx) : bool :=
    (N.leb (N.pow 2 (memarg_align m_arg)) (width_to_n width / 8))%N &&
                          (N.ltb x (128 / width_to_n width)%N). 
 
@@ -155,7 +155,7 @@ Definition load_vec (m: meminst) (i: N) (lvarg: load_vec_arg) (marg: memarg) : o
   | _ => None
   end.
 
-Definition load_vec_lane (m: meminst) (i: N) (v: value_vec) (width: width_vec) (marg: memarg) (x: laneidx) : option value_vec :=
+Definition load_vec_lane (m: meminst) (i: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option value_vec :=
   match (i == marg.(memarg_offset)) with
   | true => Some v
   | _ => None
@@ -170,7 +170,7 @@ Definition store (m : meminst) (n : N) (off : static_offset) (bs : bytes) (l : n
 Might need a change in the future if this behaviour changes. *)
 Definition store_packed := store.
 
-Definition store_vec_lane (m: meminst) (n: N) (v: value_vec) (width: width_vec) (marg: memarg) (x: laneidx) : option meminst :=
+Definition store_vec_lane (m: meminst) (n: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option meminst :=
   match (n == marg.(memarg_offset)) with
   | true => Some m
   | _ => None
@@ -542,54 +542,54 @@ Definition app_relop (op: relop) (v1: value_num) (v2: value_num) :=
     end
   end.
 
-Definition app_unop_vec (op: unop_vec) (v1: value_vec) : value_vec :=
+Definition app_vunop (sh: vshape) (op: vunop) (v1: value_vec) : value_vec :=
   v1.
 
-Definition app_binop_vec (op: binop_vec) (v1 v2: value_vec) : value_vec :=
+Definition app_vbinop (sh: vshape) (op: vbinop) (v1 v2: value_vec) : value_vec :=
   v1.
 
-Definition app_ternop_vec (op: ternop_vec) (v1 v2 v3: value_vec) : value_vec :=
+Definition app_vternop (sh: vshape) (op: vternop) (v1 v2 v3: value_vec) : value_vec :=
   v1.
 
-Definition app_test_vec (op: test_vec) (v1: value_vec) : bool :=
+Definition app_vtestop (sh: vshape) (op: vtestop) (v1: value_vec) : bool :=
   true.
 
-Definition app_shift_vec (op: shift_vec) (v1: value_vec) (v2: i32) : value_vec :=
+Definition app_vshiftop (sh: vshape) (op: vshiftop) (v1: value_vec) (v2: i32) : value_vec :=
   v1.
 
-Definition app_splat_vec (shape: shape_vec) (v1: value_num) : value_vec :=
+Definition app_splat_vec (sh: vshape) (v1: value_num) : value_vec :=
   VAL_vec128 tt.
 
-Definition app_extract_vec (shape: shape_vec) (s: option sx) (n: laneidx) (v1: value_vec) : value_num :=
-  match shape with
-  | SV_ishape svi =>
-      match svi with
-      | SVI_64_2 => bitzero T_i64
+Definition app_extract_vec (sh: vshape) (s: option sx) (n: laneidx) (v1: value_vec) : value_num :=
+  match sh with
+  | VS_i vsi =>
+      match vsi with
+      | VSI_64_2 => bitzero T_i64
       | _ => bitzero T_i32
       end
-  | SV_fshape svf =>
-      match svf with
-      | SVF_32_4 => bitzero T_f32
-      | SVF_64_2 => bitzero T_f64
+  | VS_f vsf =>
+      match vsf with
+      | VSF_32_4 => bitzero T_f32
+      | VSF_64_2 => bitzero T_f64
       end
   end.
 
-Definition app_replace_vec (shape: shape_vec) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
+Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
   v1.
 
-Definition shape_dim (shape: shape_vec) : N :=
-  match shape with
-  | SV_ishape svi =>
-      match svi with
-      | SVI_8_16 => 16
-      | SVI_16_8 => 8
-      | SVI_32_4 => 4
-      | SVI_64_2 => 2
+Definition shape_dim (sh: vshape) : N :=
+  match sh with
+  | VS_i vsi =>
+      match vsi with
+      | VSI_8_16 => 16
+      | VSI_16_8 => 8
+      | VSI_32_4 => 4
+      | VSI_64_2 => 2
       end
-  | SV_fshape svf =>
-      match svf with
-      | SVF_32_4 => 4
-      | SVF_64_2 => 2
+  | VS_f vsf =>
+      match vsf with
+      | VSF_32_4 => 4
+      | VSF_64_2 => 2
       end
   end.
 
@@ -773,7 +773,7 @@ Definition smem_store_packed (s: store_record) (inst: moduleinst) (n: N) (off: s
   | None => None
   end.
 
-Definition smem_store_vec_lane (s: store_record) (inst: moduleinst) (n: N) (v: value_vec) (width: width_vec) (marg: memarg) (x: laneidx) : option store_record :=
+Definition smem_store_vec_lane (s: store_record) (inst: moduleinst) (n: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option store_record :=
   match lookup_N inst.(inst_mems) 0%N with
   | Some addr =>
       match lookup_N s.(s_mems) addr with
diff --git a/theories/opsem.v b/theories/opsem.v
index 959cc258..4d8468b0 100644
--- a/theories/opsem.v
+++ b/theories/opsem.v
@@ -59,32 +59,32 @@ Inductive reduce_simple : seq administrative_instruction -> seq administrative_i
     reduce_simple [::$VN v; AI_basic (BI_cvtop t2 op t1 sx)] [::AI_trap]
 
   (** vector instructions **)
-  | rs_unop_vec: 
-    forall v op,
-    reduce_simple [:: $VV v; AI_basic (BI_unop_vec op)] [::$VV (app_unop_vec op v)]
-  | rs_binop_vec: 
-    forall v1 v2 op,
-    reduce_simple [:: $VV v1; $VV v2; AI_basic (BI_binop_vec op)] [::$VV (app_binop_vec op v1 v2)]
-  | rs_ternop_vec: 
-    forall v1 v2 v3 op,
-    reduce_simple [:: $VV v1; $VV v2; $VV v3; AI_basic (BI_ternop_vec op)] [::$VV (app_ternop_vec op v1 v2 v3)]
-  | rs_test_vec: 
-    forall v1 op,
-    reduce_simple [:: $VV v1; AI_basic (BI_test_vec op)] [::$VN (VAL_int32 (wasm_bool (app_test_vec op v1)))]
-  | rs_shift_vec: 
-    forall v1 v2 op,
-    reduce_simple [:: $VV v1; $VN (VAL_int32 v2); AI_basic (BI_shift_vec op)] [::$VV app_shift_vec op v1 v2]
+  | rs_vunop: 
+    forall v sh op,
+    reduce_simple [:: $VV v; AI_basic (BI_vunop sh op)] [::$VV (app_vunop sh op v)]
+  | rs_vbinop: 
+    forall v1 v2 sh op,
+    reduce_simple [:: $VV v1; $VV v2; AI_basic (BI_vbinop sh op)] [::$VV (app_vbinop sh op v1 v2)]
+  | rs_vternop: 
+    forall v1 v2 v3 sh op,
+    reduce_simple [:: $VV v1; $VV v2; $VV v3; AI_basic (BI_vternop sh op)] [::$VV (app_vternop sh op v1 v2 v3)]
+  | rs_vtestop: 
+    forall v1 sh op,
+    reduce_simple [:: $VV v1; AI_basic (BI_vtestop sh op)] [::$VN (VAL_int32 (wasm_bool (app_vtestop sh op v1)))]
+  | rs_vshiftop: 
+    forall v1 v2 sh op,
+    reduce_simple [:: $VV v1; $VN (VAL_int32 v2); AI_basic (BI_vshiftop sh op)] [::$VV app_vshiftop sh op v1 v2]
   | rs_splat_vec: 
-    forall v1 shape,
-    reduce_simple [:: $VN v1; AI_basic (BI_splat_vec shape)] [::$VV (app_splat_vec shape v1)]
+    forall v1 sh,
+    reduce_simple [:: $VN v1; AI_basic (BI_splat_vec sh)] [::$VV (app_splat_vec sh v1)]
   | rs_extract_vec: 
-    forall v1 shape sx x,
-    N.ltb x (shape_dim shape) = true ->
-    reduce_simple [:: $VV v1; AI_basic (BI_extract_vec shape sx x)] [::$VN (app_extract_vec shape sx x v1)]
+    forall v1 sh sx x,
+    N.ltb x (shape_dim sh) = true ->
+    reduce_simple [:: $VV v1; AI_basic (BI_extract_vec sh sx x)] [::$VN (app_extract_vec sh sx x v1)]
   | rs_replace_vec: 
-    forall v1 v2 shape x,
-    N.ltb x (shape_dim shape) = true ->
-    reduce_simple [:: $VV v1; $VN v2; AI_basic (BI_replace_vec shape x)] [::$VV (app_replace_vec shape x v1 v2)]
+    forall v1 v2 sh x,
+    N.ltb x (shape_dim sh) = true ->
+    reduce_simple [:: $VV v1; $VN v2; AI_basic (BI_replace_vec sh x)] [::$VV (app_replace_vec sh x v1 v2)]
     
   (** reference operations **)
   | rs_ref_is_null_true:
diff --git a/theories/pp.v b/theories/pp.v
index 9d4fc72b..2133e712 100644
--- a/theories/pp.v
+++ b/theories/pp.v
@@ -516,38 +516,38 @@ Definition pp_cvtop (cvt: cvtop) : string :=
 (* placeholder for vector operations added in 2.0, to be filled in a future update
 https://webassembly.github.io/spec/core/binary/instructions.html#vector-instructions
 *)
-Definition pp_unop_vec (op: unop_vec) :=
+Definition pp_vunop (sh: vshape) (op: vunop) :=
   "(not implemented)".
 
-Definition pp_binop_vec (op: binop_vec) :=
+Definition pp_vbinop (sh: vshape) (op: vbinop) :=
   "(not implemented)".
 
-Definition pp_ternop_vec (op: ternop_vec) :=
+Definition pp_vternop (sh: vshape) (op: vternop) :=
   "(not implemented)".
 
-Definition pp_test_vec (op: test_vec) :=
+Definition pp_vtestop (sh: vshape) (op: vtestop) :=
   "(not implemented)".
 
-Definition pp_shift_vec (op: shift_vec) :=
+Definition pp_vshiftop (sh: vshape) (op: vshiftop) :=
   "(not implemented)".
 
-Definition pp_splat_vec (sh: shape_vec) :=
+Definition pp_splat_vec (sh: vshape) :=
   "(not implemented)".
 
-Definition pp_extract_vec (sh: shape_vec) (s: option sx) (x: laneidx) :=
+Definition pp_extract_vec (sh: vshape) (s: option sx) (x: laneidx) :=
   "(not implemented)".
 
-Definition pp_replace_vec (sh: shape_vec) (x: laneidx) :=
+Definition pp_replace_vec (sh: vshape) (x: laneidx) :=
   "(not implemented)".
   
 Definition pp_load_vec (lvarg: load_vec_arg) (marg: memarg) :=
   "(not implemented)".
 
-Definition pp_load_vec_lane (w: width_vec) (marg: memarg) (x: laneidx) :=
+Definition pp_load_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   "(not implemented)".
 
 (* store_vec_lane and load_vec uses the same args. Maybe it's better to find a new name *)
-Definition pp_store_vec_lane (w: width_vec) (marg: memarg) (x: laneidx) :=
+Definition pp_store_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   "(not implemented)".
 
 Fixpoint pp_basic_instruction (i : indentation) (be : basic_instruction) : string :=
@@ -668,16 +668,16 @@ Fixpoint pp_basic_instruction (i : indentation) (be : basic_instruction) : strin
       indent i (pp_number_type vt1 ++ "." ++ pp_cvtop cvtop ++ "_" ++ pp_number_type vt2 ++ pp_sx_o sxo ++ newline)
 
   (* vector instructions currently unimplemented *)
-  | BI_unop_vec op =>
-      indent i (pp_unop_vec op)
-  | BI_binop_vec op =>
-      indent i (pp_binop_vec op)
-  | BI_ternop_vec op =>
-      indent i (pp_ternop_vec op)
-  | BI_test_vec op =>
-      indent i (pp_test_vec op)
-  | BI_shift_vec op =>
-      indent i (pp_shift_vec op)
+  | BI_vunop sh op =>
+      indent i (pp_vunop sh op)
+  | BI_vbinop sh op =>
+      indent i (pp_vbinop sh op)
+  | BI_vternop sh op =>
+      indent i (pp_vternop sh op)
+  | BI_vtestop sh op =>
+      indent i (pp_vtestop sh op)
+  | BI_vshiftop sh op =>
+      indent i (pp_vshiftop sh op)
   | BI_splat_vec sh =>
       indent i (pp_splat_vec sh)
   | BI_extract_vec sh s lanex =>
diff --git a/theories/type_checker.v b/theories/type_checker.v
index c5d87eaa..257562ca 100644
--- a/theories/type_checker.v
+++ b/theories/type_checker.v
@@ -190,15 +190,15 @@ Fixpoint check_single (C : t_context) (ct : option checker_type) (be : basic_ins
           if cvtop_valid t2 op t1 sx
           then type_update ts [::(T_num t1)] [::(T_num t2)]
           else None
-      | BI_unop_vec op =>
+      | BI_vunop sh op =>
           type_update ts [::T_vec T_v128] [::T_vec T_v128]
-      | BI_binop_vec op =>
+      | BI_vbinop sh op =>
           type_update ts [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128]
-      | BI_ternop_vec op =>
+      | BI_vternop sh op =>
           type_update ts [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128]
-      | BI_test_vec tv =>
+      | BI_vtestop sh tv =>
           type_update ts [::T_vec T_v128] [::T_num T_i32]
-      | BI_shift_vec sv =>
+      | BI_vshiftop sh sv =>
           type_update ts [::T_num T_i32; T_vec T_v128] [::T_vec T_v128]
       | BI_splat_vec shape =>
           type_update ts [::T_num (typeof_shape_unpacked shape)] [::T_vec T_v128]
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index 97a0fe6d..e9b5daac 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -75,27 +75,27 @@ Proof.
   - (* Cvtop *)
     erewrite eval_cvt_type_preserve; eauto.
     by resolve_subtyping.
-  - (* Unop_vec *)
-    replace (typeof_vec (app_unop_vec op v)) with (typeof_vec v); last by destruct op, v.
+  - (* vunop *)
+    replace (typeof_vec (app_vunop sh op v)) with (typeof_vec v); last by destruct op, v.
     rewrite H0.
     by resolve_subtyping.
-  - (* Binop_vec *)
-    replace (typeof_vec (app_binop_vec op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
+  - (* vbinop *)
+    replace (typeof_vec (app_vbinop sh op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
     rewrite H0.
     by resolve_subtyping.
-  - (* Ternop_vec *)
-    replace (typeof_vec (app_ternop_vec op v1 v2 v3)) with (typeof_vec v1); last by destruct op, v1, v2, v3.
+  - (* vternop *)
+    replace (typeof_vec (app_vternop sh op v1 v2 v3)) with (typeof_vec v1); last by destruct op, v1, v2, v3.
     rewrite H0.
     by resolve_subtyping.
-  - (* Shift_vec *)
-    replace (typeof_vec (app_shift_vec op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
+  - (* vshiftop *)
+    replace (typeof_vec (app_vshiftop sh op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
     rewrite H0.
     by resolve_subtyping.
   - (* Extract_vec *)
-    replace (typeof_num (app_extract_vec shape sx x v1)) with (typeof_shape_unpacked shape); last by destruct shape as [shape | shape]; destruct shape.
+    replace (typeof_num (app_extract_vec sh sx x v1)) with (typeof_shape_unpacked sh); last by destruct sh as [[] | []].
     by resolve_subtyping.
   - (* Replace_vec *)
-    replace (typeof_vec (app_replace_vec shape x v1 v2)) with (typeof_vec v1); last by destruct shape as [shape | shape]; destruct shape.
+    replace (typeof_vec (app_replace_vec sh x v1 v2)) with (typeof_vec v1); last by destruct sh as [[] | []].
     rewrite H1.
     by resolve_subtyping.
   - (* Br *)
diff --git a/theories/typing.v b/theories/typing.v
index 4d497451..0ad1e7b6 100644
--- a/theories/typing.v
+++ b/theories/typing.v
@@ -79,19 +79,19 @@ Definition value_typing (s: store_record) (v: value) (t: value_type) : bool :=
 Definition values_typing (s: store_record) (vs: list value) (tf: list value_type) : bool :=
   all2 (value_typing s) vs tf.
 
-Definition typeof_shape_unpacked (shape: shape_vec) : number_type :=
+Definition typeof_shape_unpacked (shape: vshape) : number_type :=
   match shape with
-  | SV_ishape svi =>
-      match svi with
-      | SVI_8_16 => T_i32
-      | SVI_16_8 => T_i32
-      | SVI_32_4 => T_i32
-      | SVI_64_2 => T_i64
+  | VS_i vsi =>
+      match vsi with
+      | VSI_8_16 => T_i32
+      | VSI_16_8 => T_i32
+      | VSI_32_4 => T_i32
+      | VSI_64_2 => T_i64
       end
-  | SV_fshape svf =>
-      match svf with
-      | SVF_32_4 => T_f32
-      | SVF_64_2 => T_f64
+  | VS_f vsf =>
+      match vsf with
+      | VSF_32_4 => T_f32
+      | VSF_64_2 => T_f64
       end
   end.
   
@@ -221,21 +221,21 @@ Inductive be_typing : t_context -> seq basic_instruction -> instr_type -> Prop :
 | bet_cvtop : forall C op t1 t2 sx,
     cvtop_valid t2 op t1 sx ->
     be_typing C [::BI_cvtop t2 op t1 sx] (Tf [::T_num t1] [::T_num t2])
-| bet_unop_vec: forall C op,
-    be_typing C [::BI_unop_vec op] (Tf [::T_vec T_v128] [::T_vec T_v128])
-| bet_binop_vec: forall C op,
-    be_typing C [::BI_binop_vec op] (Tf [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-| bet_ternop_vec: forall C op,
-    be_typing C [::BI_ternop_vec op] (Tf [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-| bet_test_vec: forall C op,
-    be_typing C [::BI_test_vec op] (Tf [::T_vec T_v128] [::T_num T_i32])
-| bet_shift_vec: forall C op,
-    be_typing C [::BI_shift_vec op] (Tf [::T_vec T_v128; T_num T_i32] [::T_vec T_v128])
-| bet_splat_vec: forall C shape,
-    be_typing C [::BI_splat_vec shape] (Tf [::T_num (typeof_shape_unpacked shape)] [::T_vec T_v128])
-| bet_extract_vec: forall C shape sx x,
-    N.ltb x (shape_dim shape) = true ->
-    be_typing C [::BI_extract_vec shape sx x] (Tf [::T_vec T_v128] [::T_num (typeof_shape_unpacked shape)])
+| bet_vunop: forall C sh op,
+    be_typing C [::BI_vunop sh op] (Tf [::T_vec T_v128] [::T_vec T_v128])
+| bet_vbinop: forall C sh op,
+    be_typing C [::BI_vbinop sh op] (Tf [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
+| bet_vternop: forall C sh op,
+    be_typing C [::BI_vternop sh op] (Tf [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
+| bet_vtestop: forall C sh op,
+    be_typing C [::BI_vtestop sh op] (Tf [::T_vec T_v128] [::T_num T_i32])
+| bet_vshiftop: forall C sh op,
+    be_typing C [::BI_vshiftop sh op] (Tf [::T_vec T_v128; T_num T_i32] [::T_vec T_v128])
+| bet_splat_vec: forall C sh,
+    be_typing C [::BI_splat_vec sh] (Tf [::T_num (typeof_shape_unpacked sh)] [::T_vec T_v128])
+| bet_extract_vec: forall C sh sx x,
+    N.ltb x (shape_dim sh) = true ->
+    be_typing C [::BI_extract_vec sh sx x] (Tf [::T_vec T_v128] [::T_num (typeof_shape_unpacked sh)])
 | bet_replace_vec: forall C shape x,
     N.ltb x (shape_dim shape) = true ->
     be_typing C [::BI_replace_vec shape x] (Tf [::T_vec T_v128; T_num (typeof_shape_unpacked shape)] [::T_vec T_v128])
diff --git a/theories/typing_inversion.v b/theories/typing_inversion.v
index ba71830b..998965cc 100644
--- a/theories/typing_inversion.v
+++ b/theories/typing_inversion.v
@@ -96,15 +96,15 @@ Definition be_principal_typing (C: t_context) (be: basic_instruction) (tf: instr
   | BI_cvtop t2 op t1 sx =>
       tf = (Tf [::T_num t1] [::T_num t2]) /\
         cvtop_valid t2 op t1 sx
-  | BI_unop_vec op =>
+  | BI_vunop sh op =>
       tf = (Tf [::T_vec T_v128] [::T_vec T_v128])
-  | BI_binop_vec op =>
+  | BI_vbinop sh op =>
       tf = (Tf [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-  | BI_ternop_vec op =>
+  | BI_vternop sh op =>
       tf = (Tf [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-  | BI_test_vec op =>
+  | BI_vtestop sh op =>
       tf = (Tf [::T_vec T_v128] [::T_num T_i32])
-  | BI_shift_vec op =>
+  | BI_vshiftop sh op =>
       tf = (Tf [::T_vec T_v128; T_num T_i32] [::T_vec T_v128])
   | BI_splat_vec shape =>
       tf = (Tf [::T_num (typeof_shape_unpacked shape)] [::T_vec T_v128])

From 2ea022255db3d42eb753bf2a015ad02f2732baf6 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Tue, 12 Aug 2025 05:40:24 -0700
Subject: [PATCH 03/19] simd parsing, lane extraction and serialisation, and
 wast related implementations; proof updates

---
 src/shim.ml                             |   5 +
 src/shim.mli                            |   2 +
 src/wast_execute.ml                     |  47 ++++++--
 theories/binary_format_parser.v         |  63 ++++++++++
 theories/binary_format_printer.v        |   4 +
 theories/datatypes.v                    |  44 +++----
 theories/extraction_instance.v          |   3 +
 theories/instantiation_properties.v     |   2 +-
 theories/interpreter_ctx.v              |  13 ++
 theories/operations.v                   | 154 +++++++++++++++++++-----
 theories/opsem.v                        |   8 ++
 theories/pp.v                           |  15 ++-
 theories/simd.v                         |  14 ++-
 theories/tactic.v                       |   2 +
 theories/text_format_parser.v           |  73 ++++++++++-
 theories/type_checker.v                 |  12 +-
 theories/type_checker_reflects_typing.v |   2 +
 theories/type_preservation.v            |  53 +++++++-
 theories/typing.v                       |   8 +-
 theories/typing_inversion.v             |   9 +-
 20 files changed, 457 insertions(+), 76 deletions(-)

diff --git a/src/shim.ml b/src/shim.ml
index c657f091..320beb01 100644
--- a/src/shim.ml
+++ b/src/shim.ml
@@ -91,6 +91,8 @@ module type InterpreterType = sig
 
   val is_arithmetic_nan: Extract.number_type -> value -> bool
 
+  val v128_extract_lanes: Extract.vshape -> Extract.SIMD.v128 -> Extract.value_num list
+
 end
 
 module Interpreter =
@@ -183,4 +185,7 @@ functor (EH : Host) -> struct
 
   let is_arithmetic_nan =
     Extraction_instance.is_arithmetic_nan
+
+  let v128_extract_lanes sh v = 
+    Extraction_instance.v128_extract_lanes sh v
 end
diff --git a/src/shim.mli b/src/shim.mli
index c7e6a33e..32ede3d9 100644
--- a/src/shim.mli
+++ b/src/shim.mli
@@ -103,6 +103,8 @@ module type InterpreterType = sig
 
   val is_arithmetic_nan: Extract.number_type -> value -> bool
 
+  val v128_extract_lanes: Extract.vshape -> Extract.SIMD.v128 -> Extract.value_num list
+
 end
 
 module Interpreter : functor (EH : Host) ->
diff --git a/src/wast_execute.ml b/src/wast_execute.ml
index 9e83a4da..b5946e22 100644
--- a/src/wast_execute.ml
+++ b/src/wast_execute.ml
@@ -46,7 +46,11 @@ let wasm_val_to_string wval =
   match wval.it with
   | Num num -> Some (wasm_num_to_hexstring num)
   | Ref ref -> Some (wasm_ref_to_string ref)
-  | _-> Some (string_of_value wval.it)
+  | Vec vec -> 
+    (* Wasm ref interpreter always prints v128 as I32x4. *)
+    let vec_string = "v128.const i32x4 " ^ string_of_value wval.it in
+    (* Printf.printf "%s" vec_string; *)
+    Some vec_string
 
 let wasm_numpat_to_string numpat =
   match numpat with
@@ -58,11 +62,24 @@ let wasm_refpat_to_string refpat =
   | RefPat ref -> string_of_ref ref.it
   | _ -> "Unsupported refpat printout"
 
+let wast_shape_to_coq sh = 
+  match sh with
+  | Wasm.V128.I8x16 _ -> Extract.VS_i (Extract.VSI_8_16)
+  | Wasm.V128.I16x8 _ -> Extract.VS_i (Extract.VSI_16_8)
+  | Wasm.V128.I32x4 _ -> Extract.VS_i (Extract.VSI_32_4)
+  | Wasm.V128.I64x2 _ -> Extract.VS_i (Extract.VSI_64_2)
+  | Wasm.V128.F32x4 _ -> Extract.VS_f (Extract.VSF_32_4)
+  | Wasm.V128.F64x2 _ -> Extract.VS_f (Extract.VSF_64_2)
+
+let wasm_vecpat_to_string vecpat = 
+  match vecpat with
+  | VecPat (V128 (_sh, numpats)) -> "\x1b[36mv128\x1b[0m: [" ^ String.concat "; " (List.map wasm_numpat_to_string numpats) ^ "]"
+
 let wasm_result_to_string ret = 
   match ret.it with
   | NumResult numpat -> wasm_numpat_to_string numpat
   | RefResult refpat -> wasm_refpat_to_string refpat
-  | _ -> "Unsupported result type: vec"
+  | VecResult vecpat -> wasm_vecpat_to_string vecpat
 
 let wasm_results_to_string rets = 
   String.concat "; " (List.map wasm_result_to_string rets)
@@ -98,7 +115,7 @@ let wasm_assert_nanpat ret nanop =
 
 let wasm_assert_numpat ret numpat = 
   match numpat with
-  | NumPat num -> (wasm_num_to_coq num.it = Some ret)
+  | NumPat num -> wasm_num_to_coq num.it = Some ret
   | NanPat nanop -> wasm_assert_nanpat ret nanop
 
 let wasm_assert_refpat ret refpat =
@@ -107,14 +124,26 @@ let wasm_assert_refpat ret refpat =
   | RefTypePat Wasm.Types.FuncRefType -> Extract.Extraction_instance.is_funcref ret
   | RefTypePat Wasm.Types.ExternRefType -> Extract.Extraction_instance.is_externref ret
 
-let wasm_assert_ret ret ret_exp = 
+let wasm_assert_vecpat verbosity ret vecpat =
+  match vecpat, ret with
+  | VecPat (V128 (sh, numpats)), Extract.VAL_vec vret -> 
+    let coq_sh = wast_shape_to_coq sh in 
+    let coq_vals = List.map (fun v -> Extract.VAL_num v) (Interpreter.v128_extract_lanes coq_sh vret) in
+    debug_info verbosity intermediate (fun _ -> "The returned v128 extracts to: [" ^ (pp_values coq_vals) ^ "].\n");
+    if List.length coq_vals = List.length numpats then 
+      List.for_all2 wasm_assert_numpat coq_vals numpats
+    else 
+      false
+  | _, _-> assert false
+
+let wasm_assert_ret verbosity ret ret_exp = 
   match ret_exp.it with
   | NumResult numpat -> wasm_assert_numpat ret numpat
   | RefResult refpat -> wasm_assert_refpat ret refpat
-  | _ -> false
+  | VecResult vecpat -> wasm_assert_vecpat verbosity ret vecpat
 
-let wasm_assert_rets rets ret_exps = 
-  let assert_results = List.map2 wasm_assert_ret rets ret_exps in
+let wasm_assert_rets verbosity rets ret_exps = 
+  let assert_results = List.map2 (fun x y -> wasm_assert_ret verbosity x y) rets ret_exps in
   List.fold_left (fun a b -> a && b) true assert_results
 
 let load_wast_module verbosity hs s ovar moddef mod_counter =
@@ -231,7 +260,7 @@ let run_wast_command verbosity max_call_depth cmd hs s mod_counter default_modul
           debug_info verbosity stage (fun _ -> "Returned: ");
           debug_info verbosity stage (fun _ -> Execute.print_invoke_result verbosity res; "");
           debug_info verbosity stage (fun _ -> "Expected: " ^ wasm_results_to_string expect_rets ^ "\n");
-          let assert_result = wasm_assert_rets vs expect_rets in
+          let assert_result = wasm_assert_rets verbosity vs expect_rets in
             if assert_result then
               (debug_info verbosity stage (fun _ -> "Test passed: result matches asserted value\n");
               pure (hs, s', mod_counter, default_module_name, Verdict_OK))
@@ -242,7 +271,7 @@ let run_wast_command verbosity max_call_depth cmd hs s mod_counter default_modul
         end
       | Get (ovar, extname_utf8) ->
         let* res_v = run_get (ovar, extname_utf8) hs s default_module_name in
-        let assert_result = wasm_assert_rets [res_v] expect_rets in
+        let assert_result = wasm_assert_rets verbosity [res_v] expect_rets in
           if assert_result then
             (debug_info verbosity stage (fun _ -> "Test passed: result matches asserted value\n");
               pure (hs, s, mod_counter, default_module_name, Verdict_OK))
diff --git a/theories/binary_format_parser.v b/theories/binary_format_parser.v
index b05f158a..66841be8 100644
--- a/theories/binary_format_parser.v
+++ b/theories/binary_format_parser.v
@@ -45,6 +45,12 @@ Definition exact_byte (b : byte) {n} : byte_parser byte n :=
       else None)
     anyTok.
 
+Definition parse_u8_as_N {n} : byte_parser N n :=
+  guardM
+    (fun n => if (N.leb n 255%N) then Some n
+           else None)
+    (assert_sized (extract parse_unsigned n) (fun sz => N.leb sz 2)).
+
 Definition parse_u32_as_N {n} : byte_parser N n :=
   guardM
     (fun n => if (N.leb n 4294967295%N) then Some n
@@ -98,6 +104,9 @@ Definition parse_vec {B} {n} (f : byte_parser B n) : byte_parser (list B) n :=
   exact_byte x00 $> nil <|>
   (parse_vec_length >>= (fun k => parse_vec_aux f k)).
 
+Definition parse_16bytes {n} : byte_parser SIMD.v128 n :=
+  (fun bs => List.map compcert_byte_of_byte bs) <$> parse_vec_aux anyTok 16.
+
 Definition parse_f32 {n} : byte_parser Wasm_float.FloatSize32.T n :=
   (fun bs => Floats.Float32.of_bits (Integers.Int.repr (common.Memdata.decode_int (List.map compcert_byte_of_byte bs)))) <$> (parse_vec_aux anyTok 4%N).
 
@@ -132,6 +141,9 @@ Definition parse_elemidx {n} : byte_parser elemidx n :=
 Definition parse_dataidx {n} : byte_parser dataidx n :=
   parse_u32_as_N.
 
+Definition parse_laneidx {n} : byte_parser laneidx n :=
+  parse_u8_as_N.
+
 Definition parse_number_type {n} : byte_parser number_type n :=
   (exact_byte x7f $> T_i32) <|>
   (exact_byte x7e $> T_i64) <|>
@@ -399,6 +411,52 @@ Definition parse_memory_copy {n} : byte_parser basic_instruction n :=
 Definition parse_memory_fill {n} : byte_parser basic_instruction n :=
   exact_byte xfc &> assert_u32 11%N &> ((fun _ => BI_memory_fill) <$> exact_byte x00).
 
+Definition parse_v128_load {n} : byte_parser basic_instruction n :=
+  assert_u32 0%N &> (BI_load_vec (LVA_none) <$> parse_memarg).
+
+Definition parse_v128_load_packed {n} : byte_parser basic_instruction n :=
+  assert_u32 1%N &> (BI_load_vec (LVA_packed (8%N, 8%N) SX_S) <$> parse_memarg) <|>
+  assert_u32 2%N &> (BI_load_vec (LVA_packed (8%N, 8%N) SX_U) <$> parse_memarg) <|>
+  assert_u32 3%N &> (BI_load_vec (LVA_packed (16%N, 4%N) SX_S) <$> parse_memarg) <|>
+  assert_u32 4%N &> (BI_load_vec (LVA_packed (16%N, 4%N) SX_U) <$> parse_memarg) <|>
+  assert_u32 5%N &> (BI_load_vec (LVA_packed (32%N, 2%N) SX_S) <$> parse_memarg) <|>
+  assert_u32 6%N &> (BI_load_vec (LVA_packed (32%N, 2%N) SX_U) <$> parse_memarg).
+
+Definition parse_v128_load_splat {n} : byte_parser basic_instruction n :=
+  assert_u32 7%N &> (BI_load_vec (LVA_splat 8%N) <$> parse_memarg) <|>
+  assert_u32 8%N &> (BI_load_vec (LVA_splat 16%N) <$> parse_memarg) <|>
+  assert_u32 9%N &> (BI_load_vec (LVA_splat 32%N) <$> parse_memarg) <|>
+  assert_u32 10%N &> (BI_load_vec (LVA_splat 64%N) <$> parse_memarg).
+
+Definition parse_v128_load_zero {n} : byte_parser basic_instruction n :=
+  assert_u32 92%N &> (BI_load_vec (LVA_zero 32%N) <$> parse_memarg) <|>
+  assert_u32 93%N &> (BI_load_vec (LVA_zero 64%N) <$> parse_memarg).
+
+Definition parse_v128_store {n} : byte_parser basic_instruction n :=
+  assert_u32 11%N &> (BI_store_vec <$> parse_memarg).
+
+Definition parse_v128_load_lane {n} : byte_parser basic_instruction n :=
+  assert_u32 84%N &> ((BI_load_vec_lane 8%N <$> parse_memarg) <*> parse_laneidx) <|>
+  assert_u32 85%N &> ((BI_load_vec_lane 16%N <$> parse_memarg) <*> parse_laneidx) <|>
+  assert_u32 86%N &> ((BI_load_vec_lane 32%N <$> parse_memarg) <*> parse_laneidx) <|>
+  assert_u32 87%N &> ((BI_load_vec_lane 64%N <$> parse_memarg) <*> parse_laneidx).
+
+Definition parse_v128_store_lane {n} : byte_parser basic_instruction n :=
+  assert_u32 88%N &> ((BI_store_vec_lane 8%N <$> parse_memarg) <*> parse_laneidx) <|>
+  assert_u32 89%N &> ((BI_store_vec_lane 16%N <$> parse_memarg) <*> parse_laneidx) <|>
+  assert_u32 90%N &> ((BI_store_vec_lane 32%N <$> parse_memarg) <*> parse_laneidx) <|>
+  assert_u32 91%N &> ((BI_store_vec_lane 64%N <$> parse_memarg) <*> parse_laneidx).
+             
+Definition parse_vector_memory_instruction {n} : byte_parser basic_instruction n :=
+  exact_byte xfd &>
+  (parse_v128_load <|>
+  parse_v128_load_packed <|>
+  parse_v128_load_splat <|>
+  parse_v128_load_zero <|>
+  parse_v128_store <|>
+  parse_v128_load_lane <|>
+  parse_v128_store_lane).
+
 Definition parse_memory_instruction {n} : byte_parser basic_instruction n :=
   parse_i32_load <|>
   parse_i64_load <|>
@@ -423,6 +481,7 @@ Definition parse_memory_instruction {n} : byte_parser basic_instruction n :=
   parse_i64_store8 <|>
   parse_i64_store16 <|>
   parse_i64_store32 <|>
+  parse_vector_memory_instruction <|>
   parse_memory_size <|>
   parse_memory_grow <|>
   parse_memory_init <|>
@@ -442,12 +501,16 @@ Definition parse_f32_const {n} : be_parser n :=
 Definition parse_f64_const {n} : be_parser n :=
   exact_byte x44 &> ((fun x => BI_const_num (VAL_float64 x)) <$> parse_f64).
 
+Definition parse_v128_const {n} : be_parser n :=
+  exact_byte xfd &> assert_u32 12%N &> ((fun x => BI_const_vec (VAL_vec128 x)) <$> parse_16bytes).
+
 (* :-( *)
 Definition parse_numeric_instruction {n} : be_parser n :=
   parse_i32_const <|>
   parse_i64_const <|>
   parse_f32_const <|>
   parse_f64_const <|>
+  parse_v128_const <|>
   exact_byte x45 $> BI_testop T_i32 TO_eqz <|>
   exact_byte x46 $> BI_relop T_i32 (Relop_i ROI_eq) <|>
   exact_byte x47 $> BI_relop T_i32 (Relop_i ROI_ne) <|>
diff --git a/theories/binary_format_printer.v b/theories/binary_format_printer.v
index 2af736eb..7520d1f2 100644
--- a/theories/binary_format_printer.v
+++ b/theories/binary_format_printer.v
@@ -138,6 +138,9 @@ Definition binary_of_load_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   xfd :: x0c :: (List.repeat x00 16).
 
 (* store_vec_lane and load_vec uses the same args. Maybe it's better to find a new name *)
+Definition binary_of_store_vec (marg: memarg) :=
+  xfd :: x0c :: (List.repeat x00 16).
+
 Definition binary_of_store_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   xfd :: x0c :: (List.repeat x00 16).
 
@@ -405,6 +408,7 @@ Fixpoint binary_of_be (be : basic_instruction) : list byte :=
 
   | BI_load_vec lvarg marg => binary_of_load_vec lvarg marg
   | BI_load_vec_lane width marg lanex => binary_of_load_vec_lane width marg lanex
+  | BI_store_vec marg => binary_of_store_vec marg
   | BI_store_vec_lane width marg lanex => binary_of_store_vec_lane width marg lanex
   end.
 
diff --git a/theories/datatypes.v b/theories/datatypes.v
index a7fa4cee..b0b85283 100644
--- a/theories/datatypes.v
+++ b/theories/datatypes.v
@@ -3,7 +3,7 @@
     and https://webassembly.github.io/spec/core/exec/index.html **)
 (* (C) J. Pichon, M. Bodin - see LICENSE.txt *)
 
-From Wasm Require Export common numerics bytes memory.
+From Wasm Require Export common numerics bytes memory simd.
 From compcert Require common.Memdata.
 Require Import BinNat.
 
@@ -81,10 +81,9 @@ Inductive value_num : Type :=
 
 (* We are not implementing SIMD at the moment. *)
 Inductive value_vec : Set :=
-  | VAL_vec128: unit -> value_vec
+  | VAL_vec128: SIMD.v128 -> value_vec
 .
 
-(* TODO: Unicode support? *)
 Definition name := list Byte.byte.
 
 Section Types.
@@ -289,7 +288,14 @@ End Types.
 
 Definition static_offset := (* off *) u32. 
 
-Definition alignment_exponent := (* a *) u32. 
+Definition alignment_exponent := (* a *) u32.
+
+(* i8 and i16 are for v128 values *)
+Definition serialise_i8 (i : i32) : bytes :=
+  common.Memdata.encode_int 1%nat (numerics.Wasm_int.Int32.unsigned i).
+
+Definition serialise_i16 (i : i32) : bytes :=
+  common.Memdata.encode_int 2%nat (numerics.Wasm_int.Int32.unsigned i).
 
 Definition serialise_i32 (i : i32) : bytes :=
   common.Memdata.encode_int 4%nat (numerics.Wasm_int.Int32.unsigned i).
@@ -568,28 +574,20 @@ Inductive vshiftop : Set :=
   | VS_shr : sx -> vshiftop
   .
 
-Inductive vpacked_type :=
-  | Tptv_8_8
-  | Tptv_16_4
-  | Tptv_32_2
-.
+(* Available shapes: 8x8, 16x4, 32x2 *)
+Definition vpacked_type : Type := N * N.
 
-Inductive vzero_type :=
-  | Tztv_32
-  | Tztv_64
-.
+(* Available widths: 32, 64 *)
+Definition vzero_type := N.
 
-Inductive vwidth :=
-  | Twv_8
-  | Twv_16
-  | Twv_32
-  | Twv_64
-  .
+(* Available widths: 8, 16, 32, 64 *)
+Definition vwidth := N.
 
 Inductive load_vec_arg :=
-  | LVA_packed: vpacked_type -> sx -> load_vec_arg
-  | LVA_zero: vzero_type -> load_vec_arg
-  | LVA_splat: vwidth -> load_vec_arg
+  | LVA_packed: vpacked_type -> sx -> load_vec_arg (* v128.loadnxm_sx *)
+  | LVA_zero: vzero_type -> load_vec_arg (* v128.loadnn_zero *)
+  | LVA_splat: vwidth -> load_vec_arg (* v128.loadww_spat *)
+  | LVA_none (* v128.load *)
   .
 
 Record memarg : Set :=
@@ -668,6 +666,8 @@ Instructions in this group are concerned with linear memory.
   (* the lane version has a different type signature *)
   | BI_load_vec_lane : vwidth -> memarg -> laneidx -> basic_instruction
   | BI_store : number_type -> option packed_type -> memarg -> basic_instruction
+  (* v128.store *)
+  | BI_store_vec : memarg -> basic_instruction
   | BI_store_vec_lane : vwidth -> memarg -> laneidx -> basic_instruction
   | BI_memory_size
   | BI_memory_grow
diff --git a/theories/extraction_instance.v b/theories/extraction_instance.v
index 6ed7b140..8170b15d 100644
--- a/theories/extraction_instance.v
+++ b/theories/extraction_instance.v
@@ -178,6 +178,9 @@ Section Wast.
     | VAL_ref (VAL_ref_extern _) => true
     | _ => false
     end.
+
+  Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) :=
+    v128_extract_lanes sh v.
   
 End Wast.
 
diff --git a/theories/instantiation_properties.v b/theories/instantiation_properties.v
index 4008c6b7..1a1a2cf4 100644
--- a/theories/instantiation_properties.v
+++ b/theories/instantiation_properties.v
@@ -576,7 +576,7 @@ Proof.
   move => Hred.
   unfold reduce_trans, opsem.reduce_trans in *.
   apply Operators_Properties.clos_rt_rt1n_iff in Hred.
-  inversion Hred; subst; clear Hred; first by repeat destruct v.
+  inversion Hred; subst; clear Hred; first by do 2 destruct v.
   destruct y as [[[??]?]?].
   unfold reduce_tuple, opsem.reduce_tuple in H.
   apply reduce_global_get in H.
diff --git a/theories/interpreter_ctx.v b/theories/interpreter_ctx.v
index 3386fedb..c1eae8a8 100644
--- a/theories/interpreter_ctx.v
+++ b/theories/interpreter_ctx.v
@@ -564,6 +564,7 @@ Proof.
       (* BI_load_vec lvarg marg *) lvarg marg |
       (* BI_load_vec_lane width marg laneidx *) width marg x |
       (* BI_store t [Some tp|None] marg *) t op marg |
+      (* BI_store_vec marg *) marg |
       (* BI_store_vec_lane width marg laneidx *) width marg x |
       (* BI_memory_size *) |
       (* BI_memory_grow *) |
@@ -1266,6 +1267,18 @@ the condition that all values should live in the operand stack. *)
           resolve_reduce_ctx vs0 es0.
           by eapply r_store_failure; subst; eauto.
           
+    (* AI_basic (BI_store_vec marg) *)
+    - destruct vs0 as [|v2 [|v1 vs0]]; try by no_args.
+      assert_i32 v1.
+      assert_value_vec v2.
+      destruct (smem_store_vec s fc.(FC_frame).(f_inst) ($nou32 v1) v2 marg) as [s' | ] eqn:Hstore_vec.
+      - apply <<hs, (s', (fc, lcs) :: ccs', (vs0, es0), None), d>> => //.
+        resolve_reduce_ctx vs0 es0.
+        by eapply r_store_vec_success; subst; eauto.
+      - apply <<hs, (s, (fc, lcs) :: ccs', (vs0, es0), Some AI_trap), d>> => //.
+        resolve_reduce_ctx vs0 es0.
+        by eapply r_store_vec_failure; subst; eauto.
+        
     (* AI_basic (BI_store_vec_lane width marg x) *)
     - destruct vs0 as [|v2 [|v1 vs0]]; try by no_args.
       assert_i32 v1.
diff --git a/theories/operations.v b/theories/operations.v
index e5ce9591..028cac78 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -91,41 +91,25 @@ Definition mem_grow (m : meminst) (len_delta : N) : option meminst :=
     end
   else None.
 
-Definition ptv_to_nm (ptv: vpacked_type) : N * N :=
-  match ptv with
-  | Tptv_8_8 => (8%N, 8%N)
-  | Tptv_16_4 => (16%N, 4%N)
-  | Tptv_32_2 => (32%N, 2%N)
-  end.
-
-Definition ztv_to_n (ztv: vzero_type) : N :=
-  match ztv with
-  | Tztv_32 => 32%N
-  | Tztv_64 => 64%N
-  end.
-
-Definition width_to_n (ww: vwidth) : N :=
-  match ww with
-  | Twv_8 => 8%N
-  | Twv_16 => 16%N
-  | Twv_32 => 32%N
-  | Twv_64 => 64%N
-  end.
-
 Definition load_vec_bounds (lv_arg: load_vec_arg) (m_arg: memarg) : bool :=
   match lv_arg with
   | LVA_packed ptv sx =>
-      let (n, m) := ptv_to_nm ptv in
+      let (n, m) := ptv in
       N.leb (N.pow 2 m_arg.(memarg_align)) (N.mul (N.div n 8) m)
   | LVA_zero ztv =>
-      N.leb (N.pow 2 m_arg.(memarg_align)) (N.div (ztv_to_n ztv) 8)
+      N.leb (N.pow 2 m_arg.(memarg_align)) (N.div (ztv) 8)
   | LVA_splat width =>
-      N.leb (N.pow 2 m_arg.(memarg_align)) (N.div (width_to_n width) 8)
+      N.leb (N.pow 2 m_arg.(memarg_align)) (N.div (width) 8)
+  | LVA_none =>
+      N.leb (N.pow 2 m_arg.(memarg_align)) 16%N
   end.
 
-Definition load_vec_lane_bounds (width: vwidth) (m_arg: memarg) (x: laneidx) : bool :=
-   (N.leb (N.pow 2 (memarg_align m_arg)) (width_to_n width / 8))%N &&
-                         (N.ltb x (128 / width_to_n width)%N). 
+Definition load_store_vec_lane_bounds (width: vwidth) (m_arg: memarg) (x: laneidx) : bool :=
+   (N.leb (N.pow 2 (memarg_align m_arg)) (width / 8))%N &&
+     (N.ltb x (128 / width)%N).
+
+Definition store_vec_bounds (m_arg: memarg) : bool :=
+      N.leb (N.pow 2 m_arg.(memarg_align)) 16%N.
 
 (* TODO: We crucially need documentation here. *)
 
@@ -151,7 +135,7 @@ Definition load_vec (m: meminst) (i: N) (lvarg: load_vec_arg) (marg: memarg) : o
         let bs = int_of_bytes (take (drop m
   if ea + *)
   match (i == marg.(memarg_offset)) with
-  | true => Some (VAL_vec128 tt)
+  | true => Some (VAL_vec128 SIMD.v128_default)
   | _ => None
   end.
 
@@ -170,6 +154,12 @@ Definition store (m : meminst) (n : N) (off : static_offset) (bs : bytes) (l : n
 Might need a change in the future if this behaviour changes. *)
 Definition store_packed := store.
 
+Definition store_vec (m: meminst) (n: N) (v: value_vec) (marg: memarg) : option meminst :=
+  match (n == marg.(memarg_offset)) with
+  | true => Some m
+  | _ => None
+  end.
+
 Definition store_vec_lane (m: meminst) (n: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option meminst :=
   match (n == marg.(memarg_offset)) with
   | true => Some m
@@ -195,6 +185,7 @@ Definition sign_extend_n (n: N) (bytelen: N) : Z :=
   val_z.
 
 (* l is the byte length of the target type, therefore can only be 4/8 *)
+(* v128 also uses this function for i8/i16 decoding *)
 Definition sign_extend_bytes (s : sx) (l : N) (bs : bytes) : bytes :=
   match s with
   | SX_U => bytes_takefill #00 (N.to_nat l) bs
@@ -231,7 +222,7 @@ https://www.w3.org/TR/wasm-core-2/exec/runtime.html#default-val
 Definition default_val (t: value_type) : option value :=
   match t with
   | T_num t => Some (VAL_num (bitzero t))
-  | T_vec t => Some (VAL_vec (VAL_vec128 tt))
+  | T_vec t => Some (VAL_vec (VAL_vec128 SIMD.v128_default))
   | T_ref t => Some (VAL_ref (VAL_ref_null t))
   | T_bot => None
   end.
@@ -558,7 +549,7 @@ Definition app_vshiftop (sh: vshape) (op: vshiftop) (v1: value_vec) (v2: i32) :
   v1.
 
 Definition app_splat_vec (sh: vshape) (v1: value_num) : value_vec :=
-  VAL_vec128 tt.
+  VAL_vec128 SIMD.v128_default.
 
 Definition app_extract_vec (sh: vshape) (s: option sx) (n: laneidx) (v1: value_vec) : value_num :=
   match sh with
@@ -577,6 +568,22 @@ Definition app_extract_vec (sh: vshape) (s: option sx) (n: laneidx) (v1: value_v
 Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
   v1.
 
+Definition shape_width (sh: vshape) : N :=
+  match sh with
+  | VS_i vsi =>
+      match vsi with
+      | VSI_8_16 => 8
+      | VSI_16_8 => 16
+      | VSI_32_4 => 32
+      | VSI_64_2 => 64
+      end
+  | VS_f vsf =>
+      match vsf with
+      | VSF_32_4 => 32
+      | VSF_64_2 => 64
+      end
+  end.
+
 Definition shape_dim (sh: vshape) : N :=
   match sh with
   | VS_i vsi =>
@@ -593,6 +600,14 @@ Definition shape_dim (sh: vshape) : N :=
       end
   end.
 
+(* Sanity check *)
+Lemma shape_width_dim_match:
+  forall (sh: vshape), shape_width sh * shape_dim sh = 128%N.
+Proof.
+  by destruct sh as [[] | []].
+Qed.
+
+
 Definition rglob_is_mut (g : module_global) : bool :=
   g.(modglob_type).(tg_mut) == MUT_var.
 
@@ -773,6 +788,21 @@ Definition smem_store_packed (s: store_record) (inst: moduleinst) (n: N) (off: s
   | None => None
   end.
 
+Definition smem_store_vec (s: store_record) (inst: moduleinst) (n: N) (v: value_vec) (marg: memarg) : option store_record :=
+  match lookup_N inst.(inst_mems) 0%N with
+  | Some addr =>
+      match lookup_N s.(s_mems) addr with
+      | Some mem =>
+        match store_vec mem n v marg with
+        | Some mem' =>
+           Some (upd_s_mem s (set_nth mem' s.(s_mems) (N.to_nat addr) mem'))
+        | None => None
+        end
+      | None => None
+      end
+  | None => None
+  end.
+
 Definition smem_store_vec_lane (s: store_record) (inst: moduleinst) (n: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option store_record :=
   match lookup_N inst.(inst_mems) 0%N with
   | Some addr =>
@@ -1325,6 +1355,70 @@ Definition cvtop_valid (t2: number_type) (op: cvtop) (t1: number_type) (s: optio
   ((op == CVO_promote) && (t2 == T_f64) && (t1 == T_f32) && (s == None)) ||
   ((op == CVO_reinterpret) && ((is_int_t t2 && is_float_t t1) || (is_float_t t2 && is_int_t t1)) && (s == None)).
 
+Program Fixpoint v128_extract_bytes (width dim: N) (v: SIMD.v128) {measure (N.to_nat dim)}: list bytes :=
+  match dim with
+  | 0%N => nil
+  | _ =>
+      let bytes := take width v in
+      let bytes_remaining := drop width v in
+      cons bytes (v128_extract_bytes width (N.pred dim) bytes_remaining)
+  end.
+Next Obligation.
+  by lias.
+Qed.
+
+Definition unpacked (sh: vshape) : number_type :=
+    match sh with
+    | VS_i VSI_64_2 => T_i64
+    | VS_i _ => T_i32
+    | VS_f VSF_64_2 => T_f64
+    | VS_f VSF_32_4 => T_f32
+    end.
+
+(* Extract values from a v128 *)
+Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) : list value_num :=
+  let byte_width := N.div (shape_width sh) 8%N in
+  let dim := shape_dim sh in
+  let v128_bytes_grouped := v128_extract_bytes byte_width dim v in
+  let vt := unpacked sh in
+  match sh with
+  | VS_f _ =>
+      map (fun bs => wasm_deserialise bs vt) v128_bytes_grouped
+  | VS_i VSI_64_2 =>
+      map (fun bs =>
+             let val_n := BinInt.Z.to_N (common.Memdata.decode_int bs) in
+             let val_z := sign_extend_n val_n byte_width in
+             VAL_int64 (Wasm_int.Int64.repr val_z)) v128_bytes_grouped
+  (* All the other shapes extract to Int32. *)
+  | VS_i _ =>
+      map (fun bs =>
+             let val_n := BinInt.Z.to_N (common.Memdata.decode_int bs) in
+             let val_z := sign_extend_n val_n byte_width in
+             VAL_int32 (Wasm_int.Int32.repr val_z)) v128_bytes_grouped
+  end.
+
+Definition v128_serialise (sh: vshape) (vs: list value_num) : SIMD.v128 :=
+  let value_bytes := map bits vs in
+  match sh with
+  | VS_f _
+  | VS_i VSI_64_2
+  | VS_i VSI_32_4 =>
+      List.concat (map bits vs)
+  (* i16x8 and i8x16 needs truncating bytes before merging. *)
+  | VS_i VSI_16_8 =>
+      List.concat (map (fun v => match v with
+                              | VAL_int32 vi => serialise_i16 vi
+                              | _ => nil
+                              end
+                     ) vs)
+  | VS_i VSI_8_16 =>
+      List.concat (map (fun v => match v with
+                              | VAL_int32 vi => serialise_i8 vi
+                              | _ => nil
+                              end
+                     ) vs)
+  end.
+
 End Operations.
 
 #[export]
diff --git a/theories/opsem.v b/theories/opsem.v
index 4d8468b0..8e45fe32 100644
--- a/theories/opsem.v
+++ b/theories/opsem.v
@@ -527,6 +527,14 @@ Inductive reduce : host_state -> store_record -> frame -> list administrative_in
     smem_store_packed s f.(f_inst) (Wasm_int.N_of_uint i32m k) marg.(memarg_offset) v tp = None ->
     typeof_num v = t ->
     reduce hs s f [::$VN (VAL_int32 k); $VN v; AI_basic (BI_store t (Some tp) marg)] hs s f [::AI_trap]
+| r_store_vec_success :
+  forall s s' f marg k v hs,
+    smem_store_vec s f.(f_inst) (Wasm_int.N_of_uint i32m k) v marg = Some s' ->
+    reduce hs s f [::$VN (VAL_int32 k); $VV v; AI_basic (BI_store_vec marg)] hs s' f [::]
+| r_store_vec_failure :
+  forall s f marg k v hs,
+    smem_store_vec s f.(f_inst) (Wasm_int.N_of_uint i32m k) v marg = None ->
+    reduce hs s f [::$VN (VAL_int32 k); $VV v; AI_basic (BI_store_vec marg)] hs s f [::AI_trap]
 | r_store_vec_lane_success :
   forall s s' f width marg x k v hs,
     smem_store_vec_lane s f.(f_inst) (Wasm_int.N_of_uint i32m k) v width marg x = Some s' ->
diff --git a/theories/pp.v b/theories/pp.v
index 2133e712..1c696241 100644
--- a/theories/pp.v
+++ b/theories/pp.v
@@ -376,9 +376,12 @@ Definition pp_value_num (v : value_num) : string :=
   | VAL_float64 f => pp_number_type T_f64 ++ ".const " ++ with_fg FG_green (pp_f64 f) ++ newline
   end.
 
+Definition pp_v128 (v: SIMD.v128) : string :=
+  hex_small_no_prefix_of_bytes_compact (map byte_of_compcert_byte v).
+
 Definition pp_value_vec (v : value_vec) : string :=
   match v with
-  | VAL_vec128 t => pp_vector_type T_v128 ++ ".const" ++ with_fg FG_yellow " (unimplemented)" ++ newline
+  | VAL_vec128 bs => pp_vector_type T_v128 ++ ".const " ++ pp_v128 bs ++ newline
   end.
 
 Definition pp_value_ref (v : value_ref) : string :=
@@ -547,6 +550,9 @@ Definition pp_load_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   "(not implemented)".
 
 (* store_vec_lane and load_vec uses the same args. Maybe it's better to find a new name *)
+Definition pp_store_vec (marg: memarg) :=
+  "(not implemented)".
+
 Definition pp_store_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
   "(not implemented)".
 
@@ -689,6 +695,8 @@ Fixpoint pp_basic_instruction (i : indentation) (be : basic_instruction) : strin
       indent i (pp_load_vec lvarg marg)
   | BI_load_vec_lane width marg lanex =>
       indent i (pp_load_vec_lane width marg lanex)
+  | BI_store_vec marg =>
+      indent i (pp_store_vec marg)
   | BI_store_vec_lane width marg lanex =>
       indent i (pp_store_vec_lane width marg lanex)
   end.
@@ -783,7 +791,6 @@ Definition pp_store (n : indentation) (s : store_record) : string :=
   indent n ("tables" ++ newline) ++
   pp_tables (n.+1) s.(s_tables).
 
-(* XXX disambiguate between cfg/res tuple with/without hs? *)
 Definition pp_config_tuple_except_store (cfg : store_record * frame * list administrative_instruction) : string :=
   let '(s, f, es) := cfg in
   pp_administrative_instructions_hint_empty 0 es ++
@@ -802,9 +809,9 @@ Definition pp_res_cfg_except_store {hs: host_state} {cfg: cfg_tuple_ctx} {d: N}
   | RSC_trap _ _ _ =>
       "Trap" ++ newline
   | RSC_invalid _ =>
-      "Invalid context decomposition. This result should not be observed when invoking valid Wasm module functions without arguments. Please submit a bug report at GitHub/WasmCert-Coq." ++ newline
+      "Invalid context decomposition. This result should not be observed when invoking valid Wasm module functions. Please submit a bug report at GitHub/WasmCert-Coq." ++ newline
   | RSC_error _ =>
-      "Ill-typed input configuration. This result should not be observed when invoking valid Wasm module functions without arguments. Please submit a bug report at GitHub/WasmCert-Coq." ++ newline
+      "Ill-typed input configuration. This result should not be observed when invoking valid Wasm module functions. Please submit a bug report at GitHub/WasmCert-Coq." ++ newline
   end.
 
 End Host.
diff --git a/theories/simd.v b/theories/simd.v
index de23690c..42a027a0 100644
--- a/theories/simd.v
+++ b/theories/simd.v
@@ -6,8 +6,18 @@ Set Implicit Arguments.
 Unset Strict Implicit.
 Unset Printing Implicit Defensive.
 
-Module Type SIMD.
+Module Type SIMD_Type.
 
-Parameter v128: Type.
+  Parameter v128: Type.
 
+  Parameter v128_default: v128.
+
+End SIMD_Type.
+
+Module SIMD <: SIMD_Type.
+
+  Definition v128 := bytes.
+
+  Definition v128_default : v128 := nil.
+  
 End SIMD.
diff --git a/theories/tactic.v b/theories/tactic.v
index c0b79039..957b0f3c 100644
--- a/theories/tactic.v
+++ b/theories/tactic.v
@@ -187,6 +187,8 @@ Ltac unfold_store_operations :=
         unfold smem_store in *; remove_bools_options
     | _: context [ smem_store_packed _ _ ] |- _ =>
         unfold smem_store_packed in *; remove_bools_options
+    | _: context [ smem_store_vec _ _ ] |- _ =>
+        unfold smem_store_vec in *; remove_bools_options
     | _: context [ smem_store_vec_lane _ _ ] |- _ =>
         unfold smem_store_vec_lane in *; remove_bools_options
     | _: context [ smem_grow _ _ ] |- _ =>
diff --git a/theories/text_format_parser.v b/theories/text_format_parser.v
index 512a8b5b..bd9c53f3 100644
--- a/theories/text_format_parser.v
+++ b/theories/text_format_parser.v
@@ -349,12 +349,83 @@ Definition parse_ref {n} : byte_parser value_ref n :=
     exact_string "ref.func " &> ((fun a => VAL_ref_func a) <$> parse_addr_text) <|>
     exact_string "ref.extern " &> ((fun a => VAL_ref_extern a) <$> parse_addr_text).
 
+Definition parse_vshape {n} : byte_parser vshape n :=
+  exact_string "i8x16" $> VS_i VSI_8_16 <|>
+  exact_string "i16x8" $> VS_i VSI_16_8 <|>
+  exact_string "i32x4" $> VS_i VSI_32_4 <|>
+  exact_string "i64x2" $> VS_i VSI_64_2 <|>
+  exact_string "f32x4" $> VS_f VSF_32_4 <|>
+  exact_string "f64x2" $> VS_f VSF_64_2.
+
+Fixpoint parse_vec_aux_positive {B} {n} (f: byte_parser B n) (p: positive) :
+  byte_parser (list B) n :=
+  match p with
+  | xH => (fun x => cons x nil) <$> f
+  | xO p' => (fun '(l1, l2) => List.app l1 l2) <$> (parse_vec_aux_positive f p' &>>= (fun _ => parse_vec_aux_positive f p'))
+  | xI p' => (((fun '(h, (l1, l2)) => cons h (List.app l1 l2))) <$> (f &>>= (fun _ => (parse_vec_aux_positive f p' &>>= (fun _ => parse_vec_aux_positive f p')))))
+  end.
+
+Definition parse_lanes_int {n} (lanes_p: positive) : byte_parser (list Z) n :=
+  parse_vec_aux_positive
+    (exact_string " " &> parse_uninterpreted_int) lanes_p.
+
+Definition parse_lanes_float {n} ftype fmixin (lanes_p: positive): byte_parser (list ftype) n :=
+  parse_vec_aux_positive
+    (exact_string " " &> parse_float ftype fmixin) lanes_p.
+
+Definition shape_dim_pos (sh: vshape) : positive :=
+  match sh with
+  | VS_i vsi =>
+      match vsi with
+      | VSI_8_16 => 16
+      | VSI_16_8 => 8
+      | VSI_32_4 => 4
+      | VSI_64_2 => 2
+      end
+  | VS_f vsf =>
+      match vsf with
+      | VSF_32_4 => 4
+      | VSF_64_2 => 2
+      end
+  end.
+
+Definition parse_lanes {n} (sh: vshape) : byte_parser (list value_num) n :=
+  match sh with
+  | VS_i VSI_64_2 =>
+      (fun zs => List.map (fun z => VAL_int64 (Wasm_int.Int64.repr z)) zs) <$>
+      (parse_lanes_int 2%positive)
+  | VS_f VSF_64_2 =>
+      (fun fs => List.map (fun f => VAL_float64 f) fs) <$>
+      (parse_lanes_float f64 f64m 2%positive)
+  | VS_f VSF_32_4 =>
+      (fun fs => List.map (fun f => VAL_float32 f) fs) <$>
+      (parse_lanes_float f32 f32m 4%positive)
+  | VS_i _ =>
+      (fun zs => List.map (fun z => VAL_int32 (Wasm_int.Int32.repr z)) zs) <$>
+      (parse_lanes_int (shape_dim_pos sh))
+  end.
+
+Definition parse_shape {n} : byte_parser vshape n :=
+  exact_string "i8x16" $> VS_i VSI_8_16 <|>
+  exact_string "i16x8" $> VS_i VSI_16_8 <|>
+  exact_string "i32x4" $> VS_i VSI_32_4 <|>
+  exact_string "i64x2" $> VS_i VSI_64_2 <|>
+  exact_string "f32x4" $> VS_f VSF_32_4 <|>
+  exact_string "f64x2" $> VS_f VSF_64_2.
+
+Definition parse_v128 {n} : byte_parser value_vec n :=
+  exact_string "v128.const " &>
+    parse_shape >>=
+    (fun sh => (VAL_vec128 <$> ((v128_serialise sh) <$> (parse_lanes sh)
+      ))).
+
 Definition parse_arg {n} : byte_parser datatypes.value n :=
   parse_i32_sym &> parse_dotconst &> exact_byte " " &> ((fun z => VAL_num (VAL_int32 (Wasm_int.Int32.repr z))) <$> parse_uninterpreted_int) <|>
   parse_i64_sym &> parse_dotconst &> exact_byte " " &> ((fun z => VAL_num (VAL_int64 (Wasm_int.Int64.repr z))) <$> parse_uninterpreted_int) <|>
   parse_f32_sym &> parse_dotconst &> exact_byte " " &> ((fun f => VAL_num (VAL_float32 f)) <$> parse_float f32 f32m) <|>
   parse_f64_sym &> parse_dotconst &> exact_byte " " &> ((fun f => VAL_num (VAL_float64 f)) <$> parse_float f64 f64m) <|>
-  VAL_ref <$> parse_ref.
+  VAL_ref <$> parse_ref <|>
+  VAL_vec <$> parse_v128.
 
 Record Language (n : nat) : Type := MkLanguage {
   _parse_arg: byte_parser datatypes.value n;
diff --git a/theories/type_checker.v b/theories/type_checker.v
index 257562ca..6f7e4445 100644
--- a/theories/type_checker.v
+++ b/theories/type_checker.v
@@ -413,7 +413,7 @@ Fixpoint check_single (C : t_context) (ct : option checker_type) (be : basic_ins
       | BI_load_vec_lane width marg x =>
           match lookup_N C.(tc_mems) 0%N with
           | Some _ =>
-              if load_vec_lane_bounds width marg x
+              if load_store_vec_lane_bounds width marg x
               then type_update ts [:: T_vec T_v128; T_num T_i32] [::T_vec T_v128]
               else None
           | None => None
@@ -426,10 +426,18 @@ Fixpoint check_single (C : t_context) (ct : option checker_type) (be : basic_ins
               else None
           | None => None
           end
+      | BI_store_vec marg =>
+          match lookup_N C.(tc_mems) 0%N with
+          | Some _ =>
+              if store_vec_bounds marg
+              then type_update ts [::T_vec T_v128; T_num T_i32] [::]
+              else None
+          | None => None
+          end
       | BI_store_vec_lane width marg x =>
           match lookup_N C.(tc_mems) 0%N with
           | Some _ =>
-              if load_vec_lane_bounds width marg x
+              if load_store_vec_lane_bounds width marg x
               then type_update ts [::T_vec T_v128; T_num T_i32] [::]
               else None
           | None => None
diff --git a/theories/type_checker_reflects_typing.v b/theories/type_checker_reflects_typing.v
index addd0c3d..4daf7c6c 100644
--- a/theories/type_checker_reflects_typing.v
+++ b/theories/type_checker_reflects_typing.v
@@ -1380,6 +1380,8 @@ Proof.
     + by resolve_tc_be_typing.
     (* Store *)
     + by resolve_tc_be_typing.
+    (* Store_vec *)
+    + by resolve_tc_be_typing.
     (* Store_vec_lane *)
     + by resolve_tc_be_typing.
     (* Memory_size *)
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index e9b5daac..afcc4aba 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -1287,6 +1287,51 @@ Proof.
     + by apply nth_error_Some_length in Hoption2; lias.
 Qed.
 
+Lemma smem_store_vec_extension: forall s f n v marg s' C mt,
+  store_typing s ->
+  smem_store_vec s (f_inst f) n v marg = Some s' ->
+  inst_typing s (f_inst f) = Some C -> 
+  lookup_N (tc_mems C) 0%N = Some mt ->
+  store_extension s s'.
+Proof.
+  move => s f n v marg s' C a Hst Hupd Hit Hnth.
+  unfold store_extension; unfold_store_operations => /=; resolve_store_extension; resolve_store_inst_lookup; remove_bools_options => /=.
+  rewrite Hnth in Hnthmt; injection Hnthmt as ->.
+  unfold store_vec in Hoption1.
+  destruct m0; simpl in *; remove_bools_options; simpl in *.
+  eapply component_extension_update; eauto; first by apply mem_extension_refl.
+  by apply mem_extension_refl.
+Qed.
+
+Lemma smem_store_vec_typing: forall s f n v marg s' C mt,
+  store_typing s ->
+  smem_store_vec s (f_inst f) n v marg = Some s' ->
+  inst_typing s (f_inst f) = Some C -> 
+  lookup_N (tc_mems C) 0%N = Some mt ->
+  store_typing s'.
+Proof.
+  move => s f n v marg s' C a Hst Hupd Hit Hnth.
+  assert (store_extension s s') as Hstext; first by eapply smem_store_vec_extension; eauto.
+  unfold_store_operations; remove_bools_options.
+  unfold store_vec in Hoption1.
+  resolve_store_inst_lookup; destruct m0; simpl in *; remove_bools_options; simpl in *.
+  rewrite Hnthmt in Hnth; injection Hnth as <-.
+  unfold store_typing in *; destruct s; simpl in *.
+  destruct Hst as [Hft [Htt [Hmt [Hgt [Het Hdt]]]]].
+  resolve_store_typing; simpl in *; clear Hft Htt Hgt Het Hdt.
+  apply List.Forall_forall.
+  move => x0 Hin.
+  apply set_nth_In in Hin.
+  destruct Hin as [-> | [m0 [Hneq Hnth']]] => //=.
+  - exists mt.
+    rewrite Hif.
+    resolve_if_true_eq.
+    by rewrite Hif0.
+  - eapply List.Forall_forall in Hmt; eauto.
+    + by apply List.nth_error_In in Hnth'.
+    + by apply nth_error_Some_length in Hoption2; lias.
+Qed.
+
 Lemma smem_store_vec_lane_extension: forall s f n v width marg x s' C mt,
   store_typing s ->
   smem_store_vec_lane s (f_inst f) n v width marg x = Some s' ->
@@ -1456,6 +1501,9 @@ Proof.
   - (* memory store_packed *)
     eapply smem_store_packed_extension; eauto.
     by unfold inst_match in Hmatch; remove_bools_options; uapply Hconjl0; f_equal.
+  - (* memory store_vec *)
+    eapply smem_store_vec_extension; eauto.
+    by unfold inst_match in Hmatch; remove_bools_options; uapply Hconjl0; f_equal.
   - (* memory store_vec_lane *)
     eapply smem_store_vec_lane_extension; eauto.
     by unfold inst_match in Hmatch; remove_bools_options; uapply Hconjl0; f_equal.
@@ -1510,7 +1558,10 @@ Proof.
   - (* memory store_packed *)
     eapply smem_store_packed_typing; eauto.
     by unfold inst_match in Hmatch; remove_bools_options; uapply Hconjl0; f_equal.
-  - (* memory store_vec_lane *)
+  - (* memory store_vec *)
+    eapply smem_store_vec_typing; eauto.
+    by unfold inst_match in Hmatch; remove_bools_options; uapply Hconjl0; f_equal.
+  - (* memory store_vec *)
     eapply smem_store_vec_lane_typing; eauto.
     by unfold inst_match in Hmatch; remove_bools_options; uapply Hconjl0; f_equal.
   - (* memory grow *)
diff --git a/theories/typing.v b/theories/typing.v
index 0ad1e7b6..be93b431 100644
--- a/theories/typing.v
+++ b/theories/typing.v
@@ -349,15 +349,19 @@ Inductive be_typing : t_context -> seq basic_instruction -> instr_type -> Prop :
   be_typing C [::BI_load_vec lv_arg m_arg] (Tf [::T_num T_i32] [::T_vec T_v128])
 | bet_load_vec_lane : forall C width m_arg x mem,
   lookup_N (tc_mems C) 0%N = Some mem ->
-  load_vec_lane_bounds width m_arg x ->
+  load_store_vec_lane_bounds width m_arg x ->
   be_typing C [::BI_load_vec_lane width m_arg x] (Tf [::T_num T_i32; T_vec T_v128] [::T_vec T_v128])
 | bet_store : forall C m_arg tp t mem,
   lookup_N (tc_mems C) 0%N = Some mem ->
   load_store_t_bounds m_arg.(memarg_align) tp t ->
   be_typing C [::BI_store t tp m_arg] (Tf [::T_num T_i32; T_num t] [::])
+| bet_store_vec : forall C m_arg mem,
+  lookup_N (tc_mems C) 0%N = Some mem ->
+  store_vec_bounds m_arg ->
+  be_typing C [::BI_store_vec m_arg] (Tf [::T_num T_i32; T_vec T_v128] [::])
 | bet_store_vec_lane : forall C width m_arg x mem,
   lookup_N (tc_mems C) 0%N = Some mem ->
-  load_vec_lane_bounds width m_arg x ->
+  load_store_vec_lane_bounds width m_arg x ->
   be_typing C [::BI_store_vec_lane width m_arg x] (Tf [::T_num T_i32; T_vec T_v128] [::])
 | bet_memory_size : forall C mem,
   lookup_N (tc_mems C) 0%N = Some mem ->
diff --git a/theories/typing_inversion.v b/theories/typing_inversion.v
index 998965cc..8d81735c 100644
--- a/theories/typing_inversion.v
+++ b/theories/typing_inversion.v
@@ -259,17 +259,22 @@ Definition be_principal_typing (C: t_context) (be: basic_instruction) (tf: instr
       exists mt,
       tf = (Tf [::T_num T_i32; T_vec T_v128] [::T_vec T_v128]) /\
         lookup_N (tc_mems C) 0%N = Some mt /\
-        load_vec_lane_bounds width marg x
+        load_store_vec_lane_bounds width marg x
   | BI_store t tp marg =>
       exists mt,
       tf = (Tf [::T_num T_i32; T_num t] [::]) /\
         lookup_N (tc_mems C) 0%N = Some mt /\
         load_store_t_bounds marg.(memarg_align) tp t
+  | BI_store_vec marg =>
+      exists mt,
+      tf = (Tf [::T_num T_i32; T_vec T_v128] [::]) /\
+        lookup_N (tc_mems C) 0%N = Some mt /\
+        load_vec_bounds LVA_none marg
   | BI_store_vec_lane width marg x =>
       exists mt,
       tf = (Tf [::T_num T_i32; T_vec T_v128] [::]) /\
         lookup_N (tc_mems C) 0%N = Some mt /\
-        load_vec_lane_bounds width marg x
+        load_store_vec_lane_bounds width marg x
   | BI_memory_size =>
       exists mt,
       tf = (Tf [::] [::T_num T_i32]) /\

From ed70d597ba9c3102036beb17dffc567205a66301 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Tue, 12 Aug 2025 10:14:51 -0700
Subject: [PATCH 04/19] redesigned numeric vector ops to be encoded by binary
 format instead (going towards using parametric implementation)

---
 theories/binary_format_parser.v  | 202 ++++++++++++++++++++++++++++++-
 theories/binary_format_printer.v |  20 +--
 theories/datatypes.v             |  51 +++++++-
 theories/interpreter_ctx.v       |  30 ++---
 theories/operations.v            |  10 +-
 theories/opsem.v                 |  20 +--
 theories/pp.v                    |  59 ++++-----
 theories/type_checker.v          |  10 +-
 theories/type_preservation.v     |   8 +-
 theories/typing.v                |  20 +--
 theories/typing_inversion.v      |  10 +-
 11 files changed, 342 insertions(+), 98 deletions(-)

diff --git a/theories/binary_format_parser.v b/theories/binary_format_parser.v
index 66841be8..86ed98dc 100644
--- a/theories/binary_format_parser.v
+++ b/theories/binary_format_parser.v
@@ -504,6 +504,203 @@ Definition parse_f64_const {n} : be_parser n :=
 Definition parse_v128_const {n} : be_parser n :=
   exact_byte xfd &> assert_u32 12%N &> ((fun x => BI_const_vec (VAL_vec128 x)) <$> parse_16bytes).
 
+(*
+Inductive vvunop : Set :=
+  | VVU_not
+  .
+
+Inductive viunop : Set :=
+  | VUOI_abs
+  | VUOI_neg
+  | VUOI_popcnt (* has shape constraints: i8x16 *)
+  .
+
+Inductive vfunop : Set :=
+  | VUOF_abs
+  | VUOF_neg
+  | VUOF_sqrt
+  | VUOF_ceil
+  | VUOF_floor
+  | VUOF_trunc
+  | VUOF_nearest
+  .
+
+(* Indicate whether a cvtop has a zero flag. *)
+Inductive vec_zero : Set :=
+  | VZ_zero
+  .
+  
+Inductive vcvtop : Set :=
+  | VCVT_extend : vec_half -> sx -> vcvtop
+  | VCVT_trunc_sat : sx -> option vec_zero -> vcvtop
+  | VCVT_convert : option vec_half -> sx -> vcvtop
+  | VCVT_demote : vec_zero -> vcvtop (* Superficial tag since zero has to be present for demote; only available on f64x2 *)
+  | VCVT_promote (* only available on LOW half and f32x4 *)
+  .
+  
+Inductive vunop : Set :=
+  | VI_unop: viunop -> vunop
+  | VF_unop: vfunop -> vunop
+  | VV_unop: vvunop -> vunop
+  | V_cvtop: vcvtop -> vunop
+  | V_extadd_pairwise: sx -> vunop (* i16 and i32 only *)
+*)
+
+(* Slightly stupid, but no better solution without defining the concrete constructors *)
+
+Section Opcode_classes.
+
+  Open Scope N_scope.
+
+  Local Notation " x == y " := (N.eqb x y) (at level 5).
+  Local Notation " x >= y " := (N.leb y x) (at level 70).
+  Local Notation " x <= y " := (N.leb x y) (at level 70).
+
+Definition opcode_is_unop (n: N) : bool :=
+  (* v128 *)
+  (n == 77) ||
+  (* i8*16 *)
+    (n == 96) ||
+    (n == 97) ||
+    (n == 98) ||
+    (* i16x8 *)
+    (n == 128) ||
+    (n == 129) ||
+    (* i32x2 *)
+    (n == 160) ||
+    (n == 161) ||
+    (* i64x2 *)
+    (n == 192) ||
+    (n == 193) ||
+    (* f32x4 *)
+    ((n >= 103) && (n <= 106)) ||
+    (n == 224) ||
+    (n == 225) ||
+    (n == 227) ||
+    (* f64x2 *)
+    ((n >= 116) && (n <= 117)) ||
+    (n == 122) ||
+    (n == 248) ||
+    (n == 236) ||
+    (n == 237) ||
+    (n == 239) ||
+    (* cvtop *)
+    ((n >= 248) && (n <= 255)) ||
+    ((n >= 94) && (n <= 95)) ||
+    (* extend *)
+    ((n >= 135) && (n <= 138)) ||
+    ((n >= 167) && (n <= 170)) ||
+    ((n >= 199) && (n <= 202)).
+    
+Definition opcode_is_binop (n: N) : bool :=
+  (n == 14) ||
+    ((n >= 35) && (n <= 76)) ||
+    ((n >= 78) && (n <= 81)) ||
+    ((n >= 214) && (n <= 219)) ||
+    ((n >= 101) && (n <= 102)) ||
+    ((n >= 110) && (n <= 115)) ||
+    ((n >= 118) && (n <= 121)) || (n == 123) ||
+    ((n >= 124) && (n <= 125)) || (n == 130) ||
+    ((n >= 133) && (n <= 134)) ||
+    ((n >= 142) && (n <= 147)) ||
+    ((n >= 149) && (n <= 153)) ||
+    ((n >= 155) && (n <= 159)) ||
+    ((n >= 126) && (n <= 127)) ||
+    (n == 174) ||
+    (n == 177) ||
+    ((n >= 181) && (n <= 186)) ||
+    ((n >= 188) && (n <= 191)) ||
+    (n == 206) ||
+    (n == 209) ||
+    (n == 213) ||
+    ((n >= 220) && (n <= 223)) ||
+    ((n >= 228) && (n <= 235)) ||
+    ((n >= 240) && (n <= 247)).
+
+
+Definition opcode_is_ternop (n: N) : bool :=
+  (n == 82).
+
+Definition opcode_is_testop (n: N) : bool :=
+  (n == 83) ||
+    (n == 99) || (n == 100) ||
+    (n == 131) || (n == 132) ||
+    (n == 163) || (n == 164) ||
+    (n == 195) || (n == 196).
+
+Definition opcode_is_shiftop (n: N) : bool :=
+  ((n >= 107) && (n <= 109)) ||
+    ((n >= 139) && (n <= 141)) ||
+    ((n >= 171) && (n <= 173)) ||
+    ((n >= 203) && (n <= 205)).
+
+Close Scope N_scope.
+
+End Opcode_classes.
+
+Definition parse_simd_unop {n} : be_parser n :=
+  guardM
+    (fun n => if opcode_is_unop n then Some (BI_vunop n) else None)
+    parse_u32_as_N.
+
+Definition parse_simd_binop {n} : be_parser n :=
+  (* shuffle: carries a list of laneids *)
+  assert_u32 13%N &> ((fun bs => BI_vbinop (13%N, List.map to_N bs)) <$> parse_vec_aux anyTok 16%N) <|>
+  (* others *)
+  guardM
+    (fun n => if opcode_is_binop n then Some (BI_vbinop (n, nil)) else None)
+    parse_u32_as_N.
+
+Definition parse_simd_ternop {n} : be_parser n :=
+  guardM
+    (fun n => if opcode_is_ternop n then Some (BI_vternop n) else None)
+    parse_u32_as_N.
+
+Definition parse_simd_testop {n} : be_parser n :=
+  guardM
+    (fun n => if opcode_is_testop n then Some (BI_vtestop n) else None)
+    parse_u32_as_N.
+
+Definition parse_simd_shiftop {n} : be_parser n :=
+  guardM
+    (fun n => if opcode_is_shiftop n then Some (BI_vshiftop n) else None)
+    parse_u32_as_N.
+
+Definition parse_simd_splat {n} : be_parser n :=
+  assert_u32 15%N $> BI_splat_vec (VS_i VSI_8_16) <|>
+  assert_u32 16%N $> BI_splat_vec (VS_i VSI_16_8) <|>
+  assert_u32 17%N $> BI_splat_vec (VS_i VSI_32_4) <|>
+  assert_u32 18%N $> BI_splat_vec (VS_i VSI_64_2) <|>
+  assert_u32 19%N $> BI_splat_vec (VS_f VSF_32_4) <|>
+  assert_u32 20%N $> BI_splat_vec (VS_f VSF_64_2).
+
+(* TBC *)
+Definition parse_extract_replace_lane {n} : be_parser n :=
+  assert_u32 21%N &> ((BI_extract_vec (VS_i VSI_8_16) (Some SX_S)) <$> parse_laneidx) <|>
+  assert_u32 22%N &> ((BI_extract_vec (VS_i VSI_8_16) (Some SX_U)) <$> parse_laneidx) <|>
+  assert_u32 23%N &> ((BI_replace_vec (VS_i VSI_8_16)) <$> parse_laneidx) <|>
+  assert_u32 24%N &> ((BI_extract_vec (VS_i VSI_16_8) (Some SX_S)) <$> parse_laneidx) <|>
+  assert_u32 25%N &> ((BI_extract_vec (VS_i VSI_16_8) (Some SX_U)) <$> parse_laneidx) <|>
+  assert_u32 26%N &> ((BI_replace_vec (VS_i VSI_16_8)) <$> parse_laneidx) <|>
+  assert_u32 27%N &> ((BI_extract_vec (VS_i VSI_32_4) None) <$> parse_laneidx) <|>
+  assert_u32 28%N &> ((BI_replace_vec (VS_i VSI_32_4)) <$> parse_laneidx) <|>
+  assert_u32 29%N &> ((BI_extract_vec (VS_i VSI_64_2) None) <$> parse_laneidx) <|>
+  assert_u32 30%N &> ((BI_replace_vec (VS_i VSI_64_2)) <$> parse_laneidx) <|>
+  assert_u32 31%N &> ((BI_extract_vec (VS_f VSF_32_4) None) <$> parse_laneidx) <|>
+  assert_u32 32%N &> ((BI_replace_vec (VS_f VSF_32_4)) <$> parse_laneidx) <|>
+  assert_u32 33%N &> ((BI_extract_vec (VS_f VSF_64_2) None) <$> parse_laneidx) <|>
+  assert_u32 34%N &> ((BI_replace_vec (VS_f VSF_64_2)) <$> parse_laneidx).
+
+(* It is only important to parse the instructions into categories by their type signatures. Execution will be delegated outside Coq. *)
+Definition parse_simd_instruction {n}: be_parser n :=
+  parse_simd_unop <|>
+  parse_simd_binop <|>
+  parse_simd_ternop <|>
+  parse_simd_testop <|>
+  parse_simd_shiftop <|>
+  parse_simd_splat <|>
+  parse_extract_replace_lane.
+
 (* :-( *)
 Definition parse_numeric_instruction {n} : be_parser n :=
   parse_i32_const <|>
@@ -657,7 +854,10 @@ Definition parse_numeric_instruction {n} : be_parser n :=
   (exact_byte xfc &> assert_u32 4%N $> (BI_cvtop T_i64 CVO_trunc_sat T_f32 (Some SX_S))) <|>
   (exact_byte xfc &> assert_u32 5%N $> (BI_cvtop T_i64 CVO_trunc_sat T_f32 (Some SX_U))) <|>
   (exact_byte xfc &> assert_u32 6%N $> (BI_cvtop T_i64 CVO_trunc_sat T_f64 (Some SX_S))) <|>
-  (exact_byte xfc &> assert_u32 7%N $> (BI_cvtop T_i64 CVO_trunc_sat T_f64 (Some SX_U))).
+  (exact_byte xfc &> assert_u32 7%N $> (BI_cvtop T_i64 CVO_trunc_sat T_f64 (Some SX_U))) <|>
+
+  (* SIMD instructions, all starting with 0xfd *)
+  exact_byte xfd &> parse_simd_instruction.
 
 Record Language (n : nat) : Type := MkLanguage {
   _be : byte_parser basic_instruction n;
diff --git a/theories/binary_format_printer.v b/theories/binary_format_printer.v
index 7520d1f2..170ee6b5 100644
--- a/theories/binary_format_printer.v
+++ b/theories/binary_format_printer.v
@@ -107,19 +107,19 @@ Definition binary_of_valvec (v: value_vec) :=
 (* placeholder for vector operations added in 2.0, to be filled in a future update
 https://webassembly.github.io/spec/core/binary/instructions.html#vector-instructions
 *)
-Definition binary_of_vunop (sh: vshape) (op: vunop) :=
+Definition binary_of_vunop (op: vunop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_vbinop (sh: vshape) (op: vbinop) :=
+Definition binary_of_vbinop (op: vbinop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_vternop (sh: vshape) (op: vternop) :=
+Definition binary_of_vternop (op: vternop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_vtestop (sh: vshape) (op: vtestop) :=
+Definition binary_of_vtestop (op: vtestop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
-Definition binary_of_vshiftop (sh: vshape) (op: vshiftop) :=
+Definition binary_of_vshiftop (op: vshiftop) :=
   xfd :: x0c :: (List.repeat x00 16).
 
 Definition binary_of_splat_vec (sh: vshape) :=
@@ -397,11 +397,11 @@ Fixpoint binary_of_be (be : basic_instruction) : list byte :=
 
   | BI_const_vec v => binary_of_valvec v
 
-  | BI_vunop sh op => binary_of_vunop sh op
-  | BI_vbinop sh op => binary_of_vbinop sh op
-  | BI_vternop sh op => binary_of_vternop sh op
-  | BI_vtestop sh op => binary_of_vtestop sh op
-  | BI_vshiftop sh op => binary_of_vshiftop sh op
+  | BI_vunop op => binary_of_vunop op
+  | BI_vbinop op => binary_of_vbinop op
+  | BI_vternop op => binary_of_vternop op
+  | BI_vtestop op => binary_of_vtestop op
+  | BI_vshiftop op => binary_of_vshiftop op
   | BI_splat_vec sh => binary_of_splat_vec sh
   | BI_extract_vec sh s lanex => binary_of_extract_vec sh s lanex
   | BI_replace_vec sh lanex => binary_of_replace_vec sh lanex
diff --git a/theories/datatypes.v b/theories/datatypes.v
index b0b85283..8b101e42 100644
--- a/theories/datatypes.v
+++ b/theories/datatypes.v
@@ -418,7 +418,7 @@ Inductive packed_type : Set := (* tp *)
   .
 
 (** SIMD Definitions and Instructions **)
-  
+  (*
 Inductive vshape_i: Set :=
   | VSI_8_16
   | VSI_16_8
@@ -525,6 +525,8 @@ Inductive vbinop : Set :=
   | VV_binop: vvbinop -> vbinop
   | VE_binop: vextbinop -> vbinop
   | V_narrow: vshape_i -> sx -> vbinop (* resulting width needs to be 2x original and at most i32 *)
+  | V_shuffle: list laneidx -> vbinop (* i8x16 only *)
+  | V_swizzle (* i8x16 only *)
   .
 
 (* Technically this is vvternop. But this is the only ternary operation. *)
@@ -543,6 +545,7 @@ Inductive vitestop : Set :=
 Inductive vtestop : Set :=
   | VI_testop: vitestop -> vtestop
   | VV_testop: vvtestop -> vtestop
+  | V_bitmask
   .
 
 Inductive virelop : Set :=
@@ -573,6 +576,42 @@ Inductive vshiftop : Set :=
   | VS_shl
   | VS_shr : sx -> vshiftop
   .
+*)
+  
+Inductive vshape_i: Set :=
+  | VSI_8_16
+  | VSI_16_8
+  | VSI_32_4
+  | VSI_64_2
+  .
+  
+Inductive vshape_f: Set :=
+  | VSF_32_4
+  | VSF_64_2
+  .
+  
+Inductive vshape : Set := (* shape *)
+  | VS_i: vshape_i -> vshape
+  | VS_f: vshape_f -> vshape
+  .
+
+Inductive vec_half : Set :=
+  | VH_low
+  | VH_high
+  .
+
+Definition laneidx : Set := u8. 
+
+Definition vunop := N.
+
+(* Shuffle is included in this type, which needs to encode an additional list of laneidx. *)
+Definition vbinop : Type := N * list N.
+
+Definition vternop := N.
+
+Definition vtestop := N.
+
+Definition vshiftop := N.
 
 (* Available shapes: 8x8, 16x4, 32x2 *)
 Definition vpacked_type : Type := N * N.
@@ -620,11 +659,11 @@ Extract lanes: consume a v128 operand and return the numeric value in a given la
 Replace lanes: consume a v128 operand and a numeric value for a given lane, and produce a v128 result.
 **)
   | BI_const_vec : value_vec -> basic_instruction
-  | BI_vunop: vshape -> vunop -> basic_instruction
-  | BI_vbinop: vshape -> vbinop -> basic_instruction
-  | BI_vternop: vshape -> vternop -> basic_instruction
-  | BI_vtestop: vshape -> vtestop -> basic_instruction
-  | BI_vshiftop: vshape -> vshiftop -> basic_instruction
+  | BI_vunop: vunop -> basic_instruction
+  | BI_vbinop: vbinop -> basic_instruction
+  | BI_vternop: vternop -> basic_instruction
+  | BI_vtestop: vtestop -> basic_instruction
+  | BI_vshiftop: vshiftop -> basic_instruction
   | BI_splat_vec: vshape -> basic_instruction
   | BI_extract_vec: vshape -> option sx -> laneidx -> basic_instruction (* sx only available for i8/i16 *)
   | BI_replace_vec: vshape -> laneidx -> basic_instruction
diff --git a/theories/interpreter_ctx.v b/theories/interpreter_ctx.v
index c1eae8a8..0932fb22 100644
--- a/theories/interpreter_ctx.v
+++ b/theories/interpreter_ctx.v
@@ -534,11 +534,11 @@ Proof.
       (* BI_relop t op *) t op |
       (* BI_cvtop t2 cvtop t1 sx *) t2 cvtop t1 sx |
       (* BI_const_vec _ *) |
-      (* BI_vunop sh op *) sh op |
-      (* BI_vbinop sh op *) sh op |
-      (* BI_vternop sh op *) sh op |
-      (* BI_vtestop sh op *) sh op |
-      (* BI_vshiftop sh op*) sh op |
+      (* BI_vunop op *) op |
+      (* BI_vbinop op *) op |
+      (* BI_vternop op *) op |
+      (* BI_vtestop op *) op |
+      (* BI_vshiftop op *) op |
       (* BI_splat_vec sh *) sh |
       (* BI_extract_vec shape_vec [Some sx | None] laneidx *) sh sx x |
       (* BI_replace_vec shape_vec laneidx *) sh x |
@@ -706,42 +706,42 @@ the condition that all values should live in the operand stack. *)
     (* AI_basic BI_const_vec v *)
     - apply RSC_invalid => /=; by move => [??].
       
-    (* AI_basic BI_vunop sh op *)
+    (* AI_basic BI_vunop op *)
     - destruct vs0 as [|v vs0]; first by no_args.
       assert_value_vec v.
-      apply <<hs, (s, (fc, lcs) :: ccs', (VAL_vec (app_vunop sh op v) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc, lcs) :: ccs', (VAL_vec (app_vunop op v) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
       by apply r_simple, rs_vunop.
       
-    (* AI_basic BI_vbinop sh op *)
+    (* AI_basic BI_vbinop op *)
     - destruct vs0 as [|v2 [|v1 vs0]]; try by no_args.
       assert_value_vec v1.
       assert_value_vec v2.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vbinop sh op v1 v2) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vbinop op v1 v2) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
       by apply r_simple, rs_vbinop.
       
-    (* AI_basic BI_vternop sh op *)
+    (* AI_basic BI_vternop op *)
     - destruct vs0 as [|v3 [|v2 [|v1 vs0]]]; try by no_args.
       assert_value_vec v1.
       assert_value_vec v2.
       assert_value_vec v3.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vternop sh op v1 v2 v3) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vternop op v1 v2 v3) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
       by apply r_simple, rs_vternop.
       
-    (* AI_basic BI_vtestop sh tv *)
+    (* AI_basic BI_vtestop tv *)
     - destruct vs0 as [|v vs0]; first by no_args.
       assert_value_vec v.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (VAL_int32 (wasm_bool (app_vtestop sh op v))) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (VAL_int32 (wasm_bool (app_vtestop op v))) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
       by apply r_simple, rs_vtestop.
       
-    (* AI_basic BI_vshiftop sh sv *)
+    (* AI_basic BI_vshiftop sv *)
     - destruct vs0 as [|v2 [|v1 vs0]]; try by no_args.
       assert_value_vec v1.
       assert_i32 v2.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vshiftop sh op v1 v2) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_vec (app_vshiftop op v1 v2) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
       by apply r_simple, rs_vshiftop.
       
diff --git a/theories/operations.v b/theories/operations.v
index 028cac78..fd6766ed 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -533,19 +533,19 @@ Definition app_relop (op: relop) (v1: value_num) (v2: value_num) :=
     end
   end.
 
-Definition app_vunop (sh: vshape) (op: vunop) (v1: value_vec) : value_vec :=
+Definition app_vunop (op: vunop) (v1: value_vec) : value_vec :=
   v1.
 
-Definition app_vbinop (sh: vshape) (op: vbinop) (v1 v2: value_vec) : value_vec :=
+Definition app_vbinop (op: vbinop) (v1 v2: value_vec) : value_vec :=
   v1.
 
-Definition app_vternop (sh: vshape) (op: vternop) (v1 v2 v3: value_vec) : value_vec :=
+Definition app_vternop (op: vternop) (v1 v2 v3: value_vec) : value_vec :=
   v1.
 
-Definition app_vtestop (sh: vshape) (op: vtestop) (v1: value_vec) : bool :=
+Definition app_vtestop (op: vtestop) (v1: value_vec) : bool :=
   true.
 
-Definition app_vshiftop (sh: vshape) (op: vshiftop) (v1: value_vec) (v2: i32) : value_vec :=
+Definition app_vshiftop (op: vshiftop) (v1: value_vec) (v2: i32) : value_vec :=
   v1.
 
 Definition app_splat_vec (sh: vshape) (v1: value_num) : value_vec :=
diff --git a/theories/opsem.v b/theories/opsem.v
index 8e45fe32..dcb35f24 100644
--- a/theories/opsem.v
+++ b/theories/opsem.v
@@ -60,20 +60,20 @@ Inductive reduce_simple : seq administrative_instruction -> seq administrative_i
 
   (** vector instructions **)
   | rs_vunop: 
-    forall v sh op,
-    reduce_simple [:: $VV v; AI_basic (BI_vunop sh op)] [::$VV (app_vunop sh op v)]
+    forall v op,
+    reduce_simple [:: $VV v; AI_basic (BI_vunop op)] [::$VV (app_vunop op v)]
   | rs_vbinop: 
-    forall v1 v2 sh op,
-    reduce_simple [:: $VV v1; $VV v2; AI_basic (BI_vbinop sh op)] [::$VV (app_vbinop sh op v1 v2)]
+    forall v1 v2 op,
+    reduce_simple [:: $VV v1; $VV v2; AI_basic (BI_vbinop op)] [::$VV (app_vbinop op v1 v2)]
   | rs_vternop: 
-    forall v1 v2 v3 sh op,
-    reduce_simple [:: $VV v1; $VV v2; $VV v3; AI_basic (BI_vternop sh op)] [::$VV (app_vternop sh op v1 v2 v3)]
+    forall v1 v2 v3 op,
+    reduce_simple [:: $VV v1; $VV v2; $VV v3; AI_basic (BI_vternop op)] [::$VV (app_vternop op v1 v2 v3)]
   | rs_vtestop: 
-    forall v1 sh op,
-    reduce_simple [:: $VV v1; AI_basic (BI_vtestop sh op)] [::$VN (VAL_int32 (wasm_bool (app_vtestop sh op v1)))]
+    forall v1 op,
+    reduce_simple [:: $VV v1; AI_basic (BI_vtestop op)] [::$VN (VAL_int32 (wasm_bool (app_vtestop op v1)))]
   | rs_vshiftop: 
-    forall v1 v2 sh op,
-    reduce_simple [:: $VV v1; $VN (VAL_int32 v2); AI_basic (BI_vshiftop sh op)] [::$VV app_vshiftop sh op v1 v2]
+    forall v1 v2 op,
+    reduce_simple [:: $VV v1; $VN (VAL_int32 v2); AI_basic (BI_vshiftop op)] [::$VV app_vshiftop op v1 v2]
   | rs_splat_vec: 
     forall v1 sh,
     reduce_simple [:: $VN v1; AI_basic (BI_splat_vec sh)] [::$VV (app_splat_vec sh v1)]
diff --git a/theories/pp.v b/theories/pp.v
index 1c696241..a7a5c655 100644
--- a/theories/pp.v
+++ b/theories/pp.v
@@ -519,42 +519,47 @@ Definition pp_cvtop (cvt: cvtop) : string :=
 (* placeholder for vector operations added in 2.0, to be filled in a future update
 https://webassembly.github.io/spec/core/binary/instructions.html#vector-instructions
 *)
-Definition pp_vunop (sh: vshape) (op: vunop) :=
-  "(not implemented)".
+Definition pp_vunop (op: vunop) :=
+  "vunop: " ++ pp_N op.
 
-Definition pp_vbinop (sh: vshape) (op: vbinop) :=
-  "(not implemented)".
+Definition pp_vbinop (op: vbinop) :=
+  let '(opcode, olanes) := op in
+  match olanes with
+  | nil => "vbinop: " ++ pp_N opcode
+  | _ => (* shuffle *)
+      "vbinop shuffle lanes: " ++ pp_list pp_N olanes
+  end.
 
-Definition pp_vternop (sh: vshape) (op: vternop) :=
-  "(not implemented)".
+Definition pp_vternop (op: vternop) :=
+  "vternop: " ++ pp_N op.
 
-Definition pp_vtestop (sh: vshape) (op: vtestop) :=
-  "(not implemented)".
+Definition pp_vtestop (op: vtestop) :=
+  "vtestop: " ++ pp_N op.
 
-Definition pp_vshiftop (sh: vshape) (op: vshiftop) :=
-  "(not implemented)".
+Definition pp_vshiftop (op: vshiftop) :=
+  "vshiftop: " ++ pp_N op.
 
 Definition pp_splat_vec (sh: vshape) :=
-  "(not implemented)".
+  "(not implemented: splat)".
 
 Definition pp_extract_vec (sh: vshape) (s: option sx) (x: laneidx) :=
-  "(not implemented)".
+  "(not implemented: extract_lane)".
 
 Definition pp_replace_vec (sh: vshape) (x: laneidx) :=
-  "(not implemented)".
+  "(not implemented: replace_lane)".
   
 Definition pp_load_vec (lvarg: load_vec_arg) (marg: memarg) :=
-  "(not implemented)".
+  "(not implemented: load_vec)".
 
 Definition pp_load_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
-  "(not implemented)".
+  "(not implemented: load_vec_lane)".
 
 (* store_vec_lane and load_vec uses the same args. Maybe it's better to find a new name *)
 Definition pp_store_vec (marg: memarg) :=
-  "(not implemented)".
+  "(not implemented: store_vec)".
 
 Definition pp_store_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
-  "(not implemented)".
+  "(not implemented: store_vec_lane)".
 
 Fixpoint pp_basic_instruction (i : indentation) (be : basic_instruction) : string :=
   let pp_basic_instructions bes i :=
@@ -674,16 +679,16 @@ Fixpoint pp_basic_instruction (i : indentation) (be : basic_instruction) : strin
       indent i (pp_number_type vt1 ++ "." ++ pp_cvtop cvtop ++ "_" ++ pp_number_type vt2 ++ pp_sx_o sxo ++ newline)
 
   (* vector instructions currently unimplemented *)
-  | BI_vunop sh op =>
-      indent i (pp_vunop sh op)
-  | BI_vbinop sh op =>
-      indent i (pp_vbinop sh op)
-  | BI_vternop sh op =>
-      indent i (pp_vternop sh op)
-  | BI_vtestop sh op =>
-      indent i (pp_vtestop sh op)
-  | BI_vshiftop sh op =>
-      indent i (pp_vshiftop sh op)
+  | BI_vunop op =>
+      indent i (pp_vunop op)
+  | BI_vbinop op =>
+      indent i (pp_vbinop op)
+  | BI_vternop op =>
+      indent i (pp_vternop op)
+  | BI_vtestop op =>
+      indent i (pp_vtestop op)
+  | BI_vshiftop op =>
+      indent i (pp_vshiftop op)
   | BI_splat_vec sh =>
       indent i (pp_splat_vec sh)
   | BI_extract_vec sh s lanex =>
diff --git a/theories/type_checker.v b/theories/type_checker.v
index 6f7e4445..11433b12 100644
--- a/theories/type_checker.v
+++ b/theories/type_checker.v
@@ -190,15 +190,15 @@ Fixpoint check_single (C : t_context) (ct : option checker_type) (be : basic_ins
           if cvtop_valid t2 op t1 sx
           then type_update ts [::(T_num t1)] [::(T_num t2)]
           else None
-      | BI_vunop sh op =>
+      | BI_vunop op =>
           type_update ts [::T_vec T_v128] [::T_vec T_v128]
-      | BI_vbinop sh op =>
+      | BI_vbinop op =>
           type_update ts [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128]
-      | BI_vternop sh op =>
+      | BI_vternop op =>
           type_update ts [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128]
-      | BI_vtestop sh tv =>
+      | BI_vtestop tv =>
           type_update ts [::T_vec T_v128] [::T_num T_i32]
-      | BI_vshiftop sh sv =>
+      | BI_vshiftop sv =>
           type_update ts [::T_num T_i32; T_vec T_v128] [::T_vec T_v128]
       | BI_splat_vec shape =>
           type_update ts [::T_num (typeof_shape_unpacked shape)] [::T_vec T_v128]
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index afcc4aba..620b6635 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -76,19 +76,19 @@ Proof.
     erewrite eval_cvt_type_preserve; eauto.
     by resolve_subtyping.
   - (* vunop *)
-    replace (typeof_vec (app_vunop sh op v)) with (typeof_vec v); last by destruct op, v.
+    replace (typeof_vec (app_vunop op v)) with (typeof_vec v); last by destruct op, v.
     rewrite H0.
     by resolve_subtyping.
   - (* vbinop *)
-    replace (typeof_vec (app_vbinop sh op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
+    replace (typeof_vec (app_vbinop op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
     rewrite H0.
     by resolve_subtyping.
   - (* vternop *)
-    replace (typeof_vec (app_vternop sh op v1 v2 v3)) with (typeof_vec v1); last by destruct op, v1, v2, v3.
+    replace (typeof_vec (app_vternop op v1 v2 v3)) with (typeof_vec v1); last by destruct op, v1, v2, v3.
     rewrite H0.
     by resolve_subtyping.
   - (* vshiftop *)
-    replace (typeof_vec (app_vshiftop sh op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
+    replace (typeof_vec (app_vshiftop op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
     rewrite H0.
     by resolve_subtyping.
   - (* Extract_vec *)
diff --git a/theories/typing.v b/theories/typing.v
index be93b431..c17e95f8 100644
--- a/theories/typing.v
+++ b/theories/typing.v
@@ -221,16 +221,16 @@ Inductive be_typing : t_context -> seq basic_instruction -> instr_type -> Prop :
 | bet_cvtop : forall C op t1 t2 sx,
     cvtop_valid t2 op t1 sx ->
     be_typing C [::BI_cvtop t2 op t1 sx] (Tf [::T_num t1] [::T_num t2])
-| bet_vunop: forall C sh op,
-    be_typing C [::BI_vunop sh op] (Tf [::T_vec T_v128] [::T_vec T_v128])
-| bet_vbinop: forall C sh op,
-    be_typing C [::BI_vbinop sh op] (Tf [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-| bet_vternop: forall C sh op,
-    be_typing C [::BI_vternop sh op] (Tf [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-| bet_vtestop: forall C sh op,
-    be_typing C [::BI_vtestop sh op] (Tf [::T_vec T_v128] [::T_num T_i32])
-| bet_vshiftop: forall C sh op,
-    be_typing C [::BI_vshiftop sh op] (Tf [::T_vec T_v128; T_num T_i32] [::T_vec T_v128])
+| bet_vunop: forall C op,
+    be_typing C [::BI_vunop op] (Tf [::T_vec T_v128] [::T_vec T_v128])
+| bet_vbinop: forall C op,
+    be_typing C [::BI_vbinop op] (Tf [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
+| bet_vternop: forall C op,
+    be_typing C [::BI_vternop op] (Tf [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
+| bet_vtestop: forall C op,
+    be_typing C [::BI_vtestop op] (Tf [::T_vec T_v128] [::T_num T_i32])
+| bet_vshiftop: forall C op,
+    be_typing C [::BI_vshiftop op] (Tf [::T_vec T_v128; T_num T_i32] [::T_vec T_v128])
 | bet_splat_vec: forall C sh,
     be_typing C [::BI_splat_vec sh] (Tf [::T_num (typeof_shape_unpacked sh)] [::T_vec T_v128])
 | bet_extract_vec: forall C sh sx x,
diff --git a/theories/typing_inversion.v b/theories/typing_inversion.v
index 8d81735c..54689771 100644
--- a/theories/typing_inversion.v
+++ b/theories/typing_inversion.v
@@ -96,15 +96,15 @@ Definition be_principal_typing (C: t_context) (be: basic_instruction) (tf: instr
   | BI_cvtop t2 op t1 sx =>
       tf = (Tf [::T_num t1] [::T_num t2]) /\
         cvtop_valid t2 op t1 sx
-  | BI_vunop sh op =>
+  | BI_vunop op =>
       tf = (Tf [::T_vec T_v128] [::T_vec T_v128])
-  | BI_vbinop sh op =>
+  | BI_vbinop op =>
       tf = (Tf [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-  | BI_vternop sh op =>
+  | BI_vternop op =>
       tf = (Tf [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
-  | BI_vtestop sh op =>
+  | BI_vtestop op =>
       tf = (Tf [::T_vec T_v128] [::T_num T_i32])
-  | BI_vshiftop sh op =>
+  | BI_vshiftop op =>
       tf = (Tf [::T_vec T_v128; T_num T_i32] [::T_vec T_v128])
   | BI_splat_vec shape =>
       tf = (Tf [::T_num (typeof_shape_unpacked shape)] [::T_vec T_v128])

From ca6d3c4aecf2498da7644adaed7c4b585ea1bc9d Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Tue, 12 Aug 2025 11:28:25 -0700
Subject: [PATCH 05/19] makefile changes for custom filters

---
 Makefile    | 12 +++++++---
 run_wast.sh | 67 ++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/Makefile b/Makefile
index d1b74caf..d8f25c8f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,13 @@
 default:
 	opam install .
 
+SHELL := bash
+
+SCRIPT := ./run_wast.sh
+
+FOLDER ?=
+FILTER ?=
+
+.PHONY: run_wast
 run_wast:
-	@FOLDER=$$(if [ "$(filter-out run_wast,$(MAKECMDGOALS))" ]; then echo "$(word 2, $(MAKECMDGOALS))"; else echo "./wast_testsuite"; fi);\
-	chmod +x run_wast.sh; \
-	./run_wast.sh "$$FOLDER"
\ No newline at end of file
+	$(SCRIPT) "$(FOLDER)" "$(FILTER)"
diff --git a/run_wast.sh b/run_wast.sh
index 5bad2e63..81522cfd 100755
--- a/run_wast.sh
+++ b/run_wast.sh
@@ -1,33 +1,52 @@
 #!/bin/bash
-set -e
+set -euo pipefail
 
-folder="${1:-./wast_testsuite}"  # Use argument, or fallback to default
+folder="${1:-./wast_testsuite}"  
+filter="${2:-}" 
+
+shopt -s nullglob
+
+if [[ -n "$filter" ]]; then
+  files=( "$folder"/*"$filter"*.wast )
+else
+  files=( "$folder"/*.wast )
+fi
+
+if (( ${#files[@]} == 0 )); then
+  echo "No matching .wast files in '$folder'${filter:+ containing '$filter'}."
+  exit 0
+fi
 
 total_passed=0
 total_tests=0
 
-for wastfile in "$folder"/*.wast; do
-	if [[ "$wastfile" == *simd* ]]; then
-		echo "Skipping SIMD test: $wastfile"
-		continue
-	fi
-	echo "Running: $wastfile"
-	tmpfile=$(mktemp)
-	cleaned=$(mktemp)
-	dune exec -- wasm_coq_interpreter --wast "$wastfile" | tee "$tmpfile"
-	cat "$tmpfile" | tr -d '\r' | sed 's/\x1b\[[0-9;]*m//g' > "$cleaned"
-	result_line=$(grep "Result: " "$cleaned")
-	if [[ "$result_line" =~ Result:\ ([0-9]+)/([0-9]+) ]]; then
-		passed="${BASH_REMATCH[1]}"
-		total="${BASH_REMATCH[2]}"
-		total_passed=$((total_passed + passed))
-		total_tests=$((total_tests + total))
-	else
-		echo "Regex match failed for $$wastfile"
-	fi
-	rm -f "$tmpfile" "$cleaned"
+for wastfile in "${files[@]}"; do
+  echo "Running: $wastfile"
+  tmpfile=$(mktemp)
+  cleaned=$(mktemp)
+
+  dune exec -- wasm_coq_interpreter --wast "$wastfile" | tee "$tmpfile"
+  tr -d '\r' < "$tmpfile" | sed 's/\x1b\[[0-9;]*m//g' > "$cleaned"
+
+  if result_line=$(grep -m1 "Result: " "$cleaned"); then
+    if [[ "$result_line" =~ Result:\ ([0-9]+)/([0-9]+) ]]; then
+      passed="${BASH_REMATCH[1]}"
+      total="${BASH_REMATCH[2]}"
+      total_passed=$((total_passed + passed))
+      total_tests=$((total_tests + total))
+    else
+      echo "Regex match failed for $wastfile"
+    fi
+  else
+    echo "No 'Result:' line found for $wastfile"
+  fi
+  rm -f "$tmpfile" "$cleaned"
 done
 
 echo "================"
-percentage=$(awk "BEGIN { printf \"%.2f\", ($total_passed / $total_tests) * 100}")
-echo "Total Passed: $total_passed/$total_tests ($percentage%)"
\ No newline at end of file
+if (( total_tests > 0 )); then
+  percentage=$(awk "BEGIN { printf \"%.2f\", ($total_passed / $total_tests) * 100 }")
+  echo "Total Passed: $total_passed/$total_tests ($percentage%)"
+else
+  echo "No tests run."
+fi

From 8b06c945c28895d910601965234efcbcfe1f5740 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Thu, 14 Aug 2025 06:42:06 -0700
Subject: [PATCH 06/19] set up parametric simd interface

---
 src/SIMD_ops.ml              |  8 +++++++
 src/SIMD_ops.mli             |  5 +++++
 src/convert.ml               | 33 ++++++-----------------------
 src/convert.mli              | 17 ++-------------
 src/dune                     |  2 +-
 src/execute.ml               |  2 +-
 src/extraction.v             | 16 +++++++++-----
 src/shim.ml                  |  4 ++--
 src/wast_execute.ml          |  2 +-
 theories/operations.v        | 27 ++++++++++++++++++------
 theories/simd.v              | 41 ++++++++++++++++++++++++++++++++++--
 theories/simd_execute.v      | 18 ++++++++++++++++
 theories/type_preservation.v |  6 +++---
 13 files changed, 117 insertions(+), 64 deletions(-)
 create mode 100644 src/SIMD_ops.ml
 create mode 100644 src/SIMD_ops.mli
 create mode 100644 theories/simd_execute.v

diff --git a/src/SIMD_ops.ml b/src/SIMD_ops.ml
new file mode 100644
index 00000000..f95046a7
--- /dev/null
+++ b/src/SIMD_ops.ml
@@ -0,0 +1,8 @@
+let app_vunop_str op v =
+  v
+
+let app_vbinop_str op v1 v2 = 
+  v1
+
+let app_vternop_str op v1 v2 v3 =
+  v1
\ No newline at end of file
diff --git a/src/SIMD_ops.mli b/src/SIMD_ops.mli
new file mode 100644
index 00000000..2047ae5d
--- /dev/null
+++ b/src/SIMD_ops.mli
@@ -0,0 +1,5 @@
+val app_vunop_str : Big_int_Z.big_int -> char list -> char list
+
+val app_vbinop_str : Big_int_Z.big_int * (Big_int_Z.big_int list) -> char list -> char list -> char list
+
+val app_vternop_str : Big_int_Z.big_int -> char list -> char list -> char list -> char list
\ No newline at end of file
diff --git a/src/convert.ml b/src/convert.ml
index 661e77df..7516c162 100644
--- a/src/convert.ml
+++ b/src/convert.ml
@@ -3,35 +3,14 @@ let rec to_nat = function
   | n when n > 0 -> Extract.S (to_nat (n - 1))
   | _ -> failwith "not a nat"
 
-let rec to_positive z =
-  if z < 1 then failwith "not a positive"
-  else if z = 1 then Extract.XH
-  else 
-    let pos' = to_positive (z / 2) in
-    let d = z mod 2 in
-    if d = 0 then
-      Extract.XO pos'
-    else 
-      Extract.XI pos'
-
-let to_n z =
-  if z = 0 then Extract.N0
-  else Npos (to_positive z)
-
 let rec from_nat = function
   | Extract.O -> 0
   | Extract.S n -> 1 + from_nat n
 
-let rec from_positive = function
-  | Extract.XH -> 1
-  | Extract.XO p -> 2 * from_positive p
-  | Extract.XI p -> 1 + 2 * from_positive p
-
-let from_z = function
-  | Extract.Z0 -> 0
-  | Extract.Zpos p -> from_positive p
-  | Extract.Zneg p -> - from_positive p
+let z_of_int x =
+  Big_int_Z.big_int_of_int x
 
-let from_n = function
-  | Extract.N0 -> 0
-  | Extract.Npos p -> from_positive p
\ No newline at end of file
+let int_of_z z =
+  if Big_int_Z.is_int_big_int z then 
+    Big_int_Z.int_of_big_int z
+else invalid_arg "int_of_z overflow"
\ No newline at end of file
diff --git a/src/convert.mli b/src/convert.mli
index dc105ec9..f91ba700 100644
--- a/src/convert.mli
+++ b/src/convert.mli
@@ -1,22 +1,9 @@
 (** Convert [int] to [Extract.nat]. **)
 val to_nat : int -> Extract.nat
 
-(** Convert [int] to [Extract.positive]. **)
-val to_positive : int -> Extract.positive
-
-(** Convert [int] to [Extract.n]. **)
-val to_n : int -> Extract.n
-
 (** Convert [Extract.nat] to [int]. **)
 val from_nat : Extract.nat -> int
 
-(** Convert [Extract.positive] to [int]. *)
-val from_positive : Extract.positive -> int
-
-(** Convert [Extract.z] to [int]. *)
-val from_z : Extract.z -> int
-
-(** Convert [Extract.n] to [int]. *)
-val from_n : Extract.n -> int
-
+val z_of_int: int -> Big_int_Z.big_int
 
+val int_of_z: Big_int_Z.big_int -> int
\ No newline at end of file
diff --git a/src/dune b/src/dune
index 7a9560f1..b58b0045 100644
--- a/src/dune
+++ b/src/dune
@@ -8,7 +8,7 @@
 
 (executable
   (name wasm_coq_interpreter)
-  (libraries cmdliner linenoise wasm unix parray)
+  (libraries cmdliner linenoise wasm unix parray zarith)
   (modes (best exe))
   (public_name wasm_coq_interpreter)
   (promote (into ../))
diff --git a/src/execute.ml b/src/execute.ml
index 5ad166e5..180c7fb3 100644
--- a/src/execute.ml
+++ b/src/execute.ml
@@ -63,7 +63,7 @@ let rec eval_interp_cfg verbosity gen max_call_depth cfg d =
       (fun _ -> pp_res_cfg_except_store cfg cfg_res);
     match cfg_res with
     | RSC_normal (_hs', cfg', d') ->
-      let d_int = Convert.from_n d' in
+      let d_int = Convert.int_of_z d' in
       if (d_int > max_call_depth) && (max_call_depth != -1) then begin
         debug_info verbosity stage ~style:red (fun _ -> "Call stack exhaustion\n");
         Cfg_exhaustion
diff --git a/src/extraction.v b/src/extraction.v
index 597807b8..d49c6636 100644
--- a/src/extraction.v
+++ b/src/extraction.v
@@ -2,6 +2,12 @@
 
 From Coq Require Extraction.
 From Coq Require PArray.
+From Coq Require Import
+  extraction.ExtrOcamlBasic
+  extraction.ExtrOcamlString
+  extraction.ExtrOcamlZBigInt
+  ExtrOCamlInt63
+.
 
 From Wasm Require Import
   efficient_extraction
@@ -13,14 +19,11 @@ From Wasm Require Import
   type_checker
   pp
   host
+  simd_execute
   extraction_instance
 .
 
-From Coq Require Import
-  extraction.ExtrOcamlBasic
-  extraction.ExtrOcamlString
-  ExtrOCamlInt63
-.
+Require Import compcert.lib.Integers.
 
 Extraction Language OCaml.
 
@@ -37,6 +40,9 @@ Extract Constant memory_vec.arr_set => "Parray.set".
 Extract Constant memory_vec.arr_length => "Parray.length".
 Extract Constant memory_vec.arr_copy => "Parray.copy".
 
+Extract Constant SIMD_ops.app_vunop_str => "SIMD_ops.app_vunop_str".
+Extract Constant SIMD_ops.app_vbinop_str => "SIMD_ops.app_vbinop_str".
+Extract Constant SIMD_ops.app_vternop_str => "SIMD_ops.app_vternop_str".
 
 Extraction "extract"
   EfficientExtraction
diff --git a/src/shim.ml b/src/shim.ml
index 320beb01..24e87c1e 100644
--- a/src/shim.ml
+++ b/src/shim.ml
@@ -131,7 +131,7 @@ functor (EH : Host) -> struct
 
   (** Run one step of the interpreter. *)
   let run_one_step cfg d = 
-    Extraction_instance.run_one_step cfg (Convert.to_n d)
+    Extraction_instance.run_one_step cfg (Convert.z_of_int d)
 
   let run_v_init = Extraction_instance.run_v_init
 
@@ -172,7 +172,7 @@ functor (EH : Host) -> struct
     
 (* Depth doesn't matter for pretty printing cfg *)
   let pp_res_cfg_except_store cfg res =
-    Utils.implode (Extraction_instance.pp_res_cfg_except_store cfg Extract.N0 res)
+    Utils.implode (Extraction_instance.pp_res_cfg_except_store cfg (Convert.z_of_int 0) res)
 
   let pp_es es =
     Utils.implode (Extraction_instance.pp_administrative_instructions O es)
diff --git a/src/wast_execute.ml b/src/wast_execute.ml
index b5946e22..2fd7048d 100644
--- a/src/wast_execute.ml
+++ b/src/wast_execute.ml
@@ -46,7 +46,7 @@ let wasm_val_to_string wval =
   match wval.it with
   | Num num -> Some (wasm_num_to_hexstring num)
   | Ref ref -> Some (wasm_ref_to_string ref)
-  | Vec vec -> 
+  | Vec _vec -> 
     (* Wasm ref interpreter always prints v128 as I32x4. *)
     let vec_string = "v128.const i32x4 " ^ string_of_value wval.it in
     (* Printf.printf "%s" vec_string; *)
diff --git a/theories/operations.v b/theories/operations.v
index fd6766ed..80205022 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -3,7 +3,7 @@
 
 From mathcomp Require Import ssreflect ssrfun ssrnat ssrbool eqtype seq.
 From compcert Require Floats.
-From Wasm Require Export common memory datatypes_properties list_extra.
+From Wasm Require Export common memory datatypes_properties list_extra simd_execute.
 From Coq Require Import BinNat.
 
 Set Implicit Arguments.
@@ -533,14 +533,27 @@ Definition app_relop (op: relop) (v1: value_num) (v2: value_num) :=
     end
   end.
 
-Definition app_vunop (op: vunop) (v1: value_vec) : value_vec :=
-  v1.
+Definition encode_vec (v: value_vec) : String.string :=
+  match v with
+  | VAL_vec128 vv => encode_v128 vv
+  end.
 
-Definition app_vbinop (op: vbinop) (v1 v2: value_vec) : value_vec :=
-  v1.
+Definition decode_vec (s: String.string) : value_vec :=
+  VAL_vec128 (decode_v128 s).
 
-Definition app_vternop (op: vternop) (v1 v2 v3: value_vec) : value_vec :=
-  v1.
+Definition app_vunop (op: vunop) (v1: value_vec) : value_vec :=
+  decode_vec (app_vunop_str op (encode_vec v1)).
+
+Definition app_vbinop (op: vbinop) (v1: value_vec) (v2: value_vec) : value_vec :=
+  let v1e := encode_vec v1 in
+  let v2e := encode_vec v2 in
+  decode_vec (app_vbinop_str op v1e v2e).
+
+Definition app_vternop (op: vternop) (v1: value_vec) (v2: value_vec) (v3: value_vec) : value_vec :=
+  let v1e := encode_vec v1 in
+  let v2e := encode_vec v2 in
+  let v3e := encode_vec v3 in
+  decode_vec (app_vternop_str op v1e v2e v3e).
 
 Definition app_vtestop (op: vtestop) (v1: value_vec) : bool :=
   true.
diff --git a/theories/simd.v b/theories/simd.v
index 42a027a0..36c0039a 100644
--- a/theories/simd.v
+++ b/theories/simd.v
@@ -1,5 +1,5 @@
 From mathcomp Require Import ssreflect ssrbool eqtype seq ssrnat.
-From Coq Require Import BinInt BinNat NArith Lia Uint63.
+From Coq Require Import BinInt BinNat NArith Lia Uint63 String.
 From Wasm Require Import numerics bytes memory common.
 
 Set Implicit Arguments.
@@ -12,6 +12,9 @@ Module Type SIMD_Type.
 
   Parameter v128_default: v128.
 
+  Parameter encode_v128 : v128 -> string.
+  Parameter decode_v128 : string -> v128.
+
 End SIMD_Type.
 
 Module SIMD <: SIMD_Type.
@@ -19,5 +22,39 @@ Module SIMD <: SIMD_Type.
   Definition v128 := bytes.
 
   Definition v128_default : v128 := nil.
-  
+
+Definition encode_v128 bs :=
+  let coq_bytes :=
+    List.map byte_of_compcert_byte bs in
+  string_of_list_byte coq_bytes.
+
+Definition decode_v128 s :=
+  let bs := list_byte_of_string s in
+    List.map compcert_byte_of_byte bs.
+
+Lemma decode_encode_v128: forall v,
+    decode_v128 (encode_v128 v) = v.
+Proof.
+  induction v => //=.
+  unfold decode_v128, encode_v128 in *.
+  rewrite list_byte_of_string_of_list_byte => /=.
+  rewrite list_byte_of_string_of_list_byte in IHv => /=.
+  rewrite IHv.
+  f_equal.
+  by rewrite compcert_byte_roundtrip.
+Qed.
+
+Lemma encode_decode_v128: forall s,
+    encode_v128 (decode_v128 s) = s.
+Proof.
+  induction s => //=.
+  unfold decode_v128, encode_v128 in * => /=.
+  rewrite coq_byte_roundtrip => /=.
+  cbn; f_equal.
+  - by rewrite Ascii.ascii_of_byte_of_ascii.
+  - by apply IHs. 
+Qed.
+
 End SIMD.
+
+Module Export simd_export := SIMD.
diff --git a/theories/simd_execute.v b/theories/simd_execute.v
new file mode 100644
index 00000000..4f9ca304
--- /dev/null
+++ b/theories/simd_execute.v
@@ -0,0 +1,18 @@
+From mathcomp Require Import ssreflect ssrbool eqtype seq ssrnat.
+From Wasm Require Import datatypes.
+From Coq Require Import String.
+
+Set Implicit Arguments.
+Unset Strict Implicit.
+Unset Printing Implicit Defensive.
+
+(* Cannot use Module Type + Declare Module because it doesn't extract *)
+Module SIMD_ops.
+  
+Parameter app_vunop_str : vunop -> string -> string.
+Parameter app_vbinop_str : vbinop -> string -> string -> string.
+Parameter app_vternop_str : vternop -> string -> string -> string -> string.
+  
+End SIMD_ops.
+
+Module Export simd_ops_export := SIMD_ops.
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index 620b6635..241d2191 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -76,15 +76,15 @@ Proof.
     erewrite eval_cvt_type_preserve; eauto.
     by resolve_subtyping.
   - (* vunop *)
-    replace (typeof_vec (app_vunop op v)) with (typeof_vec v); last by destruct op, v.
+    replace (typeof_vec (app_vunop op v)) with (typeof_vec v); last by destruct (app_vunop op v).
     rewrite H0.
     by resolve_subtyping.
   - (* vbinop *)
-    replace (typeof_vec (app_vbinop op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
+    replace (typeof_vec (app_vbinop op v1 v2)) with (typeof_vec v1); last by destruct (app_vbinop op v1 v2).
     rewrite H0.
     by resolve_subtyping.
   - (* vternop *)
-    replace (typeof_vec (app_vternop op v1 v2 v3)) with (typeof_vec v1); last by destruct op, v1, v2, v3.
+    replace (typeof_vec (app_vternop op v1 v2 v3)) with (typeof_vec v1); last by destruct (app_vternop op v1 v2 v3).
     rewrite H0.
     by resolve_subtyping.
   - (* vshiftop *)

From 905745efb2bfaced673111d3cb4694d1227bba9a Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Thu, 14 Aug 2025 08:12:59 -0700
Subject: [PATCH 07/19] execution framework; added some i32x4 operations and
 conversion for testing

---
 src/SIMD_ops.ml                 | 64 ++++++++++++++++++++++++++++++---
 src/SIMD_ops.mli                |  6 ++--
 src/convert.ml                  |  8 -----
 src/convert.mli                 |  4 ---
 src/execute.ml                  |  2 +-
 src/extraction.v                |  2 +-
 src/shim.ml                     | 27 +++++++-------
 src/utils.ml                    |  9 +++--
 src/utils.mli                   |  8 ++---
 theories/binary_format_parser.v |  4 +--
 theories/text_format_parser.v   |  4 +--
 11 files changed, 89 insertions(+), 49 deletions(-)

diff --git a/src/SIMD_ops.ml b/src/SIMD_ops.ml
index f95046a7..b39caec0 100644
--- a/src/SIMD_ops.ml
+++ b/src/SIMD_ops.ml
@@ -1,8 +1,64 @@
+open Wasm.V128
+
 let app_vunop_str op v =
-  v
+  let vw = of_bits v in
+  let wasm_f = 
+    match Utils.int_of_z op with
+    | 160 -> I32x4.abs
+    | 161 -> I32x4.neg
+    | 248 -> I32x4_convert.trunc_sat_f32x4_s
+    | 249 -> I32x4_convert.trunc_sat_f32x4_u
+    | 250 -> F32x4_convert.convert_i32x4_s
+    | 251 -> F32x4_convert.convert_i32x4_u
+    | 252 -> I32x4_convert.trunc_sat_f64x2_s_zero
+    | 253 -> I32x4_convert.trunc_sat_f64x2_u_zero
+    | 254 -> F64x2_convert.convert_i32x4_s (* low missing from the op name *)
+    | 255 -> F64x2_convert.convert_i32x4_u (* low missing from the op name *)
+    | 94 -> F32x4_convert.demote_f64x2_zero
+    | 95 -> F64x2_convert.promote_low_f32x4
+    | _ -> assert false
+  in
+  to_bits (wasm_f vw)
 
-let app_vbinop_str op v1 v2 = 
-  v1
+let app_vbinop_str op_args v1 v2 = 
+  let (op, _args) = op_args in
+  let v1w = of_bits v1 in
+  let v2w = of_bits v2 in
+  let wasm_f = 
+    match Utils.int_of_z op with
+    | 55 -> I32x4.eq
+    | 56 -> I32x4.ne
+    | 57 -> I32x4.lt_s
+    | 58 -> I32x4.lt_u
+    | 59 -> I32x4.gt_s
+    | 60 -> I32x4.gt_u
+    | 61 -> I32x4.le_s
+    | 62 -> I32x4.le_u
+    | 63 -> I32x4.ge_s
+    | 64 -> I32x4.ge_u
+    | 174 -> I32x4.add
+    | 177 -> I32x4.sub
+    | 181 -> I32x4.mul
+    | 182 -> I32x4.min_s
+    | 183 -> I32x4.min_u
+    | 184 -> I32x4.max_s
+    | 185 -> I32x4.max_u
+    | 188 -> I32x4_convert.extmul_low_s
+    | 189 -> I32x4_convert.extmul_high_s
+    | 190 -> I32x4_convert.extmul_low_u
+    | 191 -> I32x4_convert.extmul_high_u
+    | 206 -> I64x2.add
+    | _ -> assert false
+  in
+  to_bits (wasm_f v1w v2w)
 
 let app_vternop_str op v1 v2 v3 =
-  v1
\ No newline at end of file
+  let v1w = of_bits v1 in
+  let v2w = of_bits v2 in
+  let v3w = of_bits v3 in
+  let wasm_f = 
+    match Utils.int_of_z op with
+    | 82 -> V1x128.bitselect
+    | _ -> assert false
+  in
+  to_bits (wasm_f v1w v2w v3w)
\ No newline at end of file
diff --git a/src/SIMD_ops.mli b/src/SIMD_ops.mli
index 2047ae5d..c7aa3e6c 100644
--- a/src/SIMD_ops.mli
+++ b/src/SIMD_ops.mli
@@ -1,5 +1,5 @@
-val app_vunop_str : Big_int_Z.big_int -> char list -> char list
+val app_vunop_str : Big_int_Z.big_int -> string -> string
 
-val app_vbinop_str : Big_int_Z.big_int * (Big_int_Z.big_int list) -> char list -> char list -> char list
+val app_vbinop_str : Big_int_Z.big_int * (Big_int_Z.big_int list) -> string -> string -> string
 
-val app_vternop_str : Big_int_Z.big_int -> char list -> char list -> char list -> char list
\ No newline at end of file
+val app_vternop_str : Big_int_Z.big_int -> string -> string -> string -> string
\ No newline at end of file
diff --git a/src/convert.ml b/src/convert.ml
index 7516c162..a53a2b9a 100644
--- a/src/convert.ml
+++ b/src/convert.ml
@@ -6,11 +6,3 @@ let rec to_nat = function
 let rec from_nat = function
   | Extract.O -> 0
   | Extract.S n -> 1 + from_nat n
-
-let z_of_int x =
-  Big_int_Z.big_int_of_int x
-
-let int_of_z z =
-  if Big_int_Z.is_int_big_int z then 
-    Big_int_Z.int_of_big_int z
-else invalid_arg "int_of_z overflow"
\ No newline at end of file
diff --git a/src/convert.mli b/src/convert.mli
index f91ba700..87d4d0be 100644
--- a/src/convert.mli
+++ b/src/convert.mli
@@ -3,7 +3,3 @@ val to_nat : int -> Extract.nat
 
 (** Convert [Extract.nat] to [int]. **)
 val from_nat : Extract.nat -> int
-
-val z_of_int: int -> Big_int_Z.big_int
-
-val int_of_z: Big_int_Z.big_int -> int
\ No newline at end of file
diff --git a/src/execute.ml b/src/execute.ml
index 180c7fb3..673d6e1b 100644
--- a/src/execute.ml
+++ b/src/execute.ml
@@ -63,7 +63,7 @@ let rec eval_interp_cfg verbosity gen max_call_depth cfg d =
       (fun _ -> pp_res_cfg_except_store cfg cfg_res);
     match cfg_res with
     | RSC_normal (_hs', cfg', d') ->
-      let d_int = Convert.int_of_z d' in
+      let d_int = Utils.int_of_z d' in
       if (d_int > max_call_depth) && (max_call_depth != -1) then begin
         debug_info verbosity stage ~style:red (fun _ -> "Call stack exhaustion\n");
         Cfg_exhaustion
diff --git a/src/extraction.v b/src/extraction.v
index d49c6636..e100b53e 100644
--- a/src/extraction.v
+++ b/src/extraction.v
@@ -4,7 +4,7 @@ From Coq Require Extraction.
 From Coq Require PArray.
 From Coq Require Import
   extraction.ExtrOcamlBasic
-  extraction.ExtrOcamlString
+  extraction.ExtrOcamlNativeString
   extraction.ExtrOcamlZBigInt
   ExtrOCamlInt63
 .
diff --git a/src/shim.ml b/src/shim.ml
index 24e87c1e..00a79747 100644
--- a/src/shim.ml
+++ b/src/shim.ml
@@ -131,7 +131,7 @@ functor (EH : Host) -> struct
 
   (** Run one step of the interpreter. *)
   let run_one_step cfg d = 
-    Extraction_instance.run_one_step cfg (Convert.z_of_int d)
+    Extraction_instance.run_one_step cfg (Utils.z_of_int d)
 
   let run_v_init = Extraction_instance.run_v_init
 
@@ -145,40 +145,37 @@ functor (EH : Host) -> struct
 
   let interp_instantiate_wrapper s m extvals =
     let (res, msg) = Extraction_instance.interp_instantiate_wrapper s m extvals in
-    (res, Utils.implode msg)
+    (res, msg)
 
   let get_import_path m = 
-    let implode_pair p =
-      let (m, imp) = p in
-      (Utils.implode m, Utils.implode imp) in
-    List.map implode_pair (Extraction_instance.get_import_path m)
+    Extraction_instance.get_import_path m
 
   let get_exports f = 
     let exps = Extraction_instance.get_exports f in
-    List.map (fun exp -> let (n, v) = exp in (Utils.implode n, v)) exps
+    List.map (fun exp -> let (n, v) = exp in (n, v)) exps
 
-  let run_parse_module m = Extract.run_parse_module (Utils.explode m)
+  let run_parse_module m = Extract.run_parse_module m
 
-  let run_parse_arg a = Extract.run_parse_arg (Utils.explode a)
+  let run_parse_arg a = Extract.run_parse_arg a
 
   let pp_values l =
-    Utils.implode (Extraction_instance.pp_values l)
+    Extraction_instance.pp_values l
 
   let pp_store i st =
-    Utils.implode (Extraction_instance.pp_store (Convert.to_nat i) st)
+    Extraction_instance.pp_store (Convert.to_nat i) st
 
   let pp_cfg_tuple_ctx_except_store r =
-    Utils.implode (Extraction_instance.pp_cfg_tuple_ctx_except_store r)  
+    Extraction_instance.pp_cfg_tuple_ctx_except_store r
     
 (* Depth doesn't matter for pretty printing cfg *)
   let pp_res_cfg_except_store cfg res =
-    Utils.implode (Extraction_instance.pp_res_cfg_except_store cfg (Convert.z_of_int 0) res)
+    Extraction_instance.pp_res_cfg_except_store cfg (Utils.z_of_int 0) res
 
   let pp_es es =
-    Utils.implode (Extraction_instance.pp_administrative_instructions O es)
+    Extraction_instance.pp_administrative_instructions O es
 
   let pp_externval extval = 
-    Utils.implode (Extraction_instance.pp_extern_value extval)
+    Extraction_instance.pp_extern_value extval
 
   let is_canonical_nan =
     Extraction_instance.is_canonical_nan
diff --git a/src/utils.ml b/src/utils.ml
index 6ae1a4e2..cba4df0d 100644
--- a/src/utils.ml
+++ b/src/utils.ml
@@ -1,6 +1,9 @@
 let string_of_char c = String.make 1 c
 
-let explode s =
-  List.init (String.length s) (String.get s)
+let z_of_int x =
+  Big_int_Z.big_int_of_int x
 
-let implode l = String.concat "" (List.map string_of_char l)
+let int_of_z z =
+  if Big_int_Z.is_int_big_int z then 
+    Big_int_Z.int_of_big_int z
+else invalid_arg "int_of_z overflow"
\ No newline at end of file
diff --git a/src/utils.mli b/src/utils.mli
index 56dd28ea..4efb57ac 100644
--- a/src/utils.mli
+++ b/src/utils.mli
@@ -1,12 +1,8 @@
 (** Utility functions *)
 
-(** Explode a list into all its characters. *)
-val explode : string -> char list
-
-(** Inverse of [explode]. *)
-val implode : char list -> string
-
 (** Create a string with only one character. *)
 val string_of_char : char -> string
 
+val z_of_int: int -> Big_int_Z.big_int
 
+val int_of_z: Big_int_Z.big_int -> int
\ No newline at end of file
diff --git a/theories/binary_format_parser.v b/theories/binary_format_parser.v
index 86ed98dc..6792c180 100644
--- a/theories/binary_format_parser.v
+++ b/theories/binary_format_parser.v
@@ -1385,5 +1385,5 @@ Definition run_parse_expr (bs : list byte) : option (list basic_instruction) :=
 Definition run_parse_bes (bs : list byte) : option (list basic_instruction) :=
   run_parse_expr (bs ++ (x0b :: nil)).
 
-Definition run_parse_module (bs : list byte) : option module :=
-  run bs (fun n => parse_module).
+Definition run_parse_module (s : String.string) : option module :=
+  run (String.list_byte_of_string s) (fun n => parse_module).
diff --git a/theories/text_format_parser.v b/theories/text_format_parser.v
index bd9c53f3..005bd958 100644
--- a/theories/text_format_parser.v
+++ b/theories/text_format_parser.v
@@ -479,5 +479,5 @@ Definition run : list byte -> [ Parser (SizedList Tok) Tok M A ] -> option A :=
 
 End Run.
 
-Definition run_parse_arg (bs : list byte) : option datatypes.value :=
-  run bs (fun n => parse_arg).
+Definition run_parse_arg (s: String.string) : option datatypes.value :=
+  run (String.list_byte_of_string s) (fun n => parse_arg).

From 34e3313ebcd971772c0064eede035e286a3d3ffb Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Thu, 14 Aug 2025 09:22:27 -0700
Subject: [PATCH 08/19] add all unop/binop/ternop encodings

---
 src/SIMD_ops.ml                 | 148 ++++++++++++++++++++++++++++++++
 theories/binary_format_parser.v |   5 +-
 2 files changed, 151 insertions(+), 2 deletions(-)

diff --git a/src/SIMD_ops.ml b/src/SIMD_ops.ml
index b39caec0..84e6325e 100644
--- a/src/SIMD_ops.ml
+++ b/src/SIMD_ops.ml
@@ -4,8 +4,52 @@ let app_vunop_str op v =
   let vw = of_bits v in
   let wasm_f = 
     match Utils.int_of_z op with
+    | 77 -> V1x128.lognot
+    | 96 -> I8x16.abs
+    | 97 -> I8x16.neg
+    | 98 -> I8x16.popcnt
+    | 103 -> F32x4.ceil
+    | 104 -> F32x4.floor
+    | 105 -> F32x4.trunc
+    | 106 -> F32x4.nearest
+
+    | 116 -> F64x2.ceil
+    | 117 -> F64x2.floor
+    | 122 -> F64x2.trunc
+  
+    | 124 -> I16x8_convert.extadd_pairwise_s
+    | 125 -> I16x8_convert.extadd_pairwise_u
+
+    | 126 -> I32x4_convert.extadd_pairwise_s
+    | 127 -> I32x4_convert.extadd_pairwise_u
+
+    | 128 -> I16x8.abs
+    | 129 -> I16x8.neg
+    | 135 -> I16x8_convert.extend_low_s
+    | 136 -> I16x8_convert.extend_high_s
+    | 137 -> I16x8_convert.extend_low_u
+    | 138 -> I16x8_convert.extend_high_u
+
+    | 148 -> F64x2.nearest
+
     | 160 -> I32x4.abs
     | 161 -> I32x4.neg
+    | 167 -> I32x4_convert.extend_low_s
+    | 168 -> I32x4_convert.extend_high_s
+    | 169 -> I32x4_convert.extend_low_u
+    | 170 -> I32x4_convert.extend_high_u
+
+    | 192 -> I64x2.abs
+    | 193 -> I64x2.neg
+    | 199 -> I64x2_convert.extend_low_s
+    | 200 -> I64x2_convert.extend_high_s
+    | 201 -> I64x2_convert.extend_low_u
+    | 202 -> I64x2_convert.extend_high_u
+
+    | 224 -> F32x4.abs
+    | 225 -> F32x4.neg
+    | 227 -> F32x4.sqrt
+
     | 248 -> I32x4_convert.trunc_sat_f32x4_s
     | 249 -> I32x4_convert.trunc_sat_f32x4_u
     | 250 -> F32x4_convert.convert_i32x4_s
@@ -26,6 +70,27 @@ let app_vbinop_str op_args v1 v2 =
   let v2w = of_bits v2 in
   let wasm_f = 
     match Utils.int_of_z op with
+    | 14 -> V8x16.swizzle
+    | 35 -> I8x16.eq
+    | 36 -> I8x16.ne
+    | 37 -> I8x16.lt_s
+    | 38 -> I8x16.lt_u
+    | 39 -> I8x16.gt_s
+    | 40 -> I8x16.gt_u
+    | 41 -> I8x16.le_s
+    | 42 -> I8x16.le_u
+    | 43 -> I8x16.ge_s
+    | 44 -> I8x16.ge_u
+    | 45 -> I16x8.eq
+    | 46 -> I16x8.ne
+    | 47 -> I16x8.lt_s
+    | 48 -> I16x8.lt_u
+    | 49 -> I16x8.gt_s
+    | 50 -> I16x8.gt_u
+    | 51 -> I16x8.le_s
+    | 52 -> I16x8.le_u
+    | 53 -> I16x8.ge_s
+    | 54 -> I16x8.ge_u
     | 55 -> I32x4.eq
     | 56 -> I32x4.ne
     | 57 -> I32x4.lt_s
@@ -36,6 +101,54 @@ let app_vbinop_str op_args v1 v2 =
     | 62 -> I32x4.le_u
     | 63 -> I32x4.ge_s
     | 64 -> I32x4.ge_u
+    | 65 -> F32x4.eq
+    | 66 -> F32x4.ne
+    | 67 -> F32x4.lt
+    | 68 -> F32x4.gt
+    | 69 -> F32x4.le
+    | 70 -> F32x4.ge
+    | 71 -> F64x2.eq
+    | 72 -> F64x2.ne
+    | 73 -> F64x2.lt
+    | 74 -> F64x2.gt
+    | 75 -> F64x2.le
+    | 76 -> F64x2.ge
+    | 78 -> V1x128.and_
+    | 79 -> V1x128.andnot
+    | 80 -> V1x128.or_
+    | 81 -> V1x128.xor
+    | 101 -> I8x16_convert.narrow_s
+    | 102 -> I8x16_convert.narrow_u
+    | 110 -> I8x16.add
+    | 111 -> I8x16.add_sat_s
+    | 112 -> I8x16.add_sat_u
+    | 113 -> I8x16.sub
+    | 114 -> I8x16.sub_sat_u
+    | 115 -> I8x16.sub_sat_u
+    | 118 -> I8x16.min_s
+    | 119 -> I8x16.min_u
+    | 120 -> I8x16.max_s
+    | 121 -> I8x16.max_u
+    | 123 -> I8x16.avgr_u
+    | 130 -> I16x8.q15mulr_sat_s
+    | 133 -> I16x8_convert.narrow_s
+    | 134 -> I16x8_convert.narrow_u
+    | 142 -> I16x8.add
+    | 143 -> I16x8.add_sat_s
+    | 144 -> I16x8.add_sat_u
+    | 145 -> I16x8.sub
+    | 146 -> I16x8.sub_sat_u
+    | 147 -> I16x8.sub_sat_u
+    | 149 -> I16x8.mul
+    | 150 -> I16x8.min_s
+    | 151 -> I16x8.min_u
+    | 152 -> I16x8.max_s
+    | 153 -> I16x8.max_u
+    | 155 -> I16x8.avgr_u
+    | 156 -> I16x8_convert.extmul_low_s
+    | 157 -> I16x8_convert.extmul_high_s
+    | 158 -> I16x8_convert.extmul_low_u
+    | 159 -> I16x8_convert.extmul_high_u
     | 174 -> I32x4.add
     | 177 -> I32x4.sub
     | 181 -> I32x4.mul
@@ -43,11 +156,46 @@ let app_vbinop_str op_args v1 v2 =
     | 183 -> I32x4.min_u
     | 184 -> I32x4.max_s
     | 185 -> I32x4.max_u
+    | 186 -> I32x4_convert.dot_s
     | 188 -> I32x4_convert.extmul_low_s
     | 189 -> I32x4_convert.extmul_high_s
     | 190 -> I32x4_convert.extmul_low_u
     | 191 -> I32x4_convert.extmul_high_u
+
     | 206 -> I64x2.add
+    | 209 -> I64x2.sub
+    | 213 -> I64x2.mul
+
+    | 214 -> I64x2.eq
+    | 215 -> I64x2.ne
+    | 216 -> I64x2.lt_s
+    | 217 -> I64x2.gt_s
+    | 218 -> I64x2.le_s
+    | 219 -> I64x2.ge_s
+
+    | 220 -> I64x2_convert.extmul_low_s
+    | 221 -> I64x2_convert.extmul_high_s
+    | 222 -> I64x2_convert.extmul_low_u
+    | 223 -> I64x2_convert.extmul_high_u
+
+    | 228 -> F32x4.add
+    | 229 -> F32x4.sub
+    | 230 -> F32x4.mul
+    | 231 -> F32x4.div
+    | 232 -> F32x4.min
+    | 233 -> F32x4.max
+    | 234 -> F32x4.pmin
+    | 235 -> F32x4.pmax
+
+    | 240 -> F64x2.add
+    | 241 -> F64x2.sub
+    | 242 -> F64x2.mul
+    | 243 -> F64x2.div
+    | 244 -> F64x2.min
+    | 245 -> F64x2.max
+    | 246 -> F64x2.pmin
+    | 247 -> F64x2.pmax
+
     | _ -> assert false
   in
   to_bits (wasm_f v1w v2w)
diff --git a/theories/binary_format_parser.v b/theories/binary_format_parser.v
index 6792c180..3f9317de 100644
--- a/theories/binary_format_parser.v
+++ b/theories/binary_format_parser.v
@@ -564,9 +564,11 @@ Definition opcode_is_unop (n: N) : bool :=
     (n == 97) ||
     (n == 98) ||
     (* i16x8 *)
+    ((n >= 124) && (n <= 125)) ||
     (n == 128) ||
     (n == 129) ||
     (* i32x2 *)
+    ((n >= 126) && (n <= 127)) ||
     (n == 160) ||
     (n == 161) ||
     (* i64x2 *)
@@ -600,12 +602,11 @@ Definition opcode_is_binop (n: N) : bool :=
     ((n >= 101) && (n <= 102)) ||
     ((n >= 110) && (n <= 115)) ||
     ((n >= 118) && (n <= 121)) || (n == 123) ||
-    ((n >= 124) && (n <= 125)) || (n == 130) ||
+    (n == 130) ||
     ((n >= 133) && (n <= 134)) ||
     ((n >= 142) && (n <= 147)) ||
     ((n >= 149) && (n <= 153)) ||
     ((n >= 155) && (n <= 159)) ||
-    ((n >= 126) && (n <= 127)) ||
     (n == 174) ||
     (n == 177) ||
     ((n >= 181) && (n <= 186)) ||

From e5d1eefe81489081d149cd87af5f9dc765a7fbe7 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Thu, 14 Aug 2025 12:59:42 -0700
Subject: [PATCH 09/19] use parameterised testop and shiftops, and added the
 corresponding ocaml linking

---
 changelogs/v2.2.0.md            |   8 ++
 src/SIMD_ops.ml                 |  71 ++++++++++-
 src/SIMD_ops.mli                |   6 +-
 src/extraction.v                |   4 +-
 src/parse.ml                    |   2 +-
 src/shim.ml                     |   4 +-
 src/shim.mli                    |   2 +-
 theories/binary_format_parser.v |  11 +-
 theories/interpreter_ctx.v      |   2 +-
 theories/operations.v           | 211 +++++++++++++++++---------------
 theories/opsem.v                |   2 +-
 theories/simd.v                 |  13 ++
 theories/simd_execute.v         |   5 +-
 theories/type_preservation.v    |  16 ---
 14 files changed, 227 insertions(+), 130 deletions(-)
 create mode 100644 changelogs/v2.2.0.md

diff --git a/changelogs/v2.2.0.md b/changelogs/v2.2.0.md
new file mode 100644
index 00000000..d5f75e0c
--- /dev/null
+++ b/changelogs/v2.2.0.md
@@ -0,0 +1,8 @@
+# Release 2.2.0
+
+## New Features
+- Implemented SIMD execution via eval functions in the reference implementation.
+
+## Refactorings
+- Extraction now extracts Coq's `String.string` to OCaml native `string` instead of `char list`. The signature of various interfaces are updated accordingly.
+- Extraction now extracts Coq's binary integers (N/Z/positive) to ZArith's BigInt instead. The original conversion functions are therefore deprecated. New conversion functions between ZArith's BigInt and OCaml's native int are added. This is to avoid cyclic dependencies in the parametric SIMD implementations.
\ No newline at end of file
diff --git a/src/SIMD_ops.ml b/src/SIMD_ops.ml
index 84e6325e..c9405ceb 100644
--- a/src/SIMD_ops.ml
+++ b/src/SIMD_ops.ml
@@ -1,3 +1,5 @@
+(* Invoke the reference eval functions corresponding to the binary encodings (from parsing). *)
+
 open Wasm.V128
 
 let app_vunop_str op v =
@@ -209,4 +211,71 @@ let app_vternop_str op v1 v2 v3 =
     | 82 -> V1x128.bitselect
     | _ -> assert false
   in
-  to_bits (wasm_f v1w v2w v3w)
\ No newline at end of file
+  to_bits (wasm_f v1w v2w v3w)
+
+let encode_bool b = 
+  if b then "\x01" else "\x00"
+
+let decode_int32 s = 
+  if String.length s < 4 then invalid_arg "int32_of_le_string: need at least 4 bytes";
+  let b0 = Char.code s.[0] in
+  let b1 = Char.code s.[1] in
+  let b2 = Char.code s.[2] in
+  let b3 = Char.code s.[3] in
+  let open Int32 in
+  logor (of_int b0)
+    (logor (shift_left (of_int b1) 8)
+      (logor (shift_left (of_int b2) 16)
+             (shift_left (of_int b3) 24)))
+
+let encode_int32 x =
+  let open Int32 in
+  String.init 4 (function
+    | 0 -> Char.chr (to_int (logand x 0xFFl))
+    | 1 -> Char.chr (to_int (logand (shift_right x 8) 0xFFl))
+    | 2 -> Char.chr (to_int (logand (shift_right x 16) 0xFFl))
+    | 3 -> Char.chr (to_int (logand (shift_right x 24) 0xFFl))
+    | _ -> assert false)
+
+let app_vtestop_str op v1 =
+  let v1w = of_bits v1 in
+  let op_i = Utils.int_of_z op in
+  match op_i with
+  | 83 -> encode_bool (I8x16.any_true v1w)
+
+  | 99 -> encode_bool (I8x16.all_true v1w)
+  | 131 -> encode_bool (I16x8.all_true v1w)
+  | 163 -> encode_bool (I32x4.all_true v1w)
+  | 195 -> encode_bool (I64x2.all_true v1w)
+
+  | 100 -> encode_int32 (I8x16.bitmask v1w)
+  | 132 -> encode_int32 (I16x8.bitmask v1w)
+  | 164 -> encode_int32 (I32x4.bitmask v1w)
+  | 196 -> encode_int32 (I64x2.bitmask v1w)
+
+  | _ -> assert false
+
+let app_vshiftop_str op v1 v2 =
+  let v1w = of_bits v1 in
+  let v2w = decode_int32 v2 in
+  let wasm_f = 
+    match Utils.int_of_z op with
+    | 107 -> I8x16.shl
+    | 108 -> I8x16.shr_s
+    | 109 -> I8x16.shr_u
+
+    | 139 -> I16x8.shl
+    | 140 -> I16x8.shr_s
+    | 141 -> I16x8.shr_u
+
+    | 171 -> I32x4.shl
+    | 172 -> I32x4.shr_s
+    | 173 -> I32x4.shr_u
+
+    | 203 -> I64x2.shl
+    | 204 -> I64x2.shr_s
+    | 205 -> I64x2.shr_u
+
+    | _ -> assert false
+  in
+  to_bits (wasm_f v1w v2w)
\ No newline at end of file
diff --git a/src/SIMD_ops.mli b/src/SIMD_ops.mli
index c7aa3e6c..6fbd166a 100644
--- a/src/SIMD_ops.mli
+++ b/src/SIMD_ops.mli
@@ -2,4 +2,8 @@ val app_vunop_str : Big_int_Z.big_int -> string -> string
 
 val app_vbinop_str : Big_int_Z.big_int * (Big_int_Z.big_int list) -> string -> string -> string
 
-val app_vternop_str : Big_int_Z.big_int -> string -> string -> string -> string
\ No newline at end of file
+val app_vternop_str : Big_int_Z.big_int -> string -> string -> string -> string
+
+val app_vtestop_str : Big_int_Z.big_int -> string -> string
+
+val app_vshiftop_str : Big_int_Z.big_int -> string -> string -> string
\ No newline at end of file
diff --git a/src/extraction.v b/src/extraction.v
index e100b53e..b8d7eeaf 100644
--- a/src/extraction.v
+++ b/src/extraction.v
@@ -43,10 +43,12 @@ Extract Constant memory_vec.arr_copy => "Parray.copy".
 Extract Constant SIMD_ops.app_vunop_str => "SIMD_ops.app_vunop_str".
 Extract Constant SIMD_ops.app_vbinop_str => "SIMD_ops.app_vbinop_str".
 Extract Constant SIMD_ops.app_vternop_str => "SIMD_ops.app_vternop_str".
+Extract Constant SIMD_ops.app_vtestop_str => "SIMD_ops.app_vtestop_str".
+Extract Constant SIMD_ops.app_vshiftop_str => "SIMD_ops.app_vshiftop_str".
 
 Extraction "extract"
   EfficientExtraction
-  run_parse_module
+  run_parse_module_str
   run_parse_arg
   Extraction_instance
   .
diff --git a/src/parse.ml b/src/parse.ml
index 6308fc1e..faf986a9 100644
--- a/src/parse.ml
+++ b/src/parse.ml
@@ -23,7 +23,7 @@ let binary_of_text textstr =
   | _ -> None
 
 let parse_binary_module bin_module = 
-  match Execute.Interpreter.run_parse_module bin_module with
+  match Execute.Interpreter.run_parse_module_str bin_module with
   | None -> Error "error in parsing module"
   | Some m -> OK m
 
diff --git a/src/shim.ml b/src/shim.ml
index 00a79747..ead6529e 100644
--- a/src/shim.ml
+++ b/src/shim.ml
@@ -73,7 +73,7 @@ module type InterpreterType = sig
   val get_import_path: Extract.module0 -> (string * string) list
   val get_exports : frame -> (string * externval) list
 
-  val run_parse_module : string -> Extract.module0 option
+  val run_parse_module_str : string -> Extract.module0 option
   val run_parse_arg : string -> value option
 
   val pp_values : value list -> string
@@ -154,7 +154,7 @@ functor (EH : Host) -> struct
     let exps = Extraction_instance.get_exports f in
     List.map (fun exp -> let (n, v) = exp in (n, v)) exps
 
-  let run_parse_module m = Extract.run_parse_module m
+  let run_parse_module_str m = Extract.run_parse_module_str m
 
   let run_parse_arg a = Extract.run_parse_arg a
 
diff --git a/src/shim.mli b/src/shim.mli
index 32ede3d9..6b6b2651 100644
--- a/src/shim.mli
+++ b/src/shim.mli
@@ -80,7 +80,7 @@ module type InterpreterType = sig
   val get_exports : frame -> (string * externval) list
 
   (** Parsing. *)
-  val run_parse_module : string -> Extract.module0 option
+  val run_parse_module_str : string -> Extract.module0 option
 
   val run_parse_arg : string -> Extract.value0 option
 
diff --git a/theories/binary_format_parser.v b/theories/binary_format_parser.v
index 3f9317de..bf5eb55c 100644
--- a/theories/binary_format_parser.v
+++ b/theories/binary_format_parser.v
@@ -582,7 +582,7 @@ Definition opcode_is_unop (n: N) : bool :=
     (* f64x2 *)
     ((n >= 116) && (n <= 117)) ||
     (n == 122) ||
-    (n == 248) ||
+    (n == 148) ||
     (n == 236) ||
     (n == 237) ||
     (n == 239) ||
@@ -623,7 +623,9 @@ Definition opcode_is_ternop (n: N) : bool :=
   (n == 82).
 
 Definition opcode_is_testop (n: N) : bool :=
+  (* all_true *)
   (n == 83) ||
+  (* any_true / bitmask *)
     (n == 99) || (n == 100) ||
     (n == 131) || (n == 132) ||
     (n == 163) || (n == 164) ||
@@ -1386,5 +1388,8 @@ Definition run_parse_expr (bs : list byte) : option (list basic_instruction) :=
 Definition run_parse_bes (bs : list byte) : option (list basic_instruction) :=
   run_parse_expr (bs ++ (x0b :: nil)).
 
-Definition run_parse_module (s : String.string) : option module :=
-  run (String.list_byte_of_string s) (fun n => parse_module).
+Definition run_parse_module (bs: list byte) : option module :=
+  run bs (fun n => parse_module).
+
+Definition run_parse_module_str (s : String.string) : option module :=
+  run_parse_module (String.list_byte_of_string s).
diff --git a/theories/interpreter_ctx.v b/theories/interpreter_ctx.v
index 0932fb22..c2fd4861 100644
--- a/theories/interpreter_ctx.v
+++ b/theories/interpreter_ctx.v
@@ -733,7 +733,7 @@ the condition that all values should live in the operand stack. *)
     (* AI_basic BI_vtestop tv *)
     - destruct vs0 as [|v vs0]; first by no_args.
       assert_value_vec v.
-      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (VAL_int32 (wasm_bool (app_vtestop op v))) :: vs0, es0), None), d>> => //.
+      apply <<hs, (s, (fc,lcs) :: ccs', (VAL_num (app_vtestop op v) :: vs0, es0), None), d>> => //.
       resolve_reduce_ctx vs0 es0.
       by apply r_simple, rs_vtestop.
       
diff --git a/theories/operations.v b/theories/operations.v
index 80205022..b3a7c2dc 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -541,45 +541,21 @@ Definition encode_vec (v: value_vec) : String.string :=
 Definition decode_vec (s: String.string) : value_vec :=
   VAL_vec128 (decode_v128 s).
 
-Definition app_vunop (op: vunop) (v1: value_vec) : value_vec :=
-  decode_vec (app_vunop_str op (encode_vec v1)).
-
-Definition app_vbinop (op: vbinop) (v1: value_vec) (v2: value_vec) : value_vec :=
-  let v1e := encode_vec v1 in
-  let v2e := encode_vec v2 in
-  decode_vec (app_vbinop_str op v1e v2e).
-
-Definition app_vternop (op: vternop) (v1: value_vec) (v2: value_vec) (v3: value_vec) : value_vec :=
-  let v1e := encode_vec v1 in
-  let v2e := encode_vec v2 in
-  let v3e := encode_vec v3 in
-  decode_vec (app_vternop_str op v1e v2e v3e).
-
-Definition app_vtestop (op: vtestop) (v1: value_vec) : bool :=
-  true.
-
-Definition app_vshiftop (op: vshiftop) (v1: value_vec) (v2: i32) : value_vec :=
-  v1.
+Definition encode_i32 (v: i32) : String.string :=
+  let bs := List.map byte_of_compcert_byte (serialise_i32 v) in
+  String.string_of_list_byte bs.
 
-Definition app_splat_vec (sh: vshape) (v1: value_num) : value_vec :=
-  VAL_vec128 SIMD.v128_default.
+Definition decode_i32 (s: String.string) : i32 :=
+  let bs := List.map compcert_byte_of_byte (String.list_byte_of_string s) in
+  Wasm_int.Int32.repr (Memdata.decode_int bs).
 
-Definition app_extract_vec (sh: vshape) (s: option sx) (n: laneidx) (v1: value_vec) : value_num :=
-  match sh with
-  | VS_i vsi =>
-      match vsi with
-      | VSI_64_2 => bitzero T_i64
-      | _ => bitzero T_i32
-      end
-  | VS_f vsf =>
-      match vsf with
-      | VSF_32_4 => bitzero T_f32
-      | VSF_64_2 => bitzero T_f64
-      end
-  end.
-
-Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
-  v1.
+Definition unpacked (sh: vshape) : number_type :=
+    match sh with
+    | VS_i VSI_64_2 => T_i64
+    | VS_i _ => T_i32
+    | VS_f VSF_64_2 => T_f64
+    | VS_f VSF_32_4 => T_f32
+    end.
 
 Definition shape_width (sh: vshape) : N :=
   match sh with
@@ -620,6 +596,103 @@ Proof.
   by destruct sh as [[] | []].
 Qed.
 
+(* Extract values from a v128 *)
+Definition v128_extract_lanes_n (sh: vshape) (v: SIMD.v128) : list N :=
+  let byte_width := N.div (shape_width sh) 8%N in
+  let dim := shape_dim sh in
+  let v128_bytes_grouped := v128_extract_bytes byte_width dim v in
+  let vt := unpacked sh in
+  map (fun bs =>
+         BinInt.Z.to_N (common.Memdata.decode_int bs)) v128_bytes_grouped.
+
+(* Extract values from a v128 *)
+Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) : list value_num :=
+  let byte_width := N.div (shape_width sh) 8%N in
+  let dim := shape_dim sh in
+  let v128_bytes_grouped := v128_extract_bytes byte_width dim v in
+  let vt := unpacked sh in
+  match sh with
+  | VS_f _ =>
+      map (fun bs => wasm_deserialise bs vt) v128_bytes_grouped
+  | VS_i VSI_64_2 =>
+      map (fun bs =>
+             let val_n := BinInt.Z.to_N (common.Memdata.decode_int bs) in
+             let val_z := sign_extend_n val_n byte_width in
+             VAL_int64 (Wasm_int.Int64.repr val_z)) v128_bytes_grouped
+  (* All the other shapes extract to Int32. *)
+  | VS_i _ =>
+      map (fun bs =>
+             let val_n := BinInt.Z.to_N (common.Memdata.decode_int bs) in
+             let val_z := sign_extend_n val_n byte_width in
+             VAL_int32 (Wasm_int.Int32.repr val_z)) v128_bytes_grouped
+  end.
+
+Definition v128_serialise (sh: vshape) (vs: list value_num) : SIMD.v128 :=
+  let value_bytes := map bits vs in
+  match sh with
+  | VS_f _
+  | VS_i VSI_64_2
+  | VS_i VSI_32_4 =>
+      List.concat (map bits vs)
+  (* i16x8 and i8x16 needs truncating bytes before merging. *)
+  | VS_i VSI_16_8 =>
+      List.concat (map (fun v => match v with
+                              | VAL_int32 vi => serialise_i16 vi
+                              | _ => nil
+                              end
+                     ) vs)
+  | VS_i VSI_8_16 =>
+      List.concat (map (fun v => match v with
+                              | VAL_int32 vi => serialise_i8 vi
+                              | _ => nil
+                              end
+                     ) vs)
+  end.
+
+Definition app_vunop (op: vunop) (v1: value_vec) : value_vec :=
+  decode_vec (app_vunop_str op (encode_vec v1)).
+
+Definition app_vbinop (op: vbinop) (v1: value_vec) (v2: value_vec) : value_vec :=
+  let v1e := encode_vec v1 in
+  let v2e := encode_vec v2 in
+  decode_vec (app_vbinop_str op v1e v2e).
+
+Definition app_vternop (op: vternop) (v1: value_vec) (v2: value_vec) (v3: value_vec) : value_vec :=
+  let v1e := encode_vec v1 in
+  let v2e := encode_vec v2 in
+  let v3e := encode_vec v3 in
+  decode_vec (app_vternop_str op v1e v2e v3e).
+
+Definition app_vtestop (op: vtestop) (v1: value_vec) : value_num :=
+  let v1e := encode_vec v1 in
+  let bs := decode_v128 (app_vtestop_str op v1e) in
+  wasm_deserialise bs T_i32.
+
+Definition app_vshiftop (op: vshiftop) (v1: value_vec) (v2: i32) : value_vec :=
+  let v1e := encode_vec v1 in
+  let v2e := encode_i32 v2 in
+  decode_vec (app_vshiftop_str op v1e v2e).
+
+Definition app_splat_vec (sh: vshape) (v1: value_num) : value_vec :=
+  VAL_vec128 SIMD.v128_default.
+
+Definition app_extract_vec (sh: vshape) (s: option sx) (n: laneidx) (v1: value_vec) : value_num :=
+  match sh with
+  | VS_i vsi =>
+      match vsi with
+      | VSI_64_2 => bitzero T_i64
+      | _ => bitzero T_i32
+      end
+  | VS_f vsf =>
+      match vsf with
+      | VSF_32_4 => bitzero T_f32
+      | VSF_64_2 => bitzero T_f64
+      end
+  end.
+
+Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
+  v1.
+
 
 Definition rglob_is_mut (g : module_global) : bool :=
   g.(modglob_type).(tg_mut) == MUT_var.
@@ -1368,70 +1441,6 @@ Definition cvtop_valid (t2: number_type) (op: cvtop) (t1: number_type) (s: optio
   ((op == CVO_promote) && (t2 == T_f64) && (t1 == T_f32) && (s == None)) ||
   ((op == CVO_reinterpret) && ((is_int_t t2 && is_float_t t1) || (is_float_t t2 && is_int_t t1)) && (s == None)).
 
-Program Fixpoint v128_extract_bytes (width dim: N) (v: SIMD.v128) {measure (N.to_nat dim)}: list bytes :=
-  match dim with
-  | 0%N => nil
-  | _ =>
-      let bytes := take width v in
-      let bytes_remaining := drop width v in
-      cons bytes (v128_extract_bytes width (N.pred dim) bytes_remaining)
-  end.
-Next Obligation.
-  by lias.
-Qed.
-
-Definition unpacked (sh: vshape) : number_type :=
-    match sh with
-    | VS_i VSI_64_2 => T_i64
-    | VS_i _ => T_i32
-    | VS_f VSF_64_2 => T_f64
-    | VS_f VSF_32_4 => T_f32
-    end.
-
-(* Extract values from a v128 *)
-Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) : list value_num :=
-  let byte_width := N.div (shape_width sh) 8%N in
-  let dim := shape_dim sh in
-  let v128_bytes_grouped := v128_extract_bytes byte_width dim v in
-  let vt := unpacked sh in
-  match sh with
-  | VS_f _ =>
-      map (fun bs => wasm_deserialise bs vt) v128_bytes_grouped
-  | VS_i VSI_64_2 =>
-      map (fun bs =>
-             let val_n := BinInt.Z.to_N (common.Memdata.decode_int bs) in
-             let val_z := sign_extend_n val_n byte_width in
-             VAL_int64 (Wasm_int.Int64.repr val_z)) v128_bytes_grouped
-  (* All the other shapes extract to Int32. *)
-  | VS_i _ =>
-      map (fun bs =>
-             let val_n := BinInt.Z.to_N (common.Memdata.decode_int bs) in
-             let val_z := sign_extend_n val_n byte_width in
-             VAL_int32 (Wasm_int.Int32.repr val_z)) v128_bytes_grouped
-  end.
-
-Definition v128_serialise (sh: vshape) (vs: list value_num) : SIMD.v128 :=
-  let value_bytes := map bits vs in
-  match sh with
-  | VS_f _
-  | VS_i VSI_64_2
-  | VS_i VSI_32_4 =>
-      List.concat (map bits vs)
-  (* i16x8 and i8x16 needs truncating bytes before merging. *)
-  | VS_i VSI_16_8 =>
-      List.concat (map (fun v => match v with
-                              | VAL_int32 vi => serialise_i16 vi
-                              | _ => nil
-                              end
-                     ) vs)
-  | VS_i VSI_8_16 =>
-      List.concat (map (fun v => match v with
-                              | VAL_int32 vi => serialise_i8 vi
-                              | _ => nil
-                              end
-                     ) vs)
-  end.
-
 End Operations.
 
 #[export]
diff --git a/theories/opsem.v b/theories/opsem.v
index dcb35f24..aee1b9ea 100644
--- a/theories/opsem.v
+++ b/theories/opsem.v
@@ -70,7 +70,7 @@ Inductive reduce_simple : seq administrative_instruction -> seq administrative_i
     reduce_simple [:: $VV v1; $VV v2; $VV v3; AI_basic (BI_vternop op)] [::$VV (app_vternop op v1 v2 v3)]
   | rs_vtestop: 
     forall v1 op,
-    reduce_simple [:: $VV v1; AI_basic (BI_vtestop op)] [::$VN (VAL_int32 (wasm_bool (app_vtestop op v1)))]
+    reduce_simple [:: $VV v1; AI_basic (BI_vtestop op)] [::$VN (app_vtestop op v1)]
   | rs_vshiftop: 
     forall v1 v2 op,
     reduce_simple [:: $VV v1; $VN (VAL_int32 v2); AI_basic (BI_vshiftop op)] [::$VV app_vshiftop op v1 v2]
diff --git a/theories/simd.v b/theories/simd.v
index 36c0039a..592fff8a 100644
--- a/theories/simd.v
+++ b/theories/simd.v
@@ -58,3 +58,16 @@ Qed.
 End SIMD.
 
 Module Export simd_export := SIMD.
+
+Program Fixpoint v128_extract_bytes (width dim: N) (v: SIMD.v128) {measure (N.to_nat dim)}: list bytes :=
+  match dim with
+  | 0%N => nil
+  | _ =>
+      let bytes := take width v in
+      let bytes_remaining := drop width v in
+      cons bytes (v128_extract_bytes width (N.pred dim) bytes_remaining)
+  end.
+Next Obligation.
+  by lias.
+Qed.
+
diff --git a/theories/simd_execute.v b/theories/simd_execute.v
index 4f9ca304..b8a8d2e2 100644
--- a/theories/simd_execute.v
+++ b/theories/simd_execute.v
@@ -12,7 +12,10 @@ Module SIMD_ops.
 Parameter app_vunop_str : vunop -> string -> string.
 Parameter app_vbinop_str : vbinop -> string -> string -> string.
 Parameter app_vternop_str : vternop -> string -> string -> string -> string.
-  
+Parameter app_vtestop_str : vtestop -> string -> string.
+Parameter app_vshiftop_str : vtestop -> string -> string -> string.
+
 End SIMD_ops.
 
 Module Export simd_ops_export := SIMD_ops.
+
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index 241d2191..7125aacb 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -75,22 +75,6 @@ Proof.
   - (* Cvtop *)
     erewrite eval_cvt_type_preserve; eauto.
     by resolve_subtyping.
-  - (* vunop *)
-    replace (typeof_vec (app_vunop op v)) with (typeof_vec v); last by destruct (app_vunop op v).
-    rewrite H0.
-    by resolve_subtyping.
-  - (* vbinop *)
-    replace (typeof_vec (app_vbinop op v1 v2)) with (typeof_vec v1); last by destruct (app_vbinop op v1 v2).
-    rewrite H0.
-    by resolve_subtyping.
-  - (* vternop *)
-    replace (typeof_vec (app_vternop op v1 v2 v3)) with (typeof_vec v1); last by destruct (app_vternop op v1 v2 v3).
-    rewrite H0.
-    by resolve_subtyping.
-  - (* vshiftop *)
-    replace (typeof_vec (app_vshiftop op v1 v2)) with (typeof_vec v1); last by destruct op, v1, v2.
-    rewrite H0.
-    by resolve_subtyping.
   - (* Extract_vec *)
     replace (typeof_num (app_extract_vec sh sx x v1)) with (typeof_shape_unpacked sh); last by destruct sh as [[] | []].
     by resolve_subtyping.

From e9b2cd8f2650879cdf5af7eb4d8e685d36a1d90d Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Fri, 15 Aug 2025 03:30:01 -0700
Subject: [PATCH 10/19] splat

---
 theories/operations.v | 48 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/theories/operations.v b/theories/operations.v
index b3a7c2dc..254a3030 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -46,7 +46,7 @@ Definition write_bytes_meminst (m: meminst) (start_idx : N) (bs: bytes) : option
   | None => None
   end.
 
-Definition bits (v : value_num) : bytes :=
+Definition serialise_num (v : value_num) : bytes :=
   match v with
   | VAL_int32 c => serialise_i32 c
   | VAL_int64 c => serialise_i64 c
@@ -54,6 +54,10 @@ Definition bits (v : value_num) : bytes :=
   | VAL_float64 c => serialise_f64 c
   end.
 
+#[deprecated(since="wasmcert-coq-v2.2", note="Use serialise_num instead.")]
+  Definition bits := serialise_num.
+
+
 Definition ml_valid (m: mem_t) : Prop :=
   N.modulo (mem_length m) page_size = 0%N.
 
@@ -596,6 +600,16 @@ Proof.
   by destruct sh as [[] | []].
 Qed.
 
+Definition serialise_num_shape (sh: vshape) (v: value_num) : bytes :=
+  match v with
+  | VAL_int64 c => serialise_i64 c
+  | VAL_float32 c => serialise_f32 c
+  | VAL_float64 c => serialise_f64 c
+  | VAL_int32 c => (* Could be i8/i16/i32 *)
+      let byte_width := N.div (shape_width sh) 8%N in
+      Memdata.encode_int byte_width (Wasm_int.Int32.unsigned c)
+  end.
+
 (* Extract values from a v128 *)
 Definition v128_extract_lanes_n (sh: vshape) (v: SIMD.v128) : list N :=
   let byte_width := N.div (shape_width sh) 8%N in
@@ -628,12 +642,12 @@ Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) : list value_num :=
   end.
 
 Definition v128_serialise (sh: vshape) (vs: list value_num) : SIMD.v128 :=
-  let value_bytes := map bits vs in
+  let value_bytes := map serialise_num vs in
   match sh with
   | VS_f _
   | VS_i VSI_64_2
   | VS_i VSI_32_4 =>
-      List.concat (map bits vs)
+      List.concat (map serialise_num vs)
   (* i16x8 and i8x16 needs truncating bytes before merging. *)
   | VS_i VSI_16_8 =>
       List.concat (map (fun v => match v with
@@ -673,8 +687,28 @@ Definition app_vshiftop (op: vshiftop) (v1: value_vec) (v2: i32) : value_vec :=
   let v2e := encode_i32 v2 in
   decode_vec (app_vshiftop_str op v1e v2e).
 
+(* Efficient splat, slightly faster than a simple loop but still slow due to string concatenation *)
+Fixpoint bytes_repeat_aux (bs: bytes) (p: positive) : bytes :=
+  match p with
+  | xH => bs
+  | xO p' =>
+      let bs' := bytes_repeat_aux bs p' in
+      bs' ++ bs'
+  | xI p' =>
+      let bs' := bytes_repeat_aux bs p' in
+      bs' ++ bs' ++ bs
+  end.
+      
+Definition bytes_repeat (bs: bytes) (n: N) : bytes :=
+  match n with
+  | N0 => nil
+  | Npos p => bytes_repeat_aux bs p
+  end.
+
 Definition app_splat_vec (sh: vshape) (v1: value_num) : value_vec :=
-  VAL_vec128 SIMD.v128_default.
+  let bs := serialise_num_shape sh v1 in
+  let dim := shape_dim sh in
+  VAL_vec128 (bytes_repeat bs dim).
 
 Definition app_extract_vec (sh: vshape) (s: option sx) (n: laneidx) (v1: value_vec) : value_num :=
   match sh with
@@ -849,7 +883,7 @@ Definition smem_store (s: store_record) (inst: moduleinst) (n: N) (off: static_o
   | Some addr =>
       match lookup_N s.(s_mems) addr with
       | Some mem =>
-        match store mem n off (bits v) (tnum_length t) with
+        match store mem n off (serialise_num v) (tnum_length t) with
         | Some mem' =>
            Some (upd_s_mem s (set_nth mem' s.(s_mems) (N.to_nat addr) mem'))
         | None => None
@@ -864,7 +898,7 @@ Definition smem_store_packed (s: store_record) (inst: moduleinst) (n: N) (off: s
   | Some addr =>
       match lookup_N s.(s_mems) addr with
       | Some mem =>
-        match store_packed mem n off (bits v) (tp_length tp) with
+        match store_packed mem n off (serialise_num v) (tp_length tp) with
         | Some mem' =>
            Some (upd_s_mem s (set_nth mem' s.(s_mems) (N.to_nat addr) mem'))
         | None => None
@@ -1427,7 +1461,7 @@ Definition eval_cvt (t : number_type) (op: cvtop) (s : option sx) (v : value_num
   | CVO_convert => cvt_convert t s v
   | CVO_demote => cvt_demote t v
   | CVO_promote => cvt_promote t v
-  | CVO_reinterpret => Some (wasm_deserialise (bits v) t)
+  | CVO_reinterpret => Some (wasm_deserialise (serialise_num v) t)
   end.
 
 (* Enumerates all the valid argument types for each convert operations *)

From 7263b82fd5914cb69264d9ac629367213e533c72 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Fri, 15 Aug 2025 05:29:16 -0700
Subject: [PATCH 11/19] extract lane

---
 theories/operations.v        | 60 +++++++++++++++++++++++++++++-------
 theories/properties.v        |  1 +
 theories/simd.v              | 12 --------
 theories/type_preservation.v | 11 +++++--
 theories/typing.v            | 16 ----------
 5 files changed, 59 insertions(+), 41 deletions(-)

diff --git a/theories/operations.v b/theories/operations.v
index 254a3030..5190c4cc 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -610,11 +610,26 @@ Definition serialise_num_shape (sh: vshape) (v: value_num) : bytes :=
       Memdata.encode_int byte_width (Wasm_int.Int32.unsigned c)
   end.
 
-(* Extract values from a v128 *)
-Definition v128_extract_lanes_n (sh: vshape) (v: SIMD.v128) : list N :=
+Program Fixpoint v128_extract_bytes_aux (width dim: N) (v: SIMD.v128) {measure (N.to_nat dim)}: list bytes :=
+  match dim with
+  | 0%N => nil
+  | _ =>
+      let bytes := take width v in
+      let bytes_remaining := drop width v in
+      cons bytes (v128_extract_bytes_aux width (N.pred dim) bytes_remaining)
+  end.
+Next Obligation.
+  by lias.
+Qed.
+
+Definition v128_extract_bytes (sh: vshape) (v: v128) : list bytes :=
   let byte_width := N.div (shape_width sh) 8%N in
   let dim := shape_dim sh in
-  let v128_bytes_grouped := v128_extract_bytes byte_width dim v in
+  v128_extract_bytes_aux byte_width dim v.
+
+(* Extract values from a v128 *)
+Definition v128_extract_lanes_n (sh: vshape) (v: SIMD.v128) : list N :=
+  let v128_bytes_grouped := v128_extract_bytes sh v in
   let vt := unpacked sh in
   map (fun bs =>
          BinInt.Z.to_N (common.Memdata.decode_int bs)) v128_bytes_grouped.
@@ -622,8 +637,7 @@ Definition v128_extract_lanes_n (sh: vshape) (v: SIMD.v128) : list N :=
 (* Extract values from a v128 *)
 Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) : list value_num :=
   let byte_width := N.div (shape_width sh) 8%N in
-  let dim := shape_dim sh in
-  let v128_bytes_grouped := v128_extract_bytes byte_width dim v in
+  let v128_bytes_grouped := v128_extract_bytes sh v in
   let vt := unpacked sh in
   match sh with
   | VS_f _ =>
@@ -710,20 +724,44 @@ Definition app_splat_vec (sh: vshape) (v1: value_num) : value_vec :=
   let dim := shape_dim sh in
   VAL_vec128 (bytes_repeat bs dim).
 
-Definition app_extract_vec (sh: vshape) (s: option sx) (n: laneidx) (v1: value_vec) : value_num :=
-  match sh with
+Definition typeof_shape_unpacked (shape: vshape) : number_type :=
+  match shape with
   | VS_i vsi =>
       match vsi with
-      | VSI_64_2 => bitzero T_i64
-      | _ => bitzero T_i32
+      | VSI_8_16 => T_i32
+      | VSI_16_8 => T_i32
+      | VSI_32_4 => T_i32
+      | VSI_64_2 => T_i64
       end
   | VS_f vsf =>
       match vsf with
-      | VSF_32_4 => bitzero T_f32
-      | VSF_64_2 => bitzero T_f64
+      | VSF_32_4 => T_f32
+      | VSF_64_2 => T_f64
       end
   end.
 
+Definition vec_get_v128 (v: value_vec) : v128 :=
+  match v with
+  | VAL_vec128 vv => vv
+  end.
+
+Definition app_extract_vec (sh: vshape) (os: option sx) (n: laneidx) (v1: value_vec) : value_num :=
+  let vv := vec_get_v128 v1 in
+  let bss := v128_extract_bytes sh vv in
+  let unpacked_shape := typeof_shape_unpacked sh in
+  let obs := lookup_N bss n in
+  match obs with
+  | Some bs => 
+      let byte_width := N.div (shape_width sh) 8%N in
+      let bs_extend :=
+        match os with
+        | Some s => sign_extend_bytes s byte_width bs
+        | None => sign_extend_bytes SX_U byte_width bs
+        end in
+      wasm_deserialise bs_extend unpacked_shape
+  | None => bitzero unpacked_shape (* Will not happen *)
+  end.
+      
 Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
   v1.
 
diff --git a/theories/properties.v b/theories/properties.v
index f9889d0f..961d67c7 100644
--- a/theories/properties.v
+++ b/theories/properties.v
@@ -48,6 +48,7 @@ Proof.
   intros; f_equal.
   by apply IHl1.
 Qed.
+
 Lemma app_eq_singleton: forall T (l1 l2 : list T) (a : T),
     l1 ++ l2 = [::a] ->
     (l1 = [::a] /\ l2 = [::]) \/ (l1 = [::] /\ l2 = [::a]).
diff --git a/theories/simd.v b/theories/simd.v
index 592fff8a..009d32cd 100644
--- a/theories/simd.v
+++ b/theories/simd.v
@@ -59,15 +59,3 @@ End SIMD.
 
 Module Export simd_export := SIMD.
 
-Program Fixpoint v128_extract_bytes (width dim: N) (v: SIMD.v128) {measure (N.to_nat dim)}: list bytes :=
-  match dim with
-  | 0%N => nil
-  | _ =>
-      let bytes := take width v in
-      let bytes_remaining := drop width v in
-      cons bytes (v128_extract_bytes width (N.pred dim) bytes_remaining)
-  end.
-Next Obligation.
-  by lias.
-Qed.
-
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index 7125aacb..70c15fab 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -56,6 +56,13 @@ Proof.
   done.
 Qed.
 
+Lemma app_extract_vec_typing: forall sh os n v1,
+    typeof_num (app_extract_vec sh os n v1) = typeof_shape_unpacked sh.
+Proof.
+  intros; unfold app_extract_vec => /=.
+  destruct (lookup_N _ _) => //=; by destruct sh as [[] | []] => //=.
+Qed.
+
 (* It's better to just set `instr_subtyping` opaque and unset it when necessary, since most of the times we do not want to unfold this definition by simpl. But the simpl nomatch method doesn't prevent it from being unfolded for some reason. *)
 Opaque instr_subtyping.
 
@@ -76,8 +83,8 @@ Proof.
     erewrite eval_cvt_type_preserve; eauto.
     by resolve_subtyping.
   - (* Extract_vec *)
-    replace (typeof_num (app_extract_vec sh sx x v1)) with (typeof_shape_unpacked sh); last by destruct sh as [[] | []].
-    by resolve_subtyping.
+    replace (typeof_num (app_extract_vec sh sx x v1)) with (typeof_shape_unpacked sh); first by resolve_subtyping.
+    by rewrite app_extract_vec_typing.
   - (* Replace_vec *)
     replace (typeof_vec (app_replace_vec sh x v1 v2)) with (typeof_vec v1); last by destruct sh as [[] | []].
     rewrite H1.
diff --git a/theories/typing.v b/theories/typing.v
index c17e95f8..9a4fc11c 100644
--- a/theories/typing.v
+++ b/theories/typing.v
@@ -79,22 +79,6 @@ Definition value_typing (s: store_record) (v: value) (t: value_type) : bool :=
 Definition values_typing (s: store_record) (vs: list value) (tf: list value_type) : bool :=
   all2 (value_typing s) vs tf.
 
-Definition typeof_shape_unpacked (shape: vshape) : number_type :=
-  match shape with
-  | VS_i vsi =>
-      match vsi with
-      | VSI_8_16 => T_i32
-      | VSI_16_8 => T_i32
-      | VSI_32_4 => T_i32
-      | VSI_64_2 => T_i64
-      end
-  | VS_f vsf =>
-      match vsf with
-      | VSF_32_4 => T_f32
-      | VSF_64_2 => T_f64
-      end
-  end.
-  
 Definition result_types_agree (s: store_record) (ts : result_type) r : bool :=
   match r with
   | result_values vs => values_typing s vs ts

From a428681163a23f57a6b45efc8aad9c6aec141e3e Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Fri, 15 Aug 2025 06:48:29 -0700
Subject: [PATCH 12/19] add replace lane

---
 theories/operations.v        | 8 ++++++--
 theories/type_preservation.v | 8 ++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/theories/operations.v b/theories/operations.v
index 5190c4cc..19a0daf5 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -761,9 +761,13 @@ Definition app_extract_vec (sh: vshape) (os: option sx) (n: laneidx) (v1: value_
       wasm_deserialise bs_extend unpacked_shape
   | None => bitzero unpacked_shape (* Will not happen *)
   end.
-      
+
 Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
-  v1.
+  let vv := vec_get_v128 v1 in
+  let bss := v128_extract_bytes sh vv in
+  let bs_num := serialise_num_shape sh v2 in
+  let bss' := set_nth nil bss n bs_num in
+  VAL_vec128 (List.concat bss').
 
 
 Definition rglob_is_mut (g : module_global) : bool :=
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index 70c15fab..0c2edc46 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -85,10 +85,6 @@ Proof.
   - (* Extract_vec *)
     replace (typeof_num (app_extract_vec sh sx x v1)) with (typeof_shape_unpacked sh); first by resolve_subtyping.
     by rewrite app_extract_vec_typing.
-  - (* Replace_vec *)
-    replace (typeof_vec (app_replace_vec sh x v1 v2)) with (typeof_vec v1); last by destruct sh as [[] | []].
-    rewrite H1.
-    by resolve_subtyping.
   - (* Br *)
     eapply et_composition'; eauto; resolve_e_typing.
     by eapply Lfilled_break_typing with (tss := nil) in Hconjl1; eauto => //=; try (by apply v_to_e_const); last by lias.
@@ -1105,7 +1101,7 @@ Proof.
 Qed.
 
 Lemma mem_extension_store: forall m k off v tlen mem,
-    store m k off (bits v) tlen = Some mem ->
+    store m k off (serialise_num v) tlen = Some mem ->
     mem_extension m mem.
 Proof.
   move => m k off v tlen mem HStore.
@@ -1457,7 +1453,7 @@ Proof.
     by apply List.nth_error_In in Hnthsdata; eauto.
 Qed.
     
-(* Note that although config_typing gives a much 1stronger constraint on C', we allow much more flexibility here due to the need in inductive cases. *)
+(* Note that although config_typing gives a much stronger constraint on C', we allow much more flexibility here due to the need in inductive cases. *)
 Lemma store_extension_reduce: forall s f es s' f' es' C C' tf hs hs',
     reduce hs s f es hs' s' f' es' ->
     inst_typing s f.(f_inst) = Some C ->

From 146d73592124fb1b86e19b66eec86fa3ac794372 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Sat, 16 Aug 2025 04:48:33 -0700
Subject: [PATCH 13/19] add vec memory operations, including the laned and
 other variants

---
 theories/binary_format_parser.v |  12 +--
 theories/datatypes.v            |   6 +-
 theories/extraction_instance.v  |   2 +-
 theories/operations.v           | 156 ++++++++++++++++++++++----------
 4 files changed, 117 insertions(+), 59 deletions(-)

diff --git a/theories/binary_format_parser.v b/theories/binary_format_parser.v
index bf5eb55c..9173ed5f 100644
--- a/theories/binary_format_parser.v
+++ b/theories/binary_format_parser.v
@@ -415,12 +415,12 @@ Definition parse_v128_load {n} : byte_parser basic_instruction n :=
   assert_u32 0%N &> (BI_load_vec (LVA_none) <$> parse_memarg).
 
 Definition parse_v128_load_packed {n} : byte_parser basic_instruction n :=
-  assert_u32 1%N &> (BI_load_vec (LVA_packed (8%N, 8%N) SX_S) <$> parse_memarg) <|>
-  assert_u32 2%N &> (BI_load_vec (LVA_packed (8%N, 8%N) SX_U) <$> parse_memarg) <|>
-  assert_u32 3%N &> (BI_load_vec (LVA_packed (16%N, 4%N) SX_S) <$> parse_memarg) <|>
-  assert_u32 4%N &> (BI_load_vec (LVA_packed (16%N, 4%N) SX_U) <$> parse_memarg) <|>
-  assert_u32 5%N &> (BI_load_vec (LVA_packed (32%N, 2%N) SX_S) <$> parse_memarg) <|>
-  assert_u32 6%N &> (BI_load_vec (LVA_packed (32%N, 2%N) SX_U) <$> parse_memarg).
+  assert_u32 1%N &> (BI_load_vec (LVA_extend (8%N, 8%N) SX_S) <$> parse_memarg) <|>
+  assert_u32 2%N &> (BI_load_vec (LVA_extend (8%N, 8%N) SX_U) <$> parse_memarg) <|>
+  assert_u32 3%N &> (BI_load_vec (LVA_extend (16%N, 4%N) SX_S) <$> parse_memarg) <|>
+  assert_u32 4%N &> (BI_load_vec (LVA_extend (16%N, 4%N) SX_U) <$> parse_memarg) <|>
+  assert_u32 5%N &> (BI_load_vec (LVA_extend (32%N, 2%N) SX_S) <$> parse_memarg) <|>
+  assert_u32 6%N &> (BI_load_vec (LVA_extend (32%N, 2%N) SX_U) <$> parse_memarg).
 
 Definition parse_v128_load_splat {n} : byte_parser basic_instruction n :=
   assert_u32 7%N &> (BI_load_vec (LVA_splat 8%N) <$> parse_memarg) <|>
diff --git a/theories/datatypes.v b/theories/datatypes.v
index 8b101e42..ce8022db 100644
--- a/theories/datatypes.v
+++ b/theories/datatypes.v
@@ -617,14 +617,14 @@ Definition vshiftop := N.
 Definition vpacked_type : Type := N * N.
 
 (* Available widths: 32, 64 *)
-Definition vzero_type := N.
+Definition vzero_width := N.
 
 (* Available widths: 8, 16, 32, 64 *)
 Definition vwidth := N.
 
 Inductive load_vec_arg :=
-  | LVA_packed: vpacked_type -> sx -> load_vec_arg (* v128.loadnxm_sx *)
-  | LVA_zero: vzero_type -> load_vec_arg (* v128.loadnn_zero *)
+  | LVA_extend: vpacked_type -> sx -> load_vec_arg (* v128.loadnxm_sx *)
+  | LVA_zero: vzero_width -> load_vec_arg (* v128.loadnn_zero *)
   | LVA_splat: vwidth -> load_vec_arg (* v128.loadww_spat *)
   | LVA_none (* v128.load *)
   .
diff --git a/theories/extraction_instance.v b/theories/extraction_instance.v
index 8170b15d..e5ff780b 100644
--- a/theories/extraction_instance.v
+++ b/theories/extraction_instance.v
@@ -179,7 +179,7 @@ Section Wast.
     | _ => false
     end.
 
-  Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) :=
+  Definition v128_extract_lanes (sh: vshape) (v: v128) :=
     v128_extract_lanes sh v.
   
 End Wast.
diff --git a/theories/operations.v b/theories/operations.v
index 19a0daf5..2588451f 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -10,6 +10,8 @@ Set Implicit Arguments.
 Unset Strict Implicit.
 Unset Printing Implicit Defensive.
 
+Definition v128_size: N := 16%N.
+
 Section Operations.
   Context `{memory: Memory} `{hfc: host_function_class}.
 
@@ -97,7 +99,7 @@ Definition mem_grow (m : meminst) (len_delta : N) : option meminst :=
 
 Definition load_vec_bounds (lv_arg: load_vec_arg) (m_arg: memarg) : bool :=
   match lv_arg with
-  | LVA_packed ptv sx =>
+  | LVA_extend ptv sx =>
       let (n, m) := ptv in
       N.leb (N.pow 2 m_arg.(memarg_align)) (N.mul (N.div n 8) m)
   | LVA_zero ztv =>
@@ -105,15 +107,15 @@ Definition load_vec_bounds (lv_arg: load_vec_arg) (m_arg: memarg) : bool :=
   | LVA_splat width =>
       N.leb (N.pow 2 m_arg.(memarg_align)) (N.div (width) 8)
   | LVA_none =>
-      N.leb (N.pow 2 m_arg.(memarg_align)) 16%N
+      N.leb (N.pow 2 m_arg.(memarg_align)) v128_size
   end.
 
 Definition load_store_vec_lane_bounds (width: vwidth) (m_arg: memarg) (x: laneidx) : bool :=
-   (N.leb (N.pow 2 (memarg_align m_arg)) (width / 8))%N &&
-     (N.ltb x (128 / width)%N).
+   (N.leb (N.pow 2 (memarg_align m_arg)) (width / 8%N))%N &&
+     (N.ltb x (128%N / width)%N).
 
 Definition store_vec_bounds (m_arg: memarg) : bool :=
-      N.leb (N.pow 2 m_arg.(memarg_align)) 16%N.
+      N.leb (N.pow 2 m_arg.(memarg_align)) v128_size.
 
 (* TODO: We crucially need documentation here. *)
 
@@ -128,26 +130,11 @@ Definition load (m : meminst) (n : N) (off : static_offset) (l : N) : option byt
 Definition int_of_bytes (bs: list byte) (m: N) : value_num :=
   VAL_int32 int32_minus_one.
 
-(* TODO: placeholder for vector load -- currently unimplemented. *)
-Definition load_vec (m: meminst) (i: N) (lvarg: load_vec_arg) (marg: memarg) : option value_vec :=
-  (* Placeholder just so that this operation can return both successful and error result *)
-(*  let ea := i + marg.(memarg_offset) in
-  match load_vec_arg with
-  | LVA_packed ptv sx =>
-      let (m, n) := ptv_to_nm in
-      if N.leb (ea + (N.div (N.mul m n) 8%N)) (m.(mem_length)) then
-        let bs = int_of_bytes (take (drop m
-  if ea + *)
-  match (i == marg.(memarg_offset)) with
-  | true => Some (VAL_vec128 SIMD.v128_default)
-  | _ => None
+Definition vec_get_v128 (v: value_vec) : v128 :=
+  match v with
+  | VAL_vec128 vv => vv
   end.
 
-Definition load_vec_lane (m: meminst) (i: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option value_vec :=
-  match (i == marg.(memarg_offset)) with
-  | true => Some v
-  | _ => None
-  end.
 
 Definition store (m : meminst) (n : N) (off : static_offset) (bs : bytes) (l : nat) : option meminst :=
   if N.leb (n + off + N.of_nat l) (mem_length m)
@@ -158,18 +145,6 @@ Definition store (m : meminst) (n : N) (off : static_offset) (bs : bytes) (l : n
 Might need a change in the future if this behaviour changes. *)
 Definition store_packed := store.
 
-Definition store_vec (m: meminst) (n: N) (v: value_vec) (marg: memarg) : option meminst :=
-  match (n == marg.(memarg_offset)) with
-  | true => Some m
-  | _ => None
-  end.
-
-Definition store_vec_lane (m: meminst) (n: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option meminst :=
-  match (n == marg.(memarg_offset)) with
-  | true => Some m
-  | _ => None
-  end.
-
 Definition wasm_deserialise (bs : bytes) (vt : number_type) : value_num :=
   match vt with
   | T_i32 => VAL_int32 (Wasm_int.Int32.repr (common.Memdata.decode_int bs))
@@ -189,7 +164,7 @@ Definition sign_extend_n (n: N) (bytelen: N) : Z :=
   val_z.
 
 (* l is the byte length of the target type, therefore can only be 4/8 *)
-(* v128 also uses this function for i8/i16 decoding *)
+(* v128 also uses this function for i8/i16 decoding, so l can be 1/2 *)
 Definition sign_extend_bytes (s : sx) (l : N) (bs : bytes) : bytes :=
   match s with
   | SX_U => bytes_takefill #00 (N.to_nat l) bs
@@ -610,13 +585,13 @@ Definition serialise_num_shape (sh: vshape) (v: value_num) : bytes :=
       Memdata.encode_int byte_width (Wasm_int.Int32.unsigned c)
   end.
 
-Program Fixpoint v128_extract_bytes_aux (width dim: N) (v: SIMD.v128) {measure (N.to_nat dim)}: list bytes :=
+Program Fixpoint bs_extract_bytes (byte_width dim: N) (bs: bytes) {measure (N.to_nat dim)}: list bytes :=
   match dim with
   | 0%N => nil
   | _ =>
-      let bytes := take width v in
-      let bytes_remaining := drop width v in
-      cons bytes (v128_extract_bytes_aux width (N.pred dim) bytes_remaining)
+      let bs_prefix := take byte_width bs in
+      let bs_remaining := drop byte_width bs in
+      cons bs_prefix (bs_extract_bytes byte_width (N.pred dim) bs_remaining)
   end.
 Next Obligation.
   by lias.
@@ -625,17 +600,21 @@ Qed.
 Definition v128_extract_bytes (sh: vshape) (v: v128) : list bytes :=
   let byte_width := N.div (shape_width sh) 8%N in
   let dim := shape_dim sh in
-  v128_extract_bytes_aux byte_width dim v.
+  bs_extract_bytes byte_width dim v.
+
+Definition v128_extract_bytes_width (byte_width: N) (v: v128) : list bytes :=
+  let dim := N.div 16%N byte_width in
+  bs_extract_bytes byte_width dim v.
 
 (* Extract values from a v128 *)
-Definition v128_extract_lanes_n (sh: vshape) (v: SIMD.v128) : list N :=
+Definition v128_extract_lanes_n (sh: vshape) (v: v128) : list N :=
   let v128_bytes_grouped := v128_extract_bytes sh v in
   let vt := unpacked sh in
   map (fun bs =>
          BinInt.Z.to_N (common.Memdata.decode_int bs)) v128_bytes_grouped.
 
 (* Extract values from a v128 *)
-Definition v128_extract_lanes (sh: vshape) (v: SIMD.v128) : list value_num :=
+Definition v128_extract_lanes (sh: vshape) (v: v128) : list value_num :=
   let byte_width := N.div (shape_width sh) 8%N in
   let v128_bytes_grouped := v128_extract_bytes sh v in
   let vt := unpacked sh in
@@ -740,11 +719,6 @@ Definition typeof_shape_unpacked (shape: vshape) : number_type :=
       end
   end.
 
-Definition vec_get_v128 (v: value_vec) : v128 :=
-  match v with
-  | VAL_vec128 vv => vv
-  end.
-
 Definition app_extract_vec (sh: vshape) (os: option sx) (n: laneidx) (v1: value_vec) : value_num :=
   let vv := vec_get_v128 v1 in
   let bss := v128_extract_bytes sh vv in
@@ -769,7 +743,6 @@ Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_
   let bss' := set_nth nil bss n bs_num in
   VAL_vec128 (List.concat bss').
 
-
 Definition rglob_is_mut (g : module_global) : bool :=
   g.(modglob_type).(tg_mut) == MUT_var.
 
@@ -920,6 +893,72 @@ Definition stab_update (s: store_record) (inst: moduleinst) (x: tableidx) (i: el
   | None => None
   end.
 
+Definition load_vec (mem: meminst) (n: N) (lvarg: load_vec_arg) (marg: memarg) : option value_vec :=
+  let ea := N.add n marg.(memarg_offset) in
+  match lvarg with
+  | LVA_none =>
+    if N.leb (ea + v128_size)%N (mem_length mem)
+    then
+      match read_bytes mem ea v128_size with
+      | Some bs => Some (VAL_vec128 bs)
+      | None => None
+      end
+    else None
+  | LVA_extend (m, n) sx =>
+      let byte_width := N.div m 8%N in
+      let read_size := N.mul byte_width n in (* Should always be 8 *)
+      if N.leb (ea + read_size)%N (mem_length mem)
+      then
+        match read_bytes mem ea read_size with
+        | Some bs =>
+            (* Extract individual lanes, and sign extend to double the width *)
+            let bss := bs_extract_bytes byte_width n bs in
+            (* new width is 2 * original *)
+            let bss_new := List.map (sign_extend_bytes sx (2 * byte_width)%N) bss in
+            Some (VAL_vec128 (List.concat bss_new))
+        | None => None
+        end
+      else None
+  | LVA_splat n =>
+    let byte_width := N.div n 8%N in
+    if N.leb (ea + byte_width)%N (mem_length mem)
+    then
+      let dim := N.div 16%N byte_width in
+      match read_bytes mem ea byte_width with
+      | Some bs =>
+          let bs_repeat := bytes_repeat bs dim in
+          Some (VAL_vec128 bs_repeat)
+      | None => None
+      end
+    else None
+  | LVA_zero n =>
+    let byte_width := N.div n 8%N in
+    if N.leb (ea + byte_width)%N (mem_length mem)
+    then
+      match read_bytes mem ea byte_width with
+      | Some bs =>
+          let bs' := sign_extend_bytes SX_U 16%N bs in
+          Some (VAL_vec128 bs')
+      | None => None
+      end
+    else None
+  end.
+
+Definition load_vec_lane (m: meminst) (n: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option value_vec :=
+  let ea := N.add n marg.(memarg_offset) in
+  let byte_width := N.div width 8%N in
+  if N.leb (ea + byte_width)%N (mem_length m)
+  then
+    match read_bytes m ea byte_width with
+    | Some bs =>
+        let vv := vec_get_v128 v in
+        let bss := v128_extract_bytes_width byte_width vv in
+        let bss' := set_nth nil bss x bs in
+        Some (VAL_vec128 (List.concat bss'))
+    | None => None
+    end
+  else None.
+
 Definition smem_store (s: store_record) (inst: moduleinst) (n: N) (off: static_offset) (v: value_num) (t: number_type) : option store_record :=
   match lookup_N inst.(inst_mems) 0%N with
   | Some addr =>
@@ -950,6 +989,25 @@ Definition smem_store_packed (s: store_record) (inst: moduleinst) (n: N) (off: s
   | None => None
   end.
 
+Definition store_vec (m: meminst) (n: N) (v: value_vec) (marg: memarg) : option meminst :=
+  if N.leb (n + marg.(memarg_offset) + v128_size) (mem_length m)
+  then write_bytes_meminst m (n + marg.(memarg_offset)) (vec_get_v128 v)
+  else None.
+
+Definition store_vec_lane (m: meminst) (n: N) (v: value_vec) (width: vwidth) (marg: memarg) (x: laneidx) : option meminst :=
+  let byte_width := N.div width 8%N in
+  if N.leb (n + marg.(memarg_offset) + byte_width) (mem_length m)
+  then
+    let vv := vec_get_v128 v in
+    let bss := v128_extract_bytes_width byte_width vv in
+    let obs := List.nth_error bss x in
+    match obs with
+    | Some bs =>
+        write_bytes_meminst m (n + marg.(memarg_offset)) bs
+    | None => None
+    end
+  else None.
+
 Definition smem_store_vec (s: store_record) (inst: moduleinst) (n: N) (v: value_vec) (marg: memarg) : option store_record :=
   match lookup_N inst.(inst_mems) 0%N with
   | Some addr =>

From 087df6455a5c59a6554383d71785d39743313c97 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Sun, 17 Aug 2025 09:18:18 -0700
Subject: [PATCH 14/19] extract lane sx bugfix and proof fixes

---
 theories/extraction_instance.v |  2 +-
 theories/operations.v          | 40 ++++++++++--------
 theories/pp.v                  | 16 ++++++--
 theories/type_preservation.v   | 74 ++++++++++++++++++++++++++--------
 4 files changed, 96 insertions(+), 36 deletions(-)

diff --git a/theories/extraction_instance.v b/theories/extraction_instance.v
index e5ff780b..fa7630d1 100644
--- a/theories/extraction_instance.v
+++ b/theories/extraction_instance.v
@@ -180,7 +180,7 @@ Section Wast.
     end.
 
   Definition v128_extract_lanes (sh: vshape) (v: v128) :=
-    v128_extract_lanes sh v.
+    v128_extract_lanes sh SX_S v.
   
 End Wast.
 
diff --git a/theories/operations.v b/theories/operations.v
index 2588451f..f65440bf 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -582,7 +582,10 @@ Definition serialise_num_shape (sh: vshape) (v: value_num) : bytes :=
   | VAL_float64 c => serialise_f64 c
   | VAL_int32 c => (* Could be i8/i16/i32 *)
       let byte_width := N.div (shape_width sh) 8%N in
-      Memdata.encode_int byte_width (Wasm_int.Int32.unsigned c)
+      (* Need to normalise first *)
+      let val_n := BinInt.Z.to_N (Wasm_int.Int32.unsigned c) in
+      let val_z := sign_extend_n val_n byte_width in
+      Memdata.encode_int byte_width val_z
   end.
 
 Program Fixpoint bs_extract_bytes (byte_width dim: N) (bs: bytes) {measure (N.to_nat dim)}: list bytes :=
@@ -614,7 +617,7 @@ Definition v128_extract_lanes_n (sh: vshape) (v: v128) : list N :=
          BinInt.Z.to_N (common.Memdata.decode_int bs)) v128_bytes_grouped.
 
 (* Extract values from a v128 *)
-Definition v128_extract_lanes (sh: vshape) (v: v128) : list value_num :=
+Definition v128_extract_lanes (sh: vshape) (s: sx) (v: v128) : list value_num :=
   let byte_width := N.div (shape_width sh) 8%N in
   let v128_bytes_grouped := v128_extract_bytes sh v in
   let vt := unpacked sh in
@@ -628,10 +631,17 @@ Definition v128_extract_lanes (sh: vshape) (v: v128) : list value_num :=
              VAL_int64 (Wasm_int.Int64.repr val_z)) v128_bytes_grouped
   (* All the other shapes extract to Int32. *)
   | VS_i _ =>
-      map (fun bs =>
+      match s with
+      | SX_S => 
+        map (fun bs =>
              let val_n := BinInt.Z.to_N (common.Memdata.decode_int bs) in
              let val_z := sign_extend_n val_n byte_width in
              VAL_int32 (Wasm_int.Int32.repr val_z)) v128_bytes_grouped
+      | SX_U =>
+          map (fun bs =>
+                 let val_z := common.Memdata.decode_int bs in
+                 VAL_int32 (Wasm_int.Int32.repr val_z)) v128_bytes_grouped
+      end
   end.
 
 Definition v128_serialise (sh: vshape) (vs: list value_num) : SIMD.v128 :=
@@ -720,20 +730,18 @@ Definition typeof_shape_unpacked (shape: vshape) : number_type :=
   end.
 
 Definition app_extract_vec (sh: vshape) (os: option sx) (n: laneidx) (v1: value_vec) : value_num :=
+  let s :=
+    match os with
+    | Some SX_S => SX_S
+    | _ => SX_U
+    end in
   let vv := vec_get_v128 v1 in
-  let bss := v128_extract_bytes sh vv in
-  let unpacked_shape := typeof_shape_unpacked sh in
-  let obs := lookup_N bss n in
-  match obs with
-  | Some bs => 
-      let byte_width := N.div (shape_width sh) 8%N in
-      let bs_extend :=
-        match os with
-        | Some s => sign_extend_bytes s byte_width bs
-        | None => sign_extend_bytes SX_U byte_width bs
-        end in
-      wasm_deserialise bs_extend unpacked_shape
-  | None => bitzero unpacked_shape (* Will not happen *)
+  let vs := v128_extract_lanes sh s vv in
+  match lookup_N vs n with
+  | Some v => v
+  | None =>
+      let unpacked_shape := typeof_shape_unpacked sh in
+      bitzero unpacked_shape (* Will not happen *)
   end.
 
 Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_num) : value_vec :=
diff --git a/theories/pp.v b/theories/pp.v
index a7a5c655..70ee7918 100644
--- a/theories/pp.v
+++ b/theories/pp.v
@@ -539,14 +539,24 @@ Definition pp_vtestop (op: vtestop) :=
 Definition pp_vshiftop (op: vshiftop) :=
   "vshiftop: " ++ pp_N op.
 
+Definition pp_vshape (sh: vshape) :=
+  match sh with
+  | VS_i VSI_8_16 => "I8x16"
+  | VS_i VSI_16_8 => "I16x8"
+  | VS_i VSI_32_4 => "I32x4"
+  | VS_i VSI_64_2 => "I64x2"
+  | VS_f VSF_32_4 => "F32x4"
+  | VS_f VSF_64_2 => "F64x2"
+  end.
+    
 Definition pp_splat_vec (sh: vshape) :=
-  "(not implemented: splat)".
+  "splat " ++ pp_vshape sh.
 
 Definition pp_extract_vec (sh: vshape) (s: option sx) (x: laneidx) :=
-  "(not implemented: extract_lane)".
+  "extract_lane " ++ pp_vshape sh ++ " " ++ pp_sx_o s ++ " lane:" ++ pp_N x.
 
 Definition pp_replace_vec (sh: vshape) (x: laneidx) :=
-  "(not implemented: replace_lane)".
+  "replace_lane " ++ pp_vshape sh ++ " lane:" ++ pp_N x.
   
 Definition pp_load_vec (lvarg: load_vec_arg) (marg: memarg) :=
   "(not implemented: load_vec)".
diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index 0c2edc46..bb1f2933 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -1115,6 +1115,38 @@ Proof.
   simpl in *; subst.
   by lias.
 Qed.
+
+Lemma mem_extension_store_vec: forall m n v marg mem,
+    store_vec m n v marg = Some mem ->
+    mem_extension m mem.
+Proof.
+  move => m n v marg mem HStore.
+  unfold mem_extension.
+  unfold store_vec in HStore.
+  remove_bools_options.
+  apply write_bytes_meminst_preserve_type in HStore as [Htype Hlen].
+  apply/andP; split; last by lias.
+  destruct m, mem.
+  unfold limits_extension.
+  simpl in *; subst.
+  by lias.
+Qed.
+
+Lemma mem_extension_store_vec_lane: forall m n v width marg x mem,
+    store_vec_lane m n v width marg x = Some mem ->
+    mem_extension m mem.
+Proof.
+  move => m n v width marg x mem HStore.
+  unfold mem_extension.
+  unfold store_vec_lane in HStore.
+  remove_bools_options.
+  apply write_bytes_meminst_preserve_type in HStore as [Htype Hlen].
+  apply/andP; split; last by lias.
+  destruct m, mem.
+  unfold limits_extension.
+  simpl in *; subst.
+  by lias.
+Qed.
   
 Lemma mem_extension_grow: forall s m c mem t,
     meminst_typing s m = Some t ->
@@ -1270,8 +1302,8 @@ Proof.
     resolve_if_true_eq.
     by lias.
   - eapply List.Forall_forall in Hmt; eauto.
-    + by apply List.nth_error_In in Hnth'.
-    + by apply nth_error_Some_length in Hoption2; lias.
+    by apply List.nth_error_In in Hnth'.
+  - by apply nth_error_Some_length in Hoption2; lias.
 Qed.
 
 Lemma smem_store_vec_extension: forall s f n v marg s' C mt,
@@ -1284,10 +1316,9 @@ Proof.
   move => s f n v marg s' C a Hst Hupd Hit Hnth.
   unfold store_extension; unfold_store_operations => /=; resolve_store_extension; resolve_store_inst_lookup; remove_bools_options => /=.
   rewrite Hnth in Hnthmt; injection Hnthmt as ->.
-  unfold store_vec in Hoption1.
   destruct m0; simpl in *; remove_bools_options; simpl in *.
   eapply component_extension_update; eauto; first by apply mem_extension_refl.
-  by apply mem_extension_refl.
+  by eapply mem_extension_store_vec in Hoption1.
 Qed.
 
 Lemma smem_store_vec_typing: forall s f n v marg s' C mt,
@@ -1298,9 +1329,9 @@ Lemma smem_store_vec_typing: forall s f n v marg s' C mt,
   store_typing s'.
 Proof.
   move => s f n v marg s' C a Hst Hupd Hit Hnth.
+
   assert (store_extension s s') as Hstext; first by eapply smem_store_vec_extension; eauto.
   unfold_store_operations; remove_bools_options.
-  unfold store_vec in Hoption1.
   resolve_store_inst_lookup; destruct m0; simpl in *; remove_bools_options; simpl in *.
   rewrite Hnthmt in Hnth; injection Hnth as <-.
   unfold store_typing in *; destruct s; simpl in *.
@@ -1310,13 +1341,19 @@ Proof.
   move => x0 Hin.
   apply set_nth_In in Hin.
   destruct Hin as [-> | [m0 [Hneq Hnth']]] => //=.
-  - exists mt.
+  - exists m1.(meminst_type).
+    unfold meminst_typing => /=.
+    destruct m1 => //=.
+    unfold store_vec, store in Hoption1; remove_bools_options.
+    apply write_bytes_meminst_preserve_type in Hoption1; simpl in *.
+    destruct Hoption1 as [-> Hmemlen].
+    unfold mem_length in *; simpl in *.
     rewrite Hif.
     resolve_if_true_eq.
-    by rewrite Hif0.
+    by lias.
   - eapply List.Forall_forall in Hmt; eauto.
-    + by apply List.nth_error_In in Hnth'.
-    + by apply nth_error_Some_length in Hoption2; lias.
+    by apply List.nth_error_In in Hnth'.
+  - by apply nth_error_Some_length in Hoption2; lias.
 Qed.
 
 Lemma smem_store_vec_lane_extension: forall s f n v width marg x s' C mt,
@@ -1329,10 +1366,9 @@ Proof.
   move => s f n v width marg x s' C a Hst Hupd Hit Hnth.
   unfold store_extension; unfold_store_operations => /=; resolve_store_extension; resolve_store_inst_lookup; remove_bools_options => /=.
   rewrite Hnth in Hnthmt; injection Hnthmt as ->.
-  unfold store_vec_lane in Hoption1.
   destruct m0; simpl in *; remove_bools_options; simpl in *.
   eapply component_extension_update; eauto; first by apply mem_extension_refl.
-  by apply mem_extension_refl.
+  by eapply mem_extension_store_vec_lane in Hoption1.
 Qed.
 
 Lemma smem_store_vec_lane_typing: forall s f n v width marg x s' C mt,
@@ -1343,9 +1379,9 @@ Lemma smem_store_vec_lane_typing: forall s f n v width marg x s' C mt,
   store_typing s'.
 Proof.
   move => s f n v width marg x s' C a Hst Hupd Hit Hnth.
+
   assert (store_extension s s') as Hstext; first by eapply smem_store_vec_lane_extension; eauto.
   unfold_store_operations; remove_bools_options.
-  unfold store_vec_lane in Hoption1.
   resolve_store_inst_lookup; destruct m0; simpl in *; remove_bools_options; simpl in *.
   rewrite Hnthmt in Hnth; injection Hnth as <-.
   unfold store_typing in *; destruct s; simpl in *.
@@ -1355,13 +1391,19 @@ Proof.
   move => x0 Hin.
   apply set_nth_In in Hin.
   destruct Hin as [-> | [m0 [Hneq Hnth']]] => //=.
-  - exists mt.
+  - exists m1.(meminst_type).
+    unfold meminst_typing => /=.
+    destruct m1 => //=.
+    unfold store_vec_lane, store in Hoption1; remove_bools_options.
+    apply write_bytes_meminst_preserve_type in Hoption1; simpl in *.
+    destruct Hoption1 as [-> Hmemlen].
+    unfold mem_length in *; simpl in *.
     rewrite Hif.
     resolve_if_true_eq.
-    by rewrite Hif0.
+    by lias.
   - eapply List.Forall_forall in Hmt; eauto.
-    + by apply List.nth_error_In in Hnth'.
-    + by apply nth_error_Some_length in Hoption2; lias.
+    by apply List.nth_error_In in Hnth'.
+  - by apply nth_error_Some_length in Hoption2; lias.
 Qed.
 
 Lemma smem_grow_extension: forall s f n s' sz C mt,

From 19bba8520f6a3af9a767fefc6cc947ae2871023d Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Sun, 17 Aug 2025 10:00:37 -0700
Subject: [PATCH 15/19] bugfixes

---
 src/SIMD_ops.ml         | 17 +++++++++++++----
 theories/operations.v   |  7 +++++++
 theories/type_checker.v |  4 +++-
 theories/typing.v       |  1 +
 4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/src/SIMD_ops.ml b/src/SIMD_ops.ml
index c9405ceb..c65ba471 100644
--- a/src/SIMD_ops.ml
+++ b/src/SIMD_ops.ml
@@ -52,6 +52,10 @@ let app_vunop_str op v =
     | 225 -> F32x4.neg
     | 227 -> F32x4.sqrt
 
+    | 236 -> F64x2.abs
+    | 237 -> F64x2.neg
+    | 239 -> F64x2.sqrt
+
     | 248 -> I32x4_convert.trunc_sat_f32x4_s
     | 249 -> I32x4_convert.trunc_sat_f32x4_u
     | 250 -> F32x4_convert.convert_i32x4_s
@@ -67,11 +71,16 @@ let app_vunop_str op v =
   to_bits (wasm_f vw)
 
 let app_vbinop_str op_args v1 v2 = 
-  let (op, _args) = op_args in
+  let (op, args) = op_args in
   let v1w = of_bits v1 in
   let v2w = of_bits v2 in
+  let iop = Utils.int_of_z op in
+  let iargs = List.map Utils.int_of_z args in
+  if iop = 13 then (* shuffle *)
+    to_bits (V8x16.shuffle v1w v2w iargs)
+  else
   let wasm_f = 
-    match Utils.int_of_z op with
+    match iop with
     | 14 -> V8x16.swizzle
     | 35 -> I8x16.eq
     | 36 -> I8x16.ne
@@ -125,7 +134,7 @@ let app_vbinop_str op_args v1 v2 =
     | 111 -> I8x16.add_sat_s
     | 112 -> I8x16.add_sat_u
     | 113 -> I8x16.sub
-    | 114 -> I8x16.sub_sat_u
+    | 114 -> I8x16.sub_sat_s
     | 115 -> I8x16.sub_sat_u
     | 118 -> I8x16.min_s
     | 119 -> I8x16.min_u
@@ -139,7 +148,7 @@ let app_vbinop_str op_args v1 v2 =
     | 143 -> I16x8.add_sat_s
     | 144 -> I16x8.add_sat_u
     | 145 -> I16x8.sub
-    | 146 -> I16x8.sub_sat_u
+    | 146 -> I16x8.sub_sat_s
     | 147 -> I16x8.sub_sat_u
     | 149 -> I16x8.mul
     | 150 -> I16x8.min_s
diff --git a/theories/operations.v b/theories/operations.v
index f65440bf..78a4352a 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -751,6 +751,13 @@ Definition app_replace_vec (sh: vshape) (n: laneidx) (v1: value_vec) (v2: value_
   let bss' := set_nth nil bss n bs_num in
   VAL_vec128 (List.concat bss').
 
+(* Check the laneidx of shuffle *)
+Definition vbinop_valid (op: vbinop) : bool :=
+  let '(opcode, args) := op in
+  if (opcode == 13%N) then (* shuffle *)
+    if (List.forallb (fun n => N.ltb n 32%N) args) then true else false
+  else true.
+             
 Definition rglob_is_mut (g : module_global) : bool :=
   g.(modglob_type).(tg_mut) == MUT_var.
 
diff --git a/theories/type_checker.v b/theories/type_checker.v
index 11433b12..eadefbdf 100644
--- a/theories/type_checker.v
+++ b/theories/type_checker.v
@@ -193,7 +193,9 @@ Fixpoint check_single (C : t_context) (ct : option checker_type) (be : basic_ins
       | BI_vunop op =>
           type_update ts [::T_vec T_v128] [::T_vec T_v128]
       | BI_vbinop op =>
-          type_update ts [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128]
+          if vbinop_valid op then
+            type_update ts [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128]
+          else None
       | BI_vternop op =>
           type_update ts [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128]
       | BI_vtestop tv =>
diff --git a/theories/typing.v b/theories/typing.v
index 9a4fc11c..2aa5b19d 100644
--- a/theories/typing.v
+++ b/theories/typing.v
@@ -208,6 +208,7 @@ Inductive be_typing : t_context -> seq basic_instruction -> instr_type -> Prop :
 | bet_vunop: forall C op,
     be_typing C [::BI_vunop op] (Tf [::T_vec T_v128] [::T_vec T_v128])
 | bet_vbinop: forall C op,
+    vbinop_valid op -> (* Check against laneids for shuffle *)
     be_typing C [::BI_vbinop op] (Tf [::T_vec T_v128; T_vec T_v128] [::T_vec T_v128])
 | bet_vternop: forall C op,
     be_typing C [::BI_vternop op] (Tf [::T_vec T_v128; T_vec T_v128; T_vec T_v128] [::T_vec T_v128])

From df7513410b08f7de97669aaffcc58f5ebebd0a22 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Sun, 17 Aug 2025 10:22:00 -0700
Subject: [PATCH 16/19] readme and changelog update

---
 README.md            | 25 ++++++++++++++++++-------
 changelogs/v2.2.0.md |  7 ++++---
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 1480275f..5acc019a 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,11 @@ A WebAssembly (aka Wasm) formalisation in Coq(Rocq), based on the [official spec
 The quotes from the WebAssembly standard (starting with `std-doc`) are (C) their respective authors. 
 The files located in `src/Parray` are adapted from the Rocq kernel and therefore licensed under GNU LPGL 2.1 - see `src/Parray/LICENSE.LGPL`.
 
-The current project formalises Wasm version 2.0 with the following deltas:
+The current project formalises Wasm version 2.0 with the following additions:
 - [+] Subtyping systems (from the future funcref/GC extension proposals);
-- [+] Tail-call;
-- [-] SIMD.
+- [+] Tail-call.
+
+SIMD execution is implemented via the corresponding evaluation functions in the reference implementations. The Rocq mechanisation uses opauqe opcodes (from the binary format) for parsing the SIMD instructions and handling their execution.
 
 A large part of the old Wasm 1.0 formalisation has been published at [FM'21](https://link.springer.com/chapter/10.1007/978-3-030-90870-6_4), with many additions to the repository since then.
 
@@ -27,7 +28,7 @@ A large part of the old Wasm 1.0 formalisation has been published at [FM'21](htt
 - [x] Interpreter with optimised context representations.
 
 ## Merged
-- [x] Updates for Wasm 2.0 (except SIMD) + subtyping systems + tail-call.
+- [x] Updates for Wasm 2.0 + subtyping systems + tail-call.
 - [x] Validate WasmRef-Coq (conformance tests).
 
 # Program Logic
@@ -70,12 +71,22 @@ Then, run:
 ```bash
 make run_wast
 ```
-All SIMD tests are skipped since the project does not implement SIMD yet. The interpreter is expected to pass all the other core tests (last tested on 30th May 2025):
+The interpreter is expected to pass all the other core tests (last tested on 17th Aug 2025):
 ```bash
-Total passed: 28018/28018 (100.00%)
+Total passed: 54004/54004 (100.00%)
 ```
 
-Running the test suite takes around 1-2 minutes.
+Running the test suite takes around 2-3 minutes.
+
+It is also possible to run a selected subset of the test by filtering the file names. For example,
+```bash
+make run_wast FILTER="simd"
+```
+runs only the SIMD tests, while
+```bash
+make run_wast FILTER="load"
+```
+runs only the test files whose names include `load` (i.e. various memory load instructions both for the Wasm 1.0 values and the v128 values).
 
 Note that tail-call is not part of the standard yet and is therefore not tested.
 
diff --git a/changelogs/v2.2.0.md b/changelogs/v2.2.0.md
index d5f75e0c..f9a95d55 100644
--- a/changelogs/v2.2.0.md
+++ b/changelogs/v2.2.0.md
@@ -1,8 +1,9 @@
 # Release 2.2.0
 
 ## New Features
-- Implemented SIMD execution via eval functions in the reference implementation.
+- Implemented SIMD execution via eval functions in the reference implementation. The extracted runtime now pass the full Wasm 2.0 test suite.
 
-## Refactorings
+## Refactorings and Infrastructural Changes
 - Extraction now extracts Coq's `String.string` to OCaml native `string` instead of `char list`. The signature of various interfaces are updated accordingly.
-- Extraction now extracts Coq's binary integers (N/Z/positive) to ZArith's BigInt instead. The original conversion functions are therefore deprecated. New conversion functions between ZArith's BigInt and OCaml's native int are added. This is to avoid cyclic dependencies in the parametric SIMD implementations.
\ No newline at end of file
+- Extraction now extracts Coq's binary integers (N/Z/positive) to ZArith's BigInt instead. The original conversion functions are therefore deprecated. New conversion functions between ZArith's BigInt and OCaml's native int are added. This is to avoid cyclic dependencies in the parametric SIMD implementations.
+- The testing script `run_wast.sh` and the makefile now support testing a subset of the test suite by passing in a filter argument. For example: `make run_wast FILTER="simd"` runs only the simd tests.
\ No newline at end of file

From 66c72e510c78c0d89d428cd06e746f213a2b26ea Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Sun, 17 Aug 2025 10:35:03 -0700
Subject: [PATCH 17/19] fix proofs

---
 theories/type_preservation.v | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/theories/type_preservation.v b/theories/type_preservation.v
index bb1f2933..89a901d2 100644
--- a/theories/type_preservation.v
+++ b/theories/type_preservation.v
@@ -56,11 +56,22 @@ Proof.
   done.
 Qed.
 
+Lemma v128_extract_lanes_typing: forall sh s vv vs,
+    v128_extract_lanes sh s vv = vs ->
+    List.Forall (fun v => typeof_num v = typeof_shape_unpacked sh) vs.
+Proof.
+  move => sh s vv vs; unfold v128_extract_lanes.
+  destruct sh as [[] | []];
+  destruct s; move => <-; apply List.Forall_map => /=;
+  by apply List.Forall_forall.
+Qed.
+  
 Lemma app_extract_vec_typing: forall sh os n v1,
     typeof_num (app_extract_vec sh os n v1) = typeof_shape_unpacked sh.
 Proof.
-  intros; unfold app_extract_vec => /=.
-  destruct (lookup_N _ _) => //=; by destruct sh as [[] | []] => //=.
+  intros; unfold app_extract_vec.
+  destruct (lookup_N _ _) eqn:Hlookup; last by destruct sh as [[] | []].
+  by eapply Forall_lookup in Hlookup; last by eapply v128_extract_lanes_typing; eauto.
 Qed.
 
 (* It's better to just set `instr_subtyping` opaque and unset it when necessary, since most of the times we do not want to unfold this definition by simpl. But the simpl nomatch method doesn't prevent it from being unfolded for some reason. *)

From 2e111b2ecc40693fefae76893335b87d82eb6fc0 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Mon, 18 Aug 2025 04:45:12 -0700
Subject: [PATCH 18/19] printing, binary format, and cleanups

---
 theories/binary_format_printer.v | 93 +++++++++++++++++++++++++-------
 theories/operations.v            |  3 +-
 theories/pp.v                    | 26 ++++++---
 3 files changed, 95 insertions(+), 27 deletions(-)

diff --git a/theories/binary_format_printer.v b/theories/binary_format_printer.v
index 170ee6b5..043cbc75 100644
--- a/theories/binary_format_printer.v
+++ b/theories/binary_format_printer.v
@@ -1,11 +1,9 @@
 (** Wasm AST to binary.
 Breaks non-determinism ties; see binary_format_spec.v for the spec. *)
-Require Import datatypes_properties numerics common list_extra.
-From compcert Require Integers.
-Require Import Strings.Byte.
-Require leb128.
-Require Import BinNat.
 From mathcomp Require Import ssreflect ssrfun ssrnat ssrbool eqtype seq.
+From Wasm Require Import datatypes_properties numerics common list_extra leb128.
+From compcert Require Integers.
+From Coq Require Import BinNat Strings.Byte.
 
 Definition binary_of_number_type (t: number_type) : byte :=
   match t with
@@ -34,9 +32,12 @@ Definition binary_of_value_type (t : value_type) : byte :=
   | T_bot => x00 (* will not happen *)
   end.
 
-Definition binary_of_u32 (n: BinNums.N) : list byte :=
+Definition binary_of_N (n: BinNums.N) : list byte :=
   leb128.encode_unsigned n.
 
+Definition binary_of_u32 (n: u32) : list byte :=
+  binary_of_N n.
+
 Definition binary_of_u32_nat (n : nat) : list byte :=
   leb128.encode_unsigned (BinNatDef.N.of_nat n).
 
@@ -102,47 +103,101 @@ Proof. exact (x00 :: x00 :: x00 :: nil). Qed.
 
 (* placeholder for vector values, not implemented *)
 Definition binary_of_valvec (v: value_vec) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  match v with
+  | VAL_vec128 vv =>
+      xfd :: x0c :: (List.map byte_of_compcert_byte vv)
+  end.
 
-(* placeholder for vector operations added in 2.0, to be filled in a future update
+(*
+Most vector insturctions are opaquely represented by their binary format opcode in the Rocq formalisation.
 https://webassembly.github.io/spec/core/binary/instructions.html#vector-instructions
 *)
 Definition binary_of_vunop (op: vunop) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd :: binary_of_N op.
 
 Definition binary_of_vbinop (op: vbinop) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  let '(opcode, args) := op in
+  if N.eqb opcode 13%N then (* shuffle *)
+    xfd :: x0d :: binary_of_vec (binary_of_N) args
+  else
+    xfd :: binary_of_N opcode.
 
 Definition binary_of_vternop (op: vternop) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd :: binary_of_N op.
 
 Definition binary_of_vtestop (op: vtestop) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd :: binary_of_N op.
 
 Definition binary_of_vshiftop (op: vshiftop) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd :: binary_of_N op.
 
 Definition binary_of_splat_vec (sh: vshape) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd ::
+  (match sh with
+   | VS_i VSI_8_16 => x0e
+   | VS_i VSI_16_8 => x0f
+   | VS_i VSI_32_4 => x10
+   | VS_i VSI_64_2 => x11
+   | VS_f VSF_32_4 => x12
+   | VS_f VSF_64_2 => x13
+   end
+  ) :: nil.
 
 Definition binary_of_extract_vec (sh: vshape) (s: option sx) (x: laneidx) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd ::
+  (match sh, s with
+   | VS_i VSI_8_16, Some SX_S => x15
+   | VS_i VSI_8_16, Some SX_U => x16
+   | VS_i VSI_16_8, Some SX_S => x18
+   | VS_i VSI_16_8, Some SX_U => x19
+   | VS_i VSI_32_4, None => x1b
+   | VS_i VSI_64_2, None => x1d
+   | VS_f VSF_32_4, None => x1f
+   | VS_f VSF_64_2, None => x21
+   | _, _ => x00 (* will not happen due to well formedness *)
+   end
+  ) :: binary_of_N x.
 
 Definition binary_of_replace_vec (sh: vshape) (x: laneidx) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd ::
+  (match sh with
+   | VS_i VSI_8_16 => x17
+   | VS_i VSI_16_8 => x1a
+   | VS_i VSI_32_4 => x1c
+   | VS_i VSI_64_2 => x1e
+   | VS_f VSF_32_4 => x20
+   | VS_f VSF_64_2 => x22
+   end
+  ) :: binary_of_N x.
   
 Definition binary_of_load_vec (lvarg: load_vec_arg) (marg: memarg) :=
   xfd :: x0c :: (List.repeat x00 16).
 
 Definition binary_of_load_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd :: 
+  (match w with
+  | 8%N => x54 
+  | 16%N => x55
+  | 32%N => x56 
+  | 64%N => x57
+  | _ => x00 (* will not happen due to wellformedness *)
+  end
+  ) :: binary_of_memarg marg ++ binary_of_N x.
 
 (* store_vec_lane and load_vec uses the same args. Maybe it's better to find a new name *)
 Definition binary_of_store_vec (marg: memarg) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd :: x0b :: binary_of_memarg marg.
 
 Definition binary_of_store_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
-  xfd :: x0c :: (List.repeat x00 16).
+  xfd :: 
+  (match w with
+  | 8%N => x58
+  | 16%N => x59
+  | 32%N => x5a 
+  | 64%N => x5b
+  | _ => x00 (* will not happen due to wellformedness *)
+  end
+  ) :: binary_of_memarg marg ++ binary_of_N x.
 
 Fixpoint binary_of_be (be : basic_instruction) : list byte :=
   let binary_of_instrs bes := List.concat (List.map binary_of_be bes) in
diff --git a/theories/operations.v b/theories/operations.v
index 78a4352a..07507196 100644
--- a/theories/operations.v
+++ b/theories/operations.v
@@ -1364,10 +1364,11 @@ Fixpoint split_n (es : seq value) (n : nat) : seq value * seq value :=
     (e :: es', es'')
   end.
 
-(* TODO: eliminate the use of this *)
+#[deprecated(since="wasmcert-coq-v2.2", note="Use serialise_num instead.")]
 Definition expect {A B : Type} (ao : option A) (f : A -> B) (b : B) : B :=
   oapp f b ao.
 
+#[deprecated(since="wasmcert-coq-v2.2", note="Use serialise_num instead.")]
 Definition expect' {A B : Type} (ao : option A) (f : A -> B) (b : B) : B :=
   match ao as ao0 return (ao = ao0) -> B with
   | Some a => (fun _ => f a)
diff --git a/theories/pp.v b/theories/pp.v
index 70ee7918..3d80540f 100644
--- a/theories/pp.v
+++ b/theories/pp.v
@@ -552,24 +552,36 @@ Definition pp_vshape (sh: vshape) :=
 Definition pp_splat_vec (sh: vshape) :=
   "splat " ++ pp_vshape sh.
 
+Definition pp_laneidx (x: laneidx) :=
+  "lane:" ++ pp_N x.
+
 Definition pp_extract_vec (sh: vshape) (s: option sx) (x: laneidx) :=
-  "extract_lane " ++ pp_vshape sh ++ " " ++ pp_sx_o s ++ " lane:" ++ pp_N x.
+  "extract_lane " ++ pp_vshape sh ++ " " ++ pp_sx_o s ++ " " ++ pp_laneidx x.
 
 Definition pp_replace_vec (sh: vshape) (x: laneidx) :=
-  "replace_lane " ++ pp_vshape sh ++ " lane:" ++ pp_N x.
-  
+  "replace_lane " ++ pp_vshape sh ++ " " ++ pp_laneidx x.
+
 Definition pp_load_vec (lvarg: load_vec_arg) (marg: memarg) :=
-  "(not implemented: load_vec)".
+  match lvarg with
+  | LVA_extend (n, m) s =>
+      "load_vec_extend_" ++ pp_N n ++ "x" ++ pp_N m ++ " " ++ pp_memarg marg
+  | LVA_zero ww =>
+      "load_vec_zero_" ++ pp_N ww ++ " " ++ pp_memarg marg
+  | LVA_splat ww =>
+      "load_vec_splat_" ++ pp_N ww ++ " " ++ pp_memarg marg
+  | LVA_none =>
+      " v128.load_vec " ++ pp_memarg marg
+  end.
 
 Definition pp_load_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
-  "(not implemented: load_vec_lane)".
+  "load_vec_lane_" ++ pp_N w ++ " " ++ pp_memarg marg ++ " " ++ pp_laneidx x.
 
 (* store_vec_lane and load_vec uses the same args. Maybe it's better to find a new name *)
 Definition pp_store_vec (marg: memarg) :=
-  "(not implemented: store_vec)".
+  "v128.store " ++ pp_memarg marg.
 
 Definition pp_store_vec_lane (w: vwidth) (marg: memarg) (x: laneidx) :=
-  "(not implemented: store_vec_lane)".
+  "store_vec_lane_" ++ pp_N w ++ " " ++ pp_memarg marg ++ " " ++ pp_laneidx x.
 
 Fixpoint pp_basic_instruction (i : indentation) (be : basic_instruction) : string :=
   let pp_basic_instructions bes i :=

From 483a0ee1c499dcf4a208c3efb08e1b3f7dbd7e37 Mon Sep 17 00:00:00 2001
From: raoxiaojia <raoxiaojia@gmail.com>
Date: Mon, 18 Aug 2025 04:52:34 -0700
Subject: [PATCH 19/19] more cleanups

---
 theories/binary_format_parser.v |  42 --------
 theories/datatypes.v            | 168 ++------------------------------
 2 files changed, 7 insertions(+), 203 deletions(-)

diff --git a/theories/binary_format_parser.v b/theories/binary_format_parser.v
index 9173ed5f..fb66dab3 100644
--- a/theories/binary_format_parser.v
+++ b/theories/binary_format_parser.v
@@ -504,48 +504,6 @@ Definition parse_f64_const {n} : be_parser n :=
 Definition parse_v128_const {n} : be_parser n :=
   exact_byte xfd &> assert_u32 12%N &> ((fun x => BI_const_vec (VAL_vec128 x)) <$> parse_16bytes).
 
-(*
-Inductive vvunop : Set :=
-  | VVU_not
-  .
-
-Inductive viunop : Set :=
-  | VUOI_abs
-  | VUOI_neg
-  | VUOI_popcnt (* has shape constraints: i8x16 *)
-  .
-
-Inductive vfunop : Set :=
-  | VUOF_abs
-  | VUOF_neg
-  | VUOF_sqrt
-  | VUOF_ceil
-  | VUOF_floor
-  | VUOF_trunc
-  | VUOF_nearest
-  .
-
-(* Indicate whether a cvtop has a zero flag. *)
-Inductive vec_zero : Set :=
-  | VZ_zero
-  .
-  
-Inductive vcvtop : Set :=
-  | VCVT_extend : vec_half -> sx -> vcvtop
-  | VCVT_trunc_sat : sx -> option vec_zero -> vcvtop
-  | VCVT_convert : option vec_half -> sx -> vcvtop
-  | VCVT_demote : vec_zero -> vcvtop (* Superficial tag since zero has to be present for demote; only available on f64x2 *)
-  | VCVT_promote (* only available on LOW half and f32x4 *)
-  .
-  
-Inductive vunop : Set :=
-  | VI_unop: viunop -> vunop
-  | VF_unop: vfunop -> vunop
-  | VV_unop: vvunop -> vunop
-  | V_cvtop: vcvtop -> vunop
-  | V_extadd_pairwise: sx -> vunop (* i16 and i32 only *)
-*)
-
 (* Slightly stupid, but no better solution without defining the concrete constructors *)
 
 Section Opcode_classes.
diff --git a/theories/datatypes.v b/theories/datatypes.v
index ce8022db..9a4aee3b 100644
--- a/theories/datatypes.v
+++ b/theories/datatypes.v
@@ -417,167 +417,13 @@ Inductive packed_type : Set := (* tp *)
   | Tp_i32
   .
 
-(** SIMD Definitions and Instructions **)
-  (*
-Inductive vshape_i: Set :=
-  | VSI_8_16
-  | VSI_16_8
-  | VSI_32_4
-  | VSI_64_2
-  .
-  
-Inductive vshape_f: Set :=
-  | VSF_32_4
-  | VSF_64_2
-  .
-  
-Inductive vshape : Set := (* shape *)
-  | VS_i: vshape_i -> vshape
-  | VS_f: vshape_f -> vshape
-  .
-
-Inductive vec_half : Set :=
-  | VH_low
-  | VH_high
-  .
-
-Definition laneidx : Set := u8. 
-
-Inductive vvunop : Set :=
-  | VVU_not
-  .
-
-Inductive viunop : Set :=
-  | VUOI_abs
-  | VUOI_neg
-  | VUOI_popcnt (* has shape constraints: i8x16 *)
-  .
-
-Inductive vfunop : Set :=
-  | VUOF_abs
-  | VUOF_neg
-  | VUOF_sqrt
-  | VUOF_ceil
-  | VUOF_floor
-  | VUOF_trunc
-  | VUOF_nearest
-  .
-
-(* Indicate whether a cvtop has a zero flag. *)
-Inductive vec_zero : Set :=
-  | VZ_zero
-  .
-  
-Inductive vcvtop : Set :=
-  | VCVT_extend : vec_half -> sx -> vcvtop
-  | VCVT_trunc_sat : sx -> option vec_zero -> vcvtop
-  | VCVT_convert : option vec_half -> sx -> vcvtop
-  | VCVT_demote : vec_zero -> vcvtop (* Superficial tag since zero has to be present for demote; only available on f64x2 *)
-  | VCVT_promote (* only available on LOW half and f32x4 *)
-  .
-  
-Inductive vunop : Set :=
-  | VI_unop: viunop -> vunop
-  | VF_unop: vfunop -> vunop
-  | VV_unop: vvunop -> vunop
-  | V_cvtop: vcvtop -> vunop
-  | V_extadd_pairwise: sx -> vunop (* i16 and i32 only *)
-  .
-  
-Inductive vvbinop : Set :=
-  | VVB_and
-  | VVB_andnot
-  | VVB_or
-  | VVB_xor
-  .
-  
-Inductive vibinop : Set :=
-  | VBOI_add
-  | VBOI_sub
-  | VBOI_add_sat: sx -> vibinop (* i8 and i16 only *)
-  | VBOI_sub_sat: sx -> vibinop (* i8 and i16 only *)
-  | VBOI_mul (* i16/i32/i64 only *)
-  | VBOI_avgr (* unsigned only, i8/i16 only *)
-  | VBOI_q15mulr_sat (* signed only, i8/i16 only *)
-  | VBOI_min: sx -> vibinop (* i8/i16/i32 only *)
-  | VBOI_max: sx -> vibinop (* i8/i16/i32 only *)
-  .
-
-Inductive vfbinop : Set :=
-  | VBOF_add
-  | VBOF_sub
-  | VBOF_mul
-  | VBOF_div
-  | VBOF_min
-  | VBOF_max
-  | VBOF_pmin
-  | VBOF_pmax
-  .
-
-Inductive vextbinop: Set :=
-  | VBOE_extmul: vec_half -> sx -> vextbinop
-  | VBOE_dot (* signed only, i32 only *)
-.
-  
-Inductive vbinop : Set :=
-  | VI_binop: vibinop -> vbinop
-  | VF_binop: vfbinop -> vbinop
-  | VV_binop: vvbinop -> vbinop
-  | VE_binop: vextbinop -> vbinop
-  | V_narrow: vshape_i -> sx -> vbinop (* resulting width needs to be 2x original and at most i32 *)
-  | V_shuffle: list laneidx -> vbinop (* i8x16 only *)
-  | V_swizzle (* i8x16 only *)
-  .
-
-(* Technically this is vvternop. But this is the only ternary operation. *)
-Inductive vternop : Set :=
-  | VT_bitselect
-  .
-
-Inductive vvtestop : Set :=
-  | VVT_any_true
-  .
-  
-Inductive vitestop : Set :=
-  | VIT_all_true
-  .
-
-Inductive vtestop : Set :=
-  | VI_testop: vitestop -> vtestop
-  | VV_testop: vvtestop -> vtestop
-  | V_bitmask
-  .
-
-Inductive virelop : Set :=
-  | VIR_eq
-  | VIR_ne
-  | VIR_lt: sx -> virelop (* not available on i64x2 unsigned *)
-  | VIR_gt: sx -> virelop (* not available on i64x2 unsigned *)
-  | VIR_le: sx -> virelop (* not available on i64x2 unsigned *)
-  | VIR_ge: sx -> virelop (* not available on i64x2 unsigned *)
-  .
-  
-Inductive vfrelop : Set :=
-  | VFR_eq
-  | VFR_ne
-  | VFR_lt: sx -> vfrelop 
-  | VFR_gt: sx -> vfrelop
-  | VFR_le: sx -> vfrelop
-  | VFR_ge: sx -> vfrelop
-  .
-
-Inductive vrelop : Set :=
-  | VI_relop: virelop -> vrelop
-  | VF_relop: vfrelop -> vrelop
-  .
-
-(* Technically vishiftop, but shifts are only available for integers *)
-Inductive vshiftop : Set :=
-  | VS_shl
-  | VS_shr : sx -> vshiftop
-  .
+(** SIMD Interface **)
+(* SIMD instructions are grouped according to their type signatures. The Rocq
+formalisation skips the concrete AST of them but instead uses the binary format
+to represent the insturctions via an 'opcode'. Execution of these instructions
+is delegated to the reference implementation. Check src/SIMD_ops.ml for the
+interface.
 *)
-  
 Inductive vshape_i: Set :=
   | VSI_8_16
   | VSI_16_8
@@ -604,7 +450,7 @@ Definition laneidx : Set := u8.
 
 Definition vunop := N.
 
-(* Shuffle is included in this type, which needs to encode an additional list of laneidx. *)
+(* Shuffle is also a binop, but it needs an additional list of laneidx. *)
 Definition vbinop : Type := N * list N.
 
 Definition vternop := N.