[llvm] 3572b21 - [WebAssembly] Lower extend v16i8 to v16i32 (#188936)

Mon Mar 30 00:32:46 PDT 2026

Author: Sam Parker
Date: 2026-03-30T08:32:41+01:00
New Revision: 3572b21dec19f509f598b456d8819a62ddbf7fac

URL: https://github.com/llvm/llvm-project/commit/3572b21dec19f509f598b456d8819a62ddbf7fac
DIFF: https://github.com/llvm/llvm-project/commit/3572b21dec19f509f598b456d8819a62ddbf7fac.diff

LOG: [WebAssembly] Lower extend v16i8 to v16i32 (#188936)

Split the input vector with an extend_low and high and then split the
results again with extend_low and high for a total of 6 instructions.
This removes 3 shuffles and a couple of extends.

Added: 
    

Modified: 
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
    llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
    llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
    llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
    llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 874ea2be79a33..913a5bb22cb41 100644

--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2992,6 +2992,29 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   assert(N->getOpcode() == ISD::SIGN_EXTEND ||
          N->getOpcode() == ISD::ZERO_EXTEND);
 
+  EVT ResVT = N->getValueType(0);
+  bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND;
+  SDLoc DL(N);
+
+  if (ResVT == MVT::v16i32 && N->getOperand(0)->getValueType(0) == MVT::v16i8) {
+    // Use a tree of extend low/high to split and extend the input in two
+    // layers to avoid doing several shuffles and even more extends.
+    unsigned LowOp =
+        IsSext ? WebAssemblyISD::EXTEND_LOW_S : WebAssemblyISD::EXTEND_LOW_U;
+    unsigned HighOp =
+        IsSext ? WebAssemblyISD::EXTEND_HIGH_S : WebAssemblyISD::EXTEND_HIGH_U;
+    SDValue Input = N->getOperand(0);
+    SDValue LowHalf = DAG.getNode(LowOp, DL, MVT::v8i16, Input);
+    SDValue HighHalf = DAG.getNode(HighOp, DL, MVT::v8i16, Input);
+    SDValue Subvectors[] = {
+        DAG.getNode(LowOp, DL, MVT::v4i32, LowHalf),
+        DAG.getNode(HighOp, DL, MVT::v4i32, LowHalf),
+        DAG.getNode(LowOp, DL, MVT::v4i32, HighHalf),
+        DAG.getNode(HighOp, DL, MVT::v4i32, HighHalf),
+    };
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Subvectors);
+  }
+
   // Combine ({s,z}ext (extract_subvector src, i)) into a widening operation if
   // possible before the extract_subvector can be expanded.
   auto Extract = N->getOperand(0);
@@ -3005,7 +3028,6 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
 
   // Only v8i8, v4i16, and v2i32 extracts can be widened, and only if the
   // extracted subvector is the low or high half of its source.
-  EVT ResVT = N->getValueType(0);
   if (ResVT == MVT::v8i16) {
     if (Extract.getValueType() != MVT::v8i8 ||
         Source.getValueType() != MVT::v16i8 || (Index != 0 && Index != 8))
@@ -3022,7 +3044,6 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
     return SDValue();
   }
 
-  bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND;
   bool IsLow = Index == 0;
 
   unsigned Op = IsSext ? (IsLow ? WebAssemblyISD::EXTEND_LOW_S
@@ -3030,7 +3051,7 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
                        : (IsLow ? WebAssemblyISD::EXTEND_LOW_U
                                 : WebAssemblyISD::EXTEND_HIGH_U);
 
-  return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+  return DAG.getNode(Op, DL, ResVT, Source);
 }
 
 static SDValue

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index e3f02e17cd49e..1bd3a6b950944 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -167,6 +167,9 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
       {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4},
       {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4},
       {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4},
+      // 6x extend_low, extend_high
+      {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
+      {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
       // shuffle
       {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2},
       {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4},

diff  --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 91cd3dd1ca4e7..13fabb75e09f7 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -64,68 +64,48 @@ define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocaptur
 
 ; MAX-BANDWIDTH: loop
 ; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
 ; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.mul
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.mul
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
 ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.mul
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.mul
 ; MAX-BANDWIDTH: i32x4.add
 
 ; RELAXED-MAX-BANDWIDTH: loop
 ; RELAXED-MAX-BANDWIDTH: v128.load
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_high_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
 ; RELAXED-MAX-BANDWIDTH: v128.load
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; RELAXED-MAX-BANDWIDTH: i32x4.mul
 ; RELAXED-MAX-BANDWIDTH: i32x4.add
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
 ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; RELAXED-MAX-BANDWIDTH: i32x4.mul
 ; RELAXED-MAX-BANDWIDTH: i32x4.add
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle   4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
 ; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; RELAXED-MAX-BANDWIDTH: i32x4.mul
 ; RELAXED-MAX-BANDWIDTH: i32x4.add
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
 ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; RELAXED-MAX-BANDWIDTH: i32x4.mul
 ; RELAXED-MAX-BANDWIDTH: i32x4.add

diff  --git a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
index a599f4653f323..8fcc7c8b4448d 100644
--- a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
+++ b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
@@ -150,31 +150,26 @@ define hidden i32 @accumulate_add_s8_s16(ptr noundef readonly  %a, ptr noundef r
 
 ; MAX-BANDWIDTH: loop
 ; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
-; MAX-BANDWIDTH: i32x4.add
 entry:
   %cmp8.not = icmp eq i32 %N, 0
   br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -511,29 +506,25 @@ define hidden i32 @accumulate_sub_s8_s8(ptr noundef readonly  %a, ptr noundef re
 
 ; MAX-BANDWIDTH: loop
 ; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle	12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.sub
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle	8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.sub
-; MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
 ; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle	4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 ; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; MAX-BANDWIDTH: i32x4.sub
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
 ; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
 ; MAX-BANDWIDTH: i32x4.sub
 entry:

diff  --git a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
index 25dc1efedfb95..e155ffb31d5cd 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
@@ -140,36 +140,30 @@ define <16 x i32> @sext_zext_mul_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: sext_zext_mul_v16i8:
 ; CHECK:         .functype sext_zext_mul_v16i8 (i32, v128, v128) -> ()
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    i16x8.extend_low_i8x16_s $push2=, $1
-; CHECK-NEXT:    i32x4.extend_low_i16x8_s $push3=, $pop2
-; CHECK-NEXT:    i16x8.extend_low_i8x16_u $push0=, $1
-; CHECK-NEXT:    i32x4.extend_low_i16x8_u $push1=, $pop0
-; CHECK-NEXT:    i32x4.mul $push4=, $pop3, $pop1
-; CHECK-NEXT:    v128.store 0($0), $pop4
-; CHECK-NEXT:    i8x16.shuffle $push25=, $1, $1, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT:    local.tee $push24=, $3=, $pop25
-; CHECK-NEXT:    i16x8.extend_low_i8x16_s $push7=, $pop24
-; CHECK-NEXT:    i32x4.extend_low_i16x8_s $push8=, $pop7
-; CHECK-NEXT:    i16x8.extend_low_i8x16_u $push5=, $3
-; CHECK-NEXT:    i32x4.extend_low_i16x8_u $push6=, $pop5
-; CHECK-NEXT:    i32x4.mul $push9=, $pop8, $pop6
-; CHECK-NEXT:    v128.store 48($0), $pop9
-; CHECK-NEXT:    i8x16.shuffle $push23=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT:    local.tee $push22=, $3=, $pop23
-; CHECK-NEXT:    i16x8.extend_low_i8x16_s $push12=, $pop22
-; CHECK-NEXT:    i32x4.extend_low_i16x8_s $push13=, $pop12
-; CHECK-NEXT:    i16x8.extend_low_i8x16_u $push10=, $3
-; CHECK-NEXT:    i32x4.extend_low_i16x8_u $push11=, $pop10
-; CHECK-NEXT:    i32x4.mul $push14=, $pop13, $pop11
-; CHECK-NEXT:    v128.store 32($0), $pop14
-; CHECK-NEXT:    i8x16.shuffle $push21=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT:    local.tee $push20=, $1=, $pop21
-; CHECK-NEXT:    i16x8.extend_low_i8x16_s $push17=, $pop20
-; CHECK-NEXT:    i32x4.extend_low_i16x8_s $push18=, $pop17
-; CHECK-NEXT:    i16x8.extend_low_i8x16_u $push15=, $1
-; CHECK-NEXT:    i32x4.extend_low_i16x8_u $push16=, $pop15
-; CHECK-NEXT:    i32x4.mul $push19=, $pop18, $pop16
-; CHECK-NEXT:    v128.store 16($0), $pop19
+; CHECK-NEXT:    i16x8.extend_high_i8x16_s $push19=, $1
+; CHECK-NEXT:    local.tee $push18=, $4=, $pop19
+; CHECK-NEXT:    i32x4.extend_high_i16x8_s $push1=, $pop18
+; CHECK-NEXT:    i16x8.extend_high_i8x16_u $push17=, $1
+; CHECK-NEXT:    local.tee $push16=, $3=, $pop17
+; CHECK-NEXT:    i32x4.extend_high_i16x8_u $push0=, $pop16
+; CHECK-NEXT:    i32x4.mul $push2=, $pop1, $pop0
+; CHECK-NEXT:    v128.store 48($0), $pop2
+; CHECK-NEXT:    i32x4.extend_low_i16x8_s $push4=, $4
+; CHECK-NEXT:    i32x4.extend_low_i16x8_u $push3=, $3
+; CHECK-NEXT:    i32x4.mul $push5=, $pop4, $pop3
+; CHECK-NEXT:    v128.store 32($0), $pop5
+; CHECK-NEXT:    i16x8.extend_low_i8x16_s $push15=, $1
+; CHECK-NEXT:    local.tee $push14=, $4=, $pop15
+; CHECK-NEXT:    i32x4.extend_high_i16x8_s $push7=, $pop14
+; CHECK-NEXT:    i16x8.extend_low_i8x16_u $push13=, $1
+; CHECK-NEXT:    local.tee $push12=, $1=, $pop13
+; CHECK-NEXT:    i32x4.extend_high_i16x8_u $push6=, $pop12
+; CHECK-NEXT:    i32x4.mul $push8=, $pop7, $pop6
+; CHECK-NEXT:    v128.store 16($0), $pop8
+; CHECK-NEXT:    i32x4.extend_low_i16x8_s $push10=, $4
+; CHECK-NEXT:    i32x4.extend_low_i16x8_u $push9=, $1
+; CHECK-NEXT:    i32x4.mul $push11=, $pop10, $pop9
+; CHECK-NEXT:    v128.store 0($0), $pop11
 ; CHECK-NEXT:    return
   %wide.a = sext <16 x i8> %a to <16 x i32>
   %wide.b = zext <16 x i8> %a to <16 x i32>