[llvm] 3572b21 - [WebAssembly] Lower extend v16i8 to v16i32 (#188936)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 30 00:32:46 PDT 2026
Author: Sam Parker
Date: 2026-03-30T08:32:41+01:00
New Revision: 3572b21dec19f509f598b456d8819a62ddbf7fac
URL: https://github.com/llvm/llvm-project/commit/3572b21dec19f509f598b456d8819a62ddbf7fac
DIFF: https://github.com/llvm/llvm-project/commit/3572b21dec19f509f598b456d8819a62ddbf7fac.diff
LOG: [WebAssembly] Lower extend v16i8 to v16i32 (#188936)
Split the input vector with an extend_low and high and then split the
results again with extend_low and high for a total of 6 instructions.
This removes 3 shuffles and a couple of extends.
Added:
Modified:
llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 874ea2be79a33..913a5bb22cb41 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2992,6 +2992,29 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
assert(N->getOpcode() == ISD::SIGN_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND);
+ EVT ResVT = N->getValueType(0);
+ bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND;
+ SDLoc DL(N);
+
+ if (ResVT == MVT::v16i32 && N->getOperand(0)->getValueType(0) == MVT::v16i8) {
+ // Use a tree of extend low/high to split and extend the input in two
+ // layers to avoid doing several shuffles and even more extends.
+ unsigned LowOp =
+ IsSext ? WebAssemblyISD::EXTEND_LOW_S : WebAssemblyISD::EXTEND_LOW_U;
+ unsigned HighOp =
+ IsSext ? WebAssemblyISD::EXTEND_HIGH_S : WebAssemblyISD::EXTEND_HIGH_U;
+ SDValue Input = N->getOperand(0);
+ SDValue LowHalf = DAG.getNode(LowOp, DL, MVT::v8i16, Input);
+ SDValue HighHalf = DAG.getNode(HighOp, DL, MVT::v8i16, Input);
+ SDValue Subvectors[] = {
+ DAG.getNode(LowOp, DL, MVT::v4i32, LowHalf),
+ DAG.getNode(HighOp, DL, MVT::v4i32, LowHalf),
+ DAG.getNode(LowOp, DL, MVT::v4i32, HighHalf),
+ DAG.getNode(HighOp, DL, MVT::v4i32, HighHalf),
+ };
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Subvectors);
+ }
+
// Combine ({s,z}ext (extract_subvector src, i)) into a widening operation if
// possible before the extract_subvector can be expanded.
auto Extract = N->getOperand(0);
@@ -3005,7 +3028,6 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// Only v8i8, v4i16, and v2i32 extracts can be widened, and only if the
// extracted subvector is the low or high half of its source.
- EVT ResVT = N->getValueType(0);
if (ResVT == MVT::v8i16) {
if (Extract.getValueType() != MVT::v8i8 ||
Source.getValueType() != MVT::v16i8 || (Index != 0 && Index != 8))
@@ -3022,7 +3044,6 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return SDValue();
}
- bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND;
bool IsLow = Index == 0;
unsigned Op = IsSext ? (IsLow ? WebAssemblyISD::EXTEND_LOW_S
@@ -3030,7 +3051,7 @@ performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
: (IsLow ? WebAssemblyISD::EXTEND_LOW_U
: WebAssemblyISD::EXTEND_HIGH_U);
- return DAG.getNode(Op, SDLoc(N), ResVT, Source);
+ return DAG.getNode(Op, DL, ResVT, Source);
}
static SDValue
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index e3f02e17cd49e..1bd3a6b950944 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -167,6 +167,9 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
{ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4},
{ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4},
{ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4},
+ // 6x extend_low, extend_high
+ {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
+ {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
// shuffle
{ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2},
{ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4},
diff --git a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
index 91cd3dd1ca4e7..13fabb75e09f7 100644
--- a/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
+++ b/llvm/test/CodeGen/WebAssembly/int-mac-reduction-loops.ll
@@ -64,68 +64,48 @@ define hidden i32 @i32_mac_u8_s8(ptr nocapture noundef readonly %a, ptr nocaptur
; MAX-BANDWIDTH: loop
; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_u
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.mul
; MAX-BANDWIDTH: i32x4.add
; RELAXED-MAX-BANDWIDTH: loop
; RELAXED-MAX-BANDWIDTH: v128.load
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_high_i8x16_u
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
; RELAXED-MAX-BANDWIDTH: v128.load
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; RELAXED-MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_u
; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; RELAXED-MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_u
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_u
-; RELAXED-MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; RELAXED-MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; RELAXED-MAX-BANDWIDTH: i32x4.mul
; RELAXED-MAX-BANDWIDTH: i32x4.add
diff --git a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
index a599f4653f323..8fcc7c8b4448d 100644
--- a/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
+++ b/llvm/test/CodeGen/WebAssembly/partial-reduce-accumulate.ll
@@ -150,31 +150,26 @@ define hidden i32 @accumulate_add_s8_s16(ptr noundef readonly %a, ptr noundef r
; MAX-BANDWIDTH: loop
; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: v128.load
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
-; MAX-BANDWIDTH: i32x4.add
entry:
%cmp8.not = icmp eq i32 %N, 0
br i1 %cmp8.not, label %for.cond.cleanup, label %for.body
@@ -511,29 +506,25 @@ define hidden i32 @accumulate_sub_s8_s8(ptr noundef readonly %a, ptr noundef re
; MAX-BANDWIDTH: loop
; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: v128.load
-; MAX-BANDWIDTH: i8x16.shuffle 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i16x8.extend_high_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.sub
-; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle 8, 9, 10, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.sub
-; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
-; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
; MAX-BANDWIDTH: i32x4.add
-; MAX-BANDWIDTH: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; MAX-BANDWIDTH: i16x8.extend_low_i8x16_s
+; MAX-BANDWIDTH: i32x4.extend_high_i16x8_s
+; MAX-BANDWIDTH: i32x4.sub
+; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
+; MAX-BANDWIDTH: i32x4.add
; MAX-BANDWIDTH: i32x4.extend_low_i16x8_s
; MAX-BANDWIDTH: i32x4.sub
entry:
diff --git a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
index 25dc1efedfb95..e155ffb31d5cd 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
@@ -140,36 +140,30 @@ define <16 x i32> @sext_zext_mul_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: sext_zext_mul_v16i8:
; CHECK: .functype sext_zext_mul_v16i8 (i32, v128, v128) -> ()
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: i16x8.extend_low_i8x16_s $push2=, $1
-; CHECK-NEXT: i32x4.extend_low_i16x8_s $push3=, $pop2
-; CHECK-NEXT: i16x8.extend_low_i8x16_u $push0=, $1
-; CHECK-NEXT: i32x4.extend_low_i16x8_u $push1=, $pop0
-; CHECK-NEXT: i32x4.mul $push4=, $pop3, $pop1
-; CHECK-NEXT: v128.store 0($0), $pop4
-; CHECK-NEXT: i8x16.shuffle $push25=, $1, $1, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: local.tee $push24=, $3=, $pop25
-; CHECK-NEXT: i16x8.extend_low_i8x16_s $push7=, $pop24
-; CHECK-NEXT: i32x4.extend_low_i16x8_s $push8=, $pop7
-; CHECK-NEXT: i16x8.extend_low_i8x16_u $push5=, $3
-; CHECK-NEXT: i32x4.extend_low_i16x8_u $push6=, $pop5
-; CHECK-NEXT: i32x4.mul $push9=, $pop8, $pop6
-; CHECK-NEXT: v128.store 48($0), $pop9
-; CHECK-NEXT: i8x16.shuffle $push23=, $1, $1, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: local.tee $push22=, $3=, $pop23
-; CHECK-NEXT: i16x8.extend_low_i8x16_s $push12=, $pop22
-; CHECK-NEXT: i32x4.extend_low_i16x8_s $push13=, $pop12
-; CHECK-NEXT: i16x8.extend_low_i8x16_u $push10=, $3
-; CHECK-NEXT: i32x4.extend_low_i16x8_u $push11=, $pop10
-; CHECK-NEXT: i32x4.mul $push14=, $pop13, $pop11
-; CHECK-NEXT: v128.store 32($0), $pop14
-; CHECK-NEXT: i8x16.shuffle $push21=, $1, $1, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: local.tee $push20=, $1=, $pop21
-; CHECK-NEXT: i16x8.extend_low_i8x16_s $push17=, $pop20
-; CHECK-NEXT: i32x4.extend_low_i16x8_s $push18=, $pop17
-; CHECK-NEXT: i16x8.extend_low_i8x16_u $push15=, $1
-; CHECK-NEXT: i32x4.extend_low_i16x8_u $push16=, $pop15
-; CHECK-NEXT: i32x4.mul $push19=, $pop18, $pop16
-; CHECK-NEXT: v128.store 16($0), $pop19
+; CHECK-NEXT: i16x8.extend_high_i8x16_s $push19=, $1
+; CHECK-NEXT: local.tee $push18=, $4=, $pop19
+; CHECK-NEXT: i32x4.extend_high_i16x8_s $push1=, $pop18
+; CHECK-NEXT: i16x8.extend_high_i8x16_u $push17=, $1
+; CHECK-NEXT: local.tee $push16=, $3=, $pop17
+; CHECK-NEXT: i32x4.extend_high_i16x8_u $push0=, $pop16
+; CHECK-NEXT: i32x4.mul $push2=, $pop1, $pop0
+; CHECK-NEXT: v128.store 48($0), $pop2
+; CHECK-NEXT: i32x4.extend_low_i16x8_s $push4=, $4
+; CHECK-NEXT: i32x4.extend_low_i16x8_u $push3=, $3
+; CHECK-NEXT: i32x4.mul $push5=, $pop4, $pop3
+; CHECK-NEXT: v128.store 32($0), $pop5
+; CHECK-NEXT: i16x8.extend_low_i8x16_s $push15=, $1
+; CHECK-NEXT: local.tee $push14=, $4=, $pop15
+; CHECK-NEXT: i32x4.extend_high_i16x8_s $push7=, $pop14
+; CHECK-NEXT: i16x8.extend_low_i8x16_u $push13=, $1
+; CHECK-NEXT: local.tee $push12=, $1=, $pop13
+; CHECK-NEXT: i32x4.extend_high_i16x8_u $push6=, $pop12
+; CHECK-NEXT: i32x4.mul $push8=, $pop7, $pop6
+; CHECK-NEXT: v128.store 16($0), $pop8
+; CHECK-NEXT: i32x4.extend_low_i16x8_s $push10=, $4
+; CHECK-NEXT: i32x4.extend_low_i16x8_u $push9=, $1
+; CHECK-NEXT: i32x4.mul $push11=, $pop10, $pop9
+; CHECK-NEXT: v128.store 0($0), $pop11
; CHECK-NEXT: return
%wide.a = sext <16 x i8> %a to <16 x i32>
%wide.b = zext <16 x i8> %a to <16 x i32>
More information about the llvm-commits
mailing list