[llvm] [WebAssembly] v16i8 mul support (PR #150209)
Sam Parker via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 23 04:17:21 PDT 2025
https://github.com/sparker-arm created https://github.com/llvm/llvm-project/pull/150209
During target DAG combine, use two i16x8.extmul_low_i8x16 and a shuffle for v16i8 mul.
On my AArch64 machine, using V8, I observe a 3.14% geomean improvement across 65 benchmarks, including: 9.2% for spec2017.x264, 6% for libyuv and 1.8% for ncnn.
>From ab6b746ea3e756ec4d85d53923399c56a187b1fc Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker at arm.com>
Date: Wed, 23 Jul 2025 12:05:49 +0100
Subject: [PATCH] [WebAssembly] v16i8 mul support
During target DAG combine, use two i16x8.extmul_low_i8x16 and a
shuffle for v16i8 mul.
On my AArch64 machine, using V8, I observe a 3.14% geomean
improvement across 65 benchmarks, including: 9.2% for spec2017.x264,
6% for libyuv and 1.8% for ncnn.
---
.../WebAssembly/WebAssemblyISelLowering.cpp | 45 +++++-
llvm/test/CodeGen/WebAssembly/simd-arith.ll | 136 +-----------------
.../test/CodeGen/WebAssembly/vector-reduce.ll | 56 +++-----
3 files changed, 71 insertions(+), 166 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 09b8864dfd7e9..43aa37a25f882 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3435,8 +3435,7 @@ static SDValue performSETCCCombine(SDNode *N,
return SDValue();
}
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
- assert(N->getOpcode() == ISD::MUL);
+static SDValue TryWideExtMulCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (VT != MVT::v8i32 && VT != MVT::v16i32)
return SDValue();
@@ -3522,6 +3521,46 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue performMulCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(N->getOpcode() == ISD::MUL);
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ if (auto Res = TryWideExtMulCombine(N, DCI.DAG))
+ return Res;
+
+ // We don't natively support v16i8 mul, but we do support v8i16 so split the
+ // inputs and extend them to v8i16. Only do this before legalization in case
+ // a narrow vector is widened and may be simplified later.
+ if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+ return SDValue();
+
+ SDLoc DL(N);
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue LowLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
+ SDValue HighLHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
+ SDValue LowRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
+ SDValue HighRHS =
+ DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
+
+ SDValue MulLow =
+ DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
+ SDValue MulHigh = DAG.getBitcast(
+ VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
+
+ // Take the low byte of each lane.
+ return DAG.getVectorShuffle(
+ VT, DL, MulLow, MulHigh,
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3556,6 +3595,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performLowerPartialReduction(N, DCI.DAG);
}
case ISD::MUL:
- return performMulCombine(N, DCI.DAG);
+ return performMulCombine(N, DCI);
}
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index e3607e12bf530..36637e1d555bd 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -199,139 +199,17 @@ define <16 x i8> @mul_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SIMD128-LABEL: mul_v16i8:
; SIMD128: .functype mul_v16i8 (v128, v128) -> (v128)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 0
-; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 0
-; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT: i8x16.splat $push6=, $pop5
-; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 1
-; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 1
-; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT: i8x16.replace_lane $push7=, $pop6, 1, $pop2
-; SIMD128-NEXT: i8x16.extract_lane_u $push9=, $0, 2
-; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $1, 2
-; SIMD128-NEXT: i32.mul $push10=, $pop9, $pop8
-; SIMD128-NEXT: i8x16.replace_lane $push11=, $pop7, 2, $pop10
-; SIMD128-NEXT: i8x16.extract_lane_u $push13=, $0, 3
-; SIMD128-NEXT: i8x16.extract_lane_u $push12=, $1, 3
-; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop12
-; SIMD128-NEXT: i8x16.replace_lane $push15=, $pop11, 3, $pop14
-; SIMD128-NEXT: i8x16.extract_lane_u $push17=, $0, 4
-; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $1, 4
-; SIMD128-NEXT: i32.mul $push18=, $pop17, $pop16
-; SIMD128-NEXT: i8x16.replace_lane $push19=, $pop15, 4, $pop18
-; SIMD128-NEXT: i8x16.extract_lane_u $push21=, $0, 5
-; SIMD128-NEXT: i8x16.extract_lane_u $push20=, $1, 5
-; SIMD128-NEXT: i32.mul $push22=, $pop21, $pop20
-; SIMD128-NEXT: i8x16.replace_lane $push23=, $pop19, 5, $pop22
-; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $0, 6
-; SIMD128-NEXT: i8x16.extract_lane_u $push24=, $1, 6
-; SIMD128-NEXT: i32.mul $push26=, $pop25, $pop24
-; SIMD128-NEXT: i8x16.replace_lane $push27=, $pop23, 6, $pop26
-; SIMD128-NEXT: i8x16.extract_lane_u $push29=, $0, 7
-; SIMD128-NEXT: i8x16.extract_lane_u $push28=, $1, 7
-; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop28
-; SIMD128-NEXT: i8x16.replace_lane $push31=, $pop27, 7, $pop30
-; SIMD128-NEXT: i8x16.extract_lane_u $push33=, $0, 8
-; SIMD128-NEXT: i8x16.extract_lane_u $push32=, $1, 8
-; SIMD128-NEXT: i32.mul $push34=, $pop33, $pop32
-; SIMD128-NEXT: i8x16.replace_lane $push35=, $pop31, 8, $pop34
-; SIMD128-NEXT: i8x16.extract_lane_u $push37=, $0, 9
-; SIMD128-NEXT: i8x16.extract_lane_u $push36=, $1, 9
-; SIMD128-NEXT: i32.mul $push38=, $pop37, $pop36
-; SIMD128-NEXT: i8x16.replace_lane $push39=, $pop35, 9, $pop38
-; SIMD128-NEXT: i8x16.extract_lane_u $push41=, $0, 10
-; SIMD128-NEXT: i8x16.extract_lane_u $push40=, $1, 10
-; SIMD128-NEXT: i32.mul $push42=, $pop41, $pop40
-; SIMD128-NEXT: i8x16.replace_lane $push43=, $pop39, 10, $pop42
-; SIMD128-NEXT: i8x16.extract_lane_u $push45=, $0, 11
-; SIMD128-NEXT: i8x16.extract_lane_u $push44=, $1, 11
-; SIMD128-NEXT: i32.mul $push46=, $pop45, $pop44
-; SIMD128-NEXT: i8x16.replace_lane $push47=, $pop43, 11, $pop46
-; SIMD128-NEXT: i8x16.extract_lane_u $push49=, $0, 12
-; SIMD128-NEXT: i8x16.extract_lane_u $push48=, $1, 12
-; SIMD128-NEXT: i32.mul $push50=, $pop49, $pop48
-; SIMD128-NEXT: i8x16.replace_lane $push51=, $pop47, 12, $pop50
-; SIMD128-NEXT: i8x16.extract_lane_u $push53=, $0, 13
-; SIMD128-NEXT: i8x16.extract_lane_u $push52=, $1, 13
-; SIMD128-NEXT: i32.mul $push54=, $pop53, $pop52
-; SIMD128-NEXT: i8x16.replace_lane $push55=, $pop51, 13, $pop54
-; SIMD128-NEXT: i8x16.extract_lane_u $push57=, $0, 14
-; SIMD128-NEXT: i8x16.extract_lane_u $push56=, $1, 14
-; SIMD128-NEXT: i32.mul $push58=, $pop57, $pop56
-; SIMD128-NEXT: i8x16.replace_lane $push59=, $pop55, 14, $pop58
-; SIMD128-NEXT: i8x16.extract_lane_u $push61=, $0, 15
-; SIMD128-NEXT: i8x16.extract_lane_u $push60=, $1, 15
-; SIMD128-NEXT: i32.mul $push62=, $pop61, $pop60
-; SIMD128-NEXT: i8x16.replace_lane $push63=, $pop59, 15, $pop62
-; SIMD128-NEXT: return $pop63
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $1
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push2=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: return $pop2
;
; SIMD128-FAST-LABEL: mul_v16i8:
; SIMD128-FAST: .functype mul_v16i8 (v128, v128) -> (v128)
; SIMD128-FAST-NEXT: # %bb.0:
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push5=, $0, 0
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push4=, $1, 0
-; SIMD128-FAST-NEXT: i32.mul $push6=, $pop5, $pop4
-; SIMD128-FAST-NEXT: i8x16.splat $push7=, $pop6
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push2=, $0, 1
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push1=, $1, 1
-; SIMD128-FAST-NEXT: i32.mul $push3=, $pop2, $pop1
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push8=, $pop7, 1, $pop3
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push10=, $0, 2
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push9=, $1, 2
-; SIMD128-FAST-NEXT: i32.mul $push11=, $pop10, $pop9
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push12=, $pop8, 2, $pop11
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push14=, $0, 3
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push13=, $1, 3
-; SIMD128-FAST-NEXT: i32.mul $push15=, $pop14, $pop13
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push16=, $pop12, 3, $pop15
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push18=, $0, 4
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push17=, $1, 4
-; SIMD128-FAST-NEXT: i32.mul $push19=, $pop18, $pop17
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push20=, $pop16, 4, $pop19
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push22=, $0, 5
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push21=, $1, 5
-; SIMD128-FAST-NEXT: i32.mul $push23=, $pop22, $pop21
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push24=, $pop20, 5, $pop23
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push26=, $0, 6
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push25=, $1, 6
-; SIMD128-FAST-NEXT: i32.mul $push27=, $pop26, $pop25
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push28=, $pop24, 6, $pop27
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push30=, $0, 7
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push29=, $1, 7
-; SIMD128-FAST-NEXT: i32.mul $push31=, $pop30, $pop29
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push32=, $pop28, 7, $pop31
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push34=, $0, 8
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push33=, $1, 8
-; SIMD128-FAST-NEXT: i32.mul $push35=, $pop34, $pop33
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push36=, $pop32, 8, $pop35
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push38=, $0, 9
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push37=, $1, 9
-; SIMD128-FAST-NEXT: i32.mul $push39=, $pop38, $pop37
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push40=, $pop36, 9, $pop39
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push42=, $0, 10
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push41=, $1, 10
-; SIMD128-FAST-NEXT: i32.mul $push43=, $pop42, $pop41
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push44=, $pop40, 10, $pop43
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push46=, $0, 11
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push45=, $1, 11
-; SIMD128-FAST-NEXT: i32.mul $push47=, $pop46, $pop45
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push48=, $pop44, 11, $pop47
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push50=, $0, 12
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push49=, $1, 12
-; SIMD128-FAST-NEXT: i32.mul $push51=, $pop50, $pop49
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push52=, $pop48, 12, $pop51
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push54=, $0, 13
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push53=, $1, 13
-; SIMD128-FAST-NEXT: i32.mul $push55=, $pop54, $pop53
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push56=, $pop52, 13, $pop55
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push58=, $0, 14
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push57=, $1, 14
-; SIMD128-FAST-NEXT: i32.mul $push59=, $pop58, $pop57
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push60=, $pop56, 14, $pop59
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push62=, $0, 15
-; SIMD128-FAST-NEXT: i8x16.extract_lane_u $push61=, $1, 15
-; SIMD128-FAST-NEXT: i32.mul $push63=, $pop62, $pop61
-; SIMD128-FAST-NEXT: i8x16.replace_lane $push0=, $pop60, 15, $pop63
+; SIMD128-FAST-NEXT: i16x8.extmul_low_i8x16_u $push2=, $0, $1
+; SIMD128-FAST-NEXT: i16x8.extmul_high_i8x16_u $push1=, $0, $1
+; SIMD128-FAST-NEXT: i8x16.shuffle $push0=, $pop2, $pop1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
; SIMD128-FAST-NEXT: return $pop0
;
; NO-SIMD128-LABEL: mul_v16i8:
diff --git a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
index 1d194b640eab2..4c30a3adf2378 100644
--- a/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
+++ b/llvm/test/CodeGen/WebAssembly/vector-reduce.ll
@@ -116,40 +116,28 @@ define i8 @pairwise_mul_v16i8(<16 x i8> %arg) {
; SIMD128-LABEL: pairwise_mul_v16i8:
; SIMD128: .functype pairwise_mul_v16i8 (v128) -> (i32)
; SIMD128-NEXT: # %bb.0:
-; SIMD128-NEXT: i8x16.extract_lane_u $push26=, $0, 0
-; SIMD128-NEXT: i8x16.shuffle $push32=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
-; SIMD128-NEXT: local.tee $push31=, $1=, $pop32
-; SIMD128-NEXT: i8x16.extract_lane_u $push25=, $pop31, 0
-; SIMD128-NEXT: i32.mul $push27=, $pop26, $pop25
-; SIMD128-NEXT: i8x16.extract_lane_u $push23=, $0, 4
-; SIMD128-NEXT: i8x16.extract_lane_u $push22=, $1, 4
-; SIMD128-NEXT: i32.mul $push24=, $pop23, $pop22
-; SIMD128-NEXT: i32.mul $push28=, $pop27, $pop24
-; SIMD128-NEXT: i8x16.extract_lane_u $push19=, $0, 2
-; SIMD128-NEXT: i8x16.extract_lane_u $push18=, $1, 2
-; SIMD128-NEXT: i32.mul $push20=, $pop19, $pop18
-; SIMD128-NEXT: i8x16.extract_lane_u $push16=, $0, 6
-; SIMD128-NEXT: i8x16.extract_lane_u $push15=, $1, 6
-; SIMD128-NEXT: i32.mul $push17=, $pop16, $pop15
-; SIMD128-NEXT: i32.mul $push21=, $pop20, $pop17
-; SIMD128-NEXT: i32.mul $push29=, $pop28, $pop21
-; SIMD128-NEXT: i8x16.extract_lane_u $push11=, $0, 1
-; SIMD128-NEXT: i8x16.extract_lane_u $push10=, $1, 1
-; SIMD128-NEXT: i32.mul $push12=, $pop11, $pop10
-; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $0, 5
-; SIMD128-NEXT: i8x16.extract_lane_u $push7=, $1, 5
-; SIMD128-NEXT: i32.mul $push9=, $pop8, $pop7
-; SIMD128-NEXT: i32.mul $push13=, $pop12, $pop9
-; SIMD128-NEXT: i8x16.extract_lane_u $push4=, $0, 3
-; SIMD128-NEXT: i8x16.extract_lane_u $push3=, $1, 3
-; SIMD128-NEXT: i32.mul $push5=, $pop4, $pop3
-; SIMD128-NEXT: i8x16.extract_lane_u $push1=, $0, 7
-; SIMD128-NEXT: i8x16.extract_lane_u $push0=, $1, 7
-; SIMD128-NEXT: i32.mul $push2=, $pop1, $pop0
-; SIMD128-NEXT: i32.mul $push6=, $pop5, $pop2
-; SIMD128-NEXT: i32.mul $push14=, $pop13, $pop6
-; SIMD128-NEXT: i32.mul $push30=, $pop29, $pop14
-; SIMD128-NEXT: return $pop30
+; SIMD128-NEXT: i8x16.shuffle $push20=, $0, $0, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push19=, $1=, $pop20
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push1=, $0, $pop19
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push0=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push18=, $pop1, $pop0, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push17=, $0=, $pop18
+; SIMD128-NEXT: i8x16.shuffle $push16=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push15=, $1=, $pop16
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push3=, $pop17, $pop15
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push2=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push14=, $pop3, $pop2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push13=, $0=, $pop14
+; SIMD128-NEXT: i8x16.shuffle $push12=, $0, $0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: local.tee $push11=, $1=, $pop12
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push5=, $pop13, $pop11
+; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push4=, $0, $1
+; SIMD128-NEXT: i8x16.shuffle $push10=, $pop5, $pop4, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+; SIMD128-NEXT: local.tee $push9=, $0=, $pop10
+; SIMD128-NEXT: i8x16.shuffle $push6=, $0, $0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push7=, $pop9, $pop6
+; SIMD128-NEXT: i8x16.extract_lane_u $push8=, $pop7, 0
+; SIMD128-NEXT: return $pop8
%res = tail call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %arg)
ret i8 %res
}
More information about the llvm-commits
mailing list