[llvm] 18077e9 - [WebAssembly] Re-land 8392bf6000ad
Thomas Lively via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 9 08:42:34 PDT 2023
Author: Caleb Zulawski
Date: 2023-06-09T08:42:27-07:00
New Revision: 18077e9fd688443ca111111541e7e3a71236efd5
URL: https://github.com/llvm/llvm-project/commit/18077e9fd688443ca111111541e7e3a71236efd5
DIFF: https://github.com/llvm/llvm-project/commit/18077e9fd688443ca111111541e7e3a71236efd5.diff
LOG: [WebAssembly] Re-land 8392bf6000ad
Correctly handle single-element vectors to fix an assertion failure. Add tests
that were missing from the original commit.
Differential Revision: D151782
Added:
llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
Modified:
llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
llvm/test/CodeGen/WebAssembly/simd-extending.ll
llvm/test/CodeGen/WebAssembly/simd-offset.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 9227f3c5de87a..4b0775d0d8425 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -157,6 +157,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
+ // Combine vector mask reductions into alltrue/anytrue
+ setTargetDAGCombine(ISD::SETCC);
+
+ // Convert vector to integer bitcasts to bitmask
+ setTargetDAGCombine(ISD::BITCAST);
+
// Hoist bitcasts out of shuffles
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
@@ -258,6 +264,12 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// But saturating fp_to_int converstions are
for (auto Op : {ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT})
setOperationAction(Op, MVT::v4i32, Custom);
+
+ // Support vector extending
+ for (auto T : MVT::integer_fixedlen_vector_valuetypes()) {
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, T, Custom);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom);
+ }
}
// As a special case, these operators use the type to mean the type to
@@ -1374,6 +1386,11 @@ void WebAssemblyTargetLowering::ReplaceNodeResults(
// SIGN_EXTEND_INREG, but for non-vector sign extends the result might be an
// illegal type.
break;
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ // Do not add any results, signifying that N should not be custom lowered.
+ // EXTEND_VECTOR_INREG is implemented for some vectors, but not all.
+ break;
default:
llvm_unreachable(
"ReplaceNodeResults not implemented for this op for WebAssembly!");
@@ -1424,6 +1441,9 @@ SDValue WebAssemblyTargetLowering::LowerOperation(SDValue Op,
return LowerIntrinsic(Op, DAG);
case ISD::SIGN_EXTEND_INREG:
return LowerSIGN_EXTEND_INREG(Op, DAG);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return LowerEXTEND_VECTOR_INREG(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
@@ -1877,6 +1897,48 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
Op.getOperand(1));
}
+/// Custom-lower ZERO_EXTEND_VECTOR_INREG / SIGN_EXTEND_VECTOR_INREG into a
+/// chain of WebAssemblyISD::EXTEND_LOW_U / EXTEND_LOW_S nodes.
+///
+/// Each EXTEND_LOW step doubles the element width and halves the element
+/// count, so a total widening factor of 2, 4 or 8 takes 1, 2 or 3 steps.
+///
+/// \returns the lowered value, or an empty SDValue (meaning "no custom
+/// lowering") when the source elements are i1 or i64, or when the widening
+/// factor is not 2, 4 or 8.
+SDValue
+WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  // i1 and i64 source elements are not handled by this lowering.
+  if (SrcVT.getVectorElementType() == MVT::i1 ||
+      SrcVT.getVectorElementType() == MVT::i64)
+    return SDValue();
+
+  assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
+         "Unexpected extension factor.");
+  unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
+
+  // Only factors reachable by repeated doubling are supported.
+  if (Scale != 2 && Scale != 4 && Scale != 8)
+    return SDValue();
+
+  unsigned Ext;
+  switch (Op.getOpcode()) {
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    Ext = WebAssemblyISD::EXTEND_LOW_U;
+    break;
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+    Ext = WebAssemblyISD::EXTEND_LOW_S;
+    break;
+  default:
+    // Without this, Ext would be read uninitialized for any other opcode.
+    llvm_unreachable("Unexpected opcode for EXTEND_VECTOR_INREG lowering");
+  }
+
+  // Widen one step at a time: twice the element width, half the elements.
+  SDValue Ret = Src;
+  while (Scale != 1) {
+    Ret = DAG.getNode(Ext, DL,
+                      Ret.getValueType()
+                          .widenIntegerVectorElementType(*DAG.getContext())
+                          .getHalfNumVectorElementsVT(*DAG.getContext()),
+                      Ret);
+    Scale /= 2;
+  }
+  assert(Ret.getValueType() == VT &&
+         "Extension chain did not produce the requested type.");
+  return Ret;
+}
+
static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
if (Op.getValueType() != MVT::v2f64)
@@ -2692,12 +2754,91 @@ static SDValue performTruncateCombine(SDNode *N,
return truncateVectorWithNARROW(OutVT, In, DL, DAG);
}
+/// Target DAG combine for ISD::BITCAST.
+///
+/// Before legalization, rewrites
+///   bitcast <N x i1> to iN
+/// into a wasm_bitmask intrinsic node for N in {2, 4, 8, 16}. The i1 lanes are
+/// first sign-extended to (128 / N)-bit lanes, and the i32 intrinsic result is
+/// zero-extended or truncated to the bitcast's result type. Every other node
+/// is left alone by returning an empty SDValue.
+static SDValue performBitcastCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+  SDValue Src = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  // Only a pre-legalization bitcast from a fixed-length i1 vector to a scalar
+  // integer is of interest here.
+  if (!DCI.isBeforeLegalize() || !VT.isScalarInteger() ||
+      !SrcVT.isFixedLengthVector() || SrcVT.getScalarType() != MVT::i1)
+    return SDValue();
+
+  unsigned NumElts = SrcVT.getVectorNumElements();
+  switch (NumElts) {
+  case 2:
+  case 4:
+  case 8:
+  case 16:
+    break;
+  default:
+    // Any other lane count is not combined.
+    return SDValue();
+  }
+
+  EVT LaneVT = MVT::getIntegerVT(128 / NumElts);
+  SDValue IntrinsicID = DAG.getConstant(Intrinsic::wasm_bitmask, DL, MVT::i32);
+  SDValue WideMask =
+      DAG.getSExtOrTrunc(Src, DL, SrcVT.changeVectorElementType(LaneVT));
+  SDValue Bitmask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+                                {IntrinsicID, WideMask});
+  return DAG.getZExtOrTrunc(Bitmask, DL, VT);
+}
+
+/// Target DAG combine for ISD::SETCC.
+///
+/// Before legalization, turns an eq/ne comparison of a bitcast i1 vector
+/// against 0 or -1 into a wasm_anytrue / wasm_alltrue intrinsic node,
+/// negating the result where the comparison asks for the opposite sense.
+/// Only vectors of 2, 4, 8 or 16 lanes are combined; every other node is left
+/// alone by returning an empty SDValue.
+static SDValue performSETCCCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  auto &DAG = DCI.DAG;
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  // setcc (iN (bitcast (vNi1 X))), 0, ne
+  //   ==> any_true (vNi1 X)
+  // setcc (iN (bitcast (vNi1 X))), 0, eq
+  //   ==> xor (any_true (vNi1 X)), -1
+  // setcc (iN (bitcast (vNi1 X))), -1, eq
+  //   ==> all_true (vNi1 X)
+  // setcc (iN (bitcast (vNi1 X))), -1, ne
+  //   ==> xor (all_true (vNi1 X)), -1
+  if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
+      (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+      (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
+      LHS->getOpcode() == ISD::BITCAST) {
+    EVT FromVT = LHS->getOperand(0).getValueType();
+    if (FromVT.isFixedLengthVector() &&
+        FromVT.getVectorElementType() == MVT::i1) {
+      int Intrin = isNullConstant(RHS) ? Intrinsic::wasm_anytrue
+                                       : Intrinsic::wasm_alltrue;
+      unsigned NumElts = FromVT.getVectorNumElements();
+      // Bail out rather than assert: i1 vectors with other lane counts (e.g.
+      // <1 x i1>, <7 x i1>, <32 x i1>) can legitimately reach this combine and
+      // have no (128 / NumElts)-bit lane type to extend to.
+      if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+        return SDValue();
+      EVT Width = MVT::getIntegerVT(128 / NumElts);
+      // Sign-extend the i1 lanes to Width-bit lanes, reduce them with the
+      // chosen intrinsic (i32 result), and narrow that result to i1.
+      SDValue Ret = DAG.getZExtOrTrunc(
+          DAG.getNode(
+              ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+              {DAG.getConstant(Intrin, DL, MVT::i32),
+               DAG.getSExtOrTrunc(LHS->getOperand(0), DL,
+                                  FromVT.changeVectorElementType(Width))}),
+          DL, MVT::i1);
+      // "== 0" is !any_true and "!= -1" is !all_true.
+      if ((isNullConstant(RHS) && (Cond == ISD::SETEQ)) ||
+          (isAllOnesConstant(RHS) && (Cond == ISD::SETNE))) {
+        Ret = DAG.getNOT(DL, Ret, MVT::i1);
+      }
+      return DAG.getZExtOrTrunc(Ret, DL, VT);
+    }
+  }
+
+  return SDValue();
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default:
return SDValue();
+ case ISD::BITCAST:
+ return performBitcastCombine(N, DCI);
+ case ISD::SETCC:
+ return performSETCCCombine(N, DCI);
case ISD::VECTOR_SHUFFLE:
return performVECTOR_SHUFFLECombine(N, DCI);
case ISD::SIGN_EXTEND:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b6750879daa5b..ecf5d5b1ea5da 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -131,6 +131,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
SDValue LowerCopyToReg(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerIntrinsic(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
new file mode 100644
index 0000000000000..ca160c091b229
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
@@ -0,0 +1,414 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s
+
+;; Test that SIMD bitmask instruction can be selected
+
+target triple = "wasm32-unknown-unknown"
+
+define i16 @bitmask_v16i8(<16 x i8> %v) {
+; CHECK-LABEL: bitmask_v16i8:
+; CHECK: .functype bitmask_v16i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i8x16.eq
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <16 x i8> %v, zeroinitializer
+ %bitmask = bitcast <16 x i1> %cmp to i16
+ ret i16 %bitmask
+}
+
+define i8 @bitmask_v8i16(<8 x i16> %v) {
+; CHECK-LABEL: bitmask_v8i16:
+; CHECK: .functype bitmask_v8i16 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i16x8.eq
+; CHECK-NEXT: i16x8.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <8 x i16> %v, zeroinitializer
+ %bitmask = bitcast <8 x i1> %cmp to i8
+ ret i8 %bitmask
+}
+
+define i8 @bitmask_v4i32(<4 x i32> %v) {
+; CHECK-LABEL: bitmask_v4i32:
+; CHECK: .functype bitmask_v4i32 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 0, 0, 0, 0
+; CHECK-NEXT: i32x4.eq
+; CHECK-NEXT: i32x4.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <4 x i32> %v, zeroinitializer
+ %bitmask = bitcast <4 x i1> %cmp to i4
+ %ext = zext i4 %bitmask to i8
+ ret i8 %ext
+}
+
+define i8 @bitmask_v2i64(<2 x i64> %v) {
+; CHECK-LABEL: bitmask_v2i64:
+; CHECK: .functype bitmask_v2i64 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 0, 0
+; CHECK-NEXT: i64x2.eq
+; CHECK-NEXT: i64x2.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <2 x i64> %v, zeroinitializer
+ %bitmask = bitcast <2 x i1> %cmp to i2
+ %ext = zext i2 %bitmask to i8
+ ret i8 %ext
+}
+
+;; Test unusual vectors
+
+define i1 @bitmask_v1i8(<1 x i8> %v) {
+; CHECK-LABEL: bitmask_v1i8:
+; CHECK: .functype bitmask_v1i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 0
+; CHECK-NEXT: i32.eqz
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <1 x i8> %v, zeroinitializer
+ %bitmask = bitcast <1 x i1> %cmp to i1
+ ret i1 %bitmask
+}
+
+define i7 @bitmask_v7i8(<7 x i8> %v) {
+; CHECK-LABEL: bitmask_v7i8:
+; CHECK: .functype bitmask_v7i8 (i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: global.get __stack_pointer
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32.sub
+; CHECK-NEXT: drop
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.splat
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.replace_lane 1
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i8x16.replace_lane 2
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i8x16.replace_lane 3
+; CHECK-NEXT: local.get 4
+; CHECK-NEXT: i8x16.replace_lane 4
+; CHECK-NEXT: local.get 5
+; CHECK-NEXT: i8x16.replace_lane 5
+; CHECK-NEXT: local.get 6
+; CHECK-NEXT: i8x16.replace_lane 6
+; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i8x16.eq
+; CHECK-NEXT: local.tee 7
+; CHECK-NEXT: i16x8.extract_lane_u 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: local.get 7
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: local.tee 7
+; CHECK-NEXT: i16x8.extract_lane_u 1
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 7
+; CHECK-NEXT: i16x8.extract_lane_u 2
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 2
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 7
+; CHECK-NEXT: i16x8.extract_lane_u 3
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 3
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 7
+; CHECK-NEXT: i16x8.extract_lane_u 4
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 4
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 7
+; CHECK-NEXT: i16x8.extract_lane_u 5
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 5
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 7
+; CHECK-NEXT: i16x8.extract_lane_u 6
+; CHECK-NEXT: i32.const 6
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.const 127
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <7 x i8> %v, zeroinitializer
+ %bitmask = bitcast <7 x i1> %cmp to i7
+ ret i7 %bitmask
+}
+
+define i8 @bitmask_v8i8(<8 x i8> %v) {
+; CHECK-LABEL: bitmask_v8i8:
+; CHECK: .functype bitmask_v8i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i8x16.eq
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i16x8.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <8 x i8> %v, zeroinitializer
+ %bitmask = bitcast <8 x i1> %cmp to i8
+ ret i8 %bitmask
+}
+
+define i32 @bitmask_v32i8(<32 x i8> %v) {
+; CHECK-LABEL: bitmask_v32i8:
+; CHECK: .functype bitmask_v32i8 (v128, v128) -> (i32)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: global.get __stack_pointer
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32.sub
+; CHECK-NEXT: drop
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: local.tee 2
+; CHECK-NEXT: i8x16.eq
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.extract_lane_u 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 1
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 2
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 2
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 3
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 3
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 4
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 4
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 5
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 5
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 6
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 6
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 7
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 7
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 8
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 8
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 9
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 9
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 10
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 10
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 11
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 11
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 12
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 12
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 13
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 13
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 14
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 14
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 15
+; CHECK-NEXT: i32.const 15
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.const 65535
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i8x16.eq
+; CHECK-NEXT: local.tee 0
+; CHECK-NEXT: i8x16.extract_lane_u 15
+; CHECK-NEXT: i32.const 31
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 14
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 30
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 13
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 29
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 12
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 28
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 11
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 27
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 10
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 26
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 9
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 25
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 8
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 24
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 7
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 23
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 6
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 22
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 5
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 21
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 4
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 20
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 3
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 19
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 2
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 18
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 1
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 17
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.extract_lane_u 0
+; CHECK-NEXT: i32.const 1
+; CHECK-NEXT: i32.and
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: i32.or
+; CHECK-NEXT: # fallthrough-return
+ %cmp = icmp eq <32 x i8> %v, zeroinitializer
+ %bitmask = bitcast <32 x i1> %cmp to i32
+ ret i32 %bitmask
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
index 2b871f1e84adc..c93b8aa7fb42e 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll
@@ -36,9 +36,9 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) {
; CHECK-LABEL: extend_to_float_low_i8x16_u:
; CHECK: .functype extend_to_float_low_i8x16_u (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: f32x4.convert_i32x4_u
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -51,8 +51,10 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) {
; CHECK: .functype extend_to_float_high_i8x16_u (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: i8x16.shuffle 4, 17, 18, 19, 5, 21, 22, 23, 6, 25, 26, 27, 7, 29, 30, 31
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: f32x4.convert_i32x4_u
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -91,12 +93,8 @@ define <4 x float> @extend_to_float_low_i8x16_s(<8 x i8> %x) {
; CHECK: .functype extend_to_float_low_i8x16_s (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -110,11 +108,9 @@ define <4 x float> @extend_to_float_high_i8x16_s(<8 x i8> %x) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 4, 0, 0, 0, 5, 0, 0, 0, 6, 0, 0, 0, 7, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: f32x4.convert_i32x4_s
; CHECK-NEXT: # fallthrough-return
%high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -138,9 +134,8 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) {
; CHECK-LABEL: extend_to_double_low_i16x4_u:
; CHECK: .functype extend_to_double_low_i16x4_u (v128) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 16, 17, 2, 3, 18, 19, 6, 7, 20, 21, 10, 11, 22, 23, 14, 15
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: f64x2.convert_low_i32x4_u
; CHECK-NEXT: # fallthrough-return
%low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
diff --git a/llvm/test/CodeGen/WebAssembly/simd-extending.ll b/llvm/test/CodeGen/WebAssembly/simd-extending.ll
index 1f84e6485dac5..2445570bb8fa9 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-extending.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-extending.ll
@@ -170,11 +170,8 @@ define <8 x i16> @extend_lowish_i8x16_s(<16 x i8> %v) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0
-; CHECK-NEXT: i32.const 8
-; CHECK-NEXT: i16x8.shl
-; CHECK-NEXT: i32.const 8
-; CHECK-NEXT: i16x8.shr_s
+; CHECK-NEXT: i8x16.shuffle 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
; CHECK-NEXT: # fallthrough-return
%lowish = shufflevector <16 x i8> %v, <16 x i8> undef,
<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -188,14 +185,81 @@ define <4 x i32> @extend_lowish_i16x8_s(<8 x i16> %v) {
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 2, 3, 0, 1, 4, 5, 0, 1, 6, 7, 0, 1, 8, 9, 0, 1
-; CHECK-NEXT: i32.const 16
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 16
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i8x16.shuffle 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 0, 1, 0, 1, 0, 1
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%lowish = shufflevector <8 x i16> %v, <8 x i16> undef,
<4 x i32> <i32 1, i32 2, i32 3, i32 4>
%extended = sext <4 x i16> %lowish to <4 x i32>
ret <4 x i32> %extended
}
+
+;; Also test vectors that aren't full 128 bits, or might require
+;; multiple extensions
+
+define <16 x i8> @extend_i1x16_i8(<16 x i1> %v) {
+; CHECK-LABEL: extend_i1x16_i8:
+; CHECK: .functype extend_i1x16_i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32.const 7
+; CHECK-NEXT: i8x16.shl
+; CHECK-NEXT: i32.const 7
+; CHECK-NEXT: i8x16.shr_s
+; CHECK-NEXT: # fallthrough-return
+ %extended = sext <16 x i1> %v to <16 x i8>
+ ret <16 x i8> %extended
+}
+
+define <8 x i8> @extend_i1x8_i8(<8 x i1> %v) {
+; CHECK-LABEL: extend_i1x8_i8:
+; CHECK: .functype extend_i1x8_i8 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i32.const 7
+; CHECK-NEXT: i8x16.shl
+; CHECK-NEXT: i32.const 7
+; CHECK-NEXT: i8x16.shr_s
+; CHECK-NEXT: # fallthrough-return
+ %extended = sext <8 x i1> %v to <8 x i8>
+ ret <8 x i8> %extended
+}
+
+define <8 x i16> @extend_i1x8_i16(<8 x i1> %v) {
+; CHECK-LABEL: extend_i1x8_i16:
+; CHECK: .functype extend_i1x8_i16 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 1, 1, 1, 1, 1, 1, 1, 1
+; CHECK-NEXT: v128.and
+; CHECK-NEXT: # fallthrough-return
+ %extended = zext <8 x i1> %v to <8 x i16>
+ ret <8 x i16> %extended
+}
+
+define <4 x i32> @extend_i8x4_i32(<4 x i8> %v) {
+; CHECK-LABEL: extend_i8x4_i32:
+; CHECK: .functype extend_i8x4_i32 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
+; CHECK-NEXT: # fallthrough-return
+ %extended = zext <4 x i8> %v to <4 x i32>
+ ret <4 x i32> %extended
+}
+
+define <2 x i64> @extend_i8x2_i64(<2 x i8> %v) {
+; CHECK-LABEL: extend_i8x2_i64:
+; CHECK: .functype extend_i8x2_i64 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
+; CHECK-NEXT: i64x2.extend_low_i32x4_s
+; CHECK-NEXT: # fallthrough-return
+ %extended = sext <2 x i8> %v to <2 x i64>
+ ret <2 x i64> %extended
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
index f317edca549de..fc47dc829999c 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
@@ -1183,16 +1183,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32:
; CHECK: .functype load_sext_v4i8_to_v4i32 (i32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr %p
%v2 = sext <4 x i8> %v to <4 x i32>
@@ -1203,10 +1198,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32:
; CHECK: .functype load_zext_v4i8_to_v4i32 (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr %p
%v2 = zext <4 x i8> %v to <4 x i32>
@@ -1287,16 +1282,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 16
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nuw i32 %q, 16
@@ -1310,10 +1300,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 16
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nuw i32 %q, 16
@@ -1392,16 +1382,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_gep_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_gep_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 4
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s
@@ -1413,10 +1398,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_gep_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 4
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s
@@ -1499,18 +1484,13 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset(ptr
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 -1
%v = load <4 x i8>, ptr %s
@@ -1522,12 +1502,12 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(ptr
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, ptr %p, i32 -1
%v = load <4 x i8>, ptr %s
@@ -1620,18 +1600,13 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nsw i32 %q, 16
@@ -1645,12 +1620,12 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint ptr %p to i32
%r = add nsw i32 %q, 16
@@ -1739,18 +1714,13 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_offset(ptr %p) {
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_offset(ptr %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: local.get 1
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s
@@ -1762,12 +1732,12 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_offset(ptr %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <4 x i8>, ptr %p, i32 1
%v = load <4 x i8>, ptr %s
@@ -1844,16 +1814,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_from_numeric_address() {
define <4 x i32> @load_sext_v4i8_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_numeric_address:
; CHECK: .functype load_sext_v4i8_to_v4i32_from_numeric_address () -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero 32
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to ptr
%v = load <4 x i8>, ptr %s
@@ -1865,10 +1830,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_numeric_address:
; CHECK: .functype load_zext_v4i8_to_v4i32_from_numeric_address () -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero 32
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to ptr
%v = load <4 x i8>, ptr %s
@@ -1943,16 +1908,11 @@ define <4 x i32> @load_zext_v4i16_to_v4i32_from_global_address() {
define <4 x i32> @load_sext_v4i8_to_v4i32_from_global_address() {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_global_address:
; CHECK: .functype load_sext_v4i8_to_v4i32_from_global_address () -> (v128)
-; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero gv_v4i8
-; CHECK-NEXT: local.get 0
-; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shl
-; CHECK-NEXT: i32.const 24
-; CHECK-NEXT: i32x4.shr_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i32x4.extend_low_i16x8_s
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr @gv_v4i8
%v2 = sext <4 x i8> %v to <4 x i32>
@@ -1963,10 +1923,10 @@ define <4 x i32> @load_zext_v4i8_to_v4i32_from_global_address() {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_global_address:
; CHECK: .functype load_zext_v4i8_to_v4i32_from_global_address () -> (v128)
; CHECK-NEXT: # %bb.0:
-; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero gv_v4i8
-; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
+; CHECK-NEXT: i16x8.extend_low_i8x16_u
+; CHECK-NEXT: i32x4.extend_low_i16x8_u
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, ptr @gv_v4i8
%v2 = zext <4 x i8> %v to <4 x i32>
diff --git a/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
new file mode 100644
index 0000000000000..ff3a685018f2c
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-vecreduce-bool.ll
@@ -0,0 +1,320 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+
+; Tests that bool vecreduce produces anytrue and alltrue instructions
+
+target triple = "wasm32-unknown-unknown"
+
+declare i1 @llvm.vector.reduce.or.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.or.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.or.v7i1(<7 x i1>)
+declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.or.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.and.v2i1(<2 x i1>)
+declare i1 @llvm.vector.reduce.and.v4i1(<4 x i1>)
+declare i1 @llvm.vector.reduce.and.v7i1(<7 x i1>)
+declare i1 @llvm.vector.reduce.and.v8i1(<8 x i1>)
+declare i1 @llvm.vector.reduce.and.v16i1(<16 x i1>)
+
+; =====================
+; Regular vectors of i1
+; =====================
+
+define i1 @test_any_v8i1(<8 x i1> %x) {
+; CHECK-LABEL: test_any_v8i1:
+; CHECK: .functype test_any_v8i1 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 15
+; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 15
+; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: v128.any_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+ ret i1 %ret
+}
+
+define i1 @test_all_v8i1(<8 x i1> %x) {
+; CHECK-LABEL: test_all_v8i1:
+; CHECK: .functype test_all_v8i1 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 15
+; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 15
+; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: i16x8.all_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+ ret i1 %ret
+}
+
+define i1 @test_none_v8i1(<8 x i1> %x) {
+; CHECK-LABEL: test_none_v8i1:
+; CHECK: .functype test_none_v8i1 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 15
+; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push6=, 15
+; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6
+; CHECK-NEXT: v128.any_true $push3=, $pop2
+; CHECK-NEXT: i32.const $push4=, 1
+; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4
+; CHECK-NEXT: return $pop5
+ %any = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %x)
+ %none = xor i1 %any, 1
+ ret i1 %none
+}
+
+define i1 @test_not_all_v8i1(<8 x i1> %x) {
+; CHECK-LABEL: test_not_all_v8i1:
+; CHECK: .functype test_not_all_v8i1 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 15
+; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push6=, 15
+; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop6
+; CHECK-NEXT: i16x8.all_true $push3=, $pop2
+; CHECK-NEXT: i32.const $push4=, 1
+; CHECK-NEXT: i32.xor $push5=, $pop3, $pop4
+; CHECK-NEXT: return $pop5
+ %all = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %x)
+ %notall = xor i1 %all, 1
+ ret i1 %notall
+}
+
+define i1 @test_any_v16i1(<16 x i1> %x) {
+; CHECK-LABEL: test_any_v16i1:
+; CHECK: .functype test_any_v16i1 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 7
+; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 7
+; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: v128.any_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %x)
+ ret i1 %ret
+}
+
+define i1 @test_all_v16i1(<16 x i1> %x) {
+; CHECK-LABEL: test_all_v16i1:
+; CHECK: .functype test_all_v16i1 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 7
+; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 7
+; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: i8x16.all_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %ret = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %x)
+ ret i1 %ret
+}
+
+; ==================================
+; Regular vectors of larger integers
+; ==================================
+
+define i1 @test_any_v16i8(<16 x i8> %x) {
+; CHECK-LABEL: test_any_v16i8:
+; CHECK: .functype test_any_v16i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 7
+; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 7
+; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: v128.any_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <16 x i8> %x to <16 x i1>
+ %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_all_v16i8(<16 x i8> %x) {
+; CHECK-LABEL: test_all_v16i8:
+; CHECK: .functype test_all_v16i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 7
+; CHECK-NEXT: i8x16.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 7
+; CHECK-NEXT: i8x16.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: i8x16.all_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <16 x i8> %x to <16 x i1>
+ %ret = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_any_v8i16(<8 x i16> %x) {
+; CHECK-LABEL: test_any_v8i16:
+; CHECK: .functype test_any_v8i16 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 15
+; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 15
+; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: v128.any_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <8 x i16> %x to <8 x i1>
+ %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_all_v8i16(<8 x i16> %x) {
+; CHECK-LABEL: test_all_v8i16:
+; CHECK: .functype test_all_v8i16 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 15
+; CHECK-NEXT: i16x8.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 15
+; CHECK-NEXT: i16x8.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: i16x8.all_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <8 x i16> %x to <8 x i1>
+ %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_any_v4i32(<4 x i32> %x) {
+; CHECK-LABEL: test_any_v4i32:
+; CHECK: .functype test_any_v4i32 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 31
+; CHECK-NEXT: i32x4.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 31
+; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: v128.any_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <4 x i32> %x to <4 x i1>
+ %ret = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_all_v4i32(<4 x i32> %x) {
+; CHECK-LABEL: test_all_v4i32:
+; CHECK: .functype test_all_v4i32 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 31
+; CHECK-NEXT: i32x4.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 31
+; CHECK-NEXT: i32x4.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: i32x4.all_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <4 x i32> %x to <4 x i1>
+ %ret = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_any_v2i64(<2 x i64> %x) {
+; CHECK-LABEL: test_any_v2i64:
+; CHECK: .functype test_any_v2i64 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 63
+; CHECK-NEXT: i64x2.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 63
+; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: v128.any_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <2 x i64> %x to <2 x i1>
+ %ret = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_all_v2i64(<2 x i64> %x) {
+; CHECK-LABEL: test_all_v2i64:
+; CHECK: .functype test_all_v2i64 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const $push0=, 63
+; CHECK-NEXT: i64x2.shl $push1=, $0, $pop0
+; CHECK-NEXT: i32.const $push4=, 63
+; CHECK-NEXT: i64x2.shr_s $push2=, $pop1, $pop4
+; CHECK-NEXT: i64x2.all_true $push3=, $pop2
+; CHECK-NEXT: return $pop3
+ %bits = trunc <2 x i64> %x to <2 x i1>
+ %ret = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> %bits)
+ ret i1 %ret
+}
+
+; ====================
+; Unusual vector sizes
+; ====================
+
+define i1 @test_any_v7i1(<7 x i1> %x) {
+; CHECK-LABEL: test_any_v7i1:
+; CHECK: .functype test_any_v7i1 (i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.or $push0=, $0, $1
+; CHECK-NEXT: i32.or $push1=, $pop0, $2
+; CHECK-NEXT: i32.or $push2=, $pop1, $3
+; CHECK-NEXT: i32.or $push3=, $pop2, $4
+; CHECK-NEXT: i32.or $push4=, $pop3, $5
+; CHECK-NEXT: i32.or $push5=, $pop4, $6
+; CHECK-NEXT: return $pop5
+ %ret = call i1 @llvm.vector.reduce.or.v7i1(<7 x i1> %x)
+ ret i1 %ret
+}
+
+define i1 @test_all_v7i1(<7 x i1> %x) {
+; CHECK-LABEL: test_all_v7i1:
+; CHECK: .functype test_all_v7i1 (i32, i32, i32, i32, i32, i32, i32) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.and $push0=, $0, $1
+; CHECK-NEXT: i32.and $push1=, $pop0, $2
+; CHECK-NEXT: i32.and $push2=, $pop1, $3
+; CHECK-NEXT: i32.and $push3=, $pop2, $4
+; CHECK-NEXT: i32.and $push4=, $pop3, $5
+; CHECK-NEXT: i32.and $push5=, $pop4, $6
+; CHECK-NEXT: i32.const $push6=, 1
+; CHECK-NEXT: i32.and $push7=, $pop5, $pop6
+; CHECK-NEXT: return $pop7
+ %ret = call i1 @llvm.vector.reduce.and.v7i1(<7 x i1> %x)
+ ret i1 %ret
+}
+
+define i1 @test_any_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_any_v8i8:
+; CHECK: .functype test_any_v8i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
+; CHECK-NEXT: i32.const $push1=, 15
+; CHECK-NEXT: i16x8.shl $push2=, $pop0, $pop1
+; CHECK-NEXT: i32.const $push5=, 15
+; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop5
+; CHECK-NEXT: v128.any_true $push4=, $pop3
+; CHECK-NEXT: return $pop4
+ %bits = trunc <8 x i8> %x to <8 x i1>
+ %ret = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %bits)
+ ret i1 %ret
+}
+
+define i1 @test_all_v8i8(<8 x i8> %x) {
+; CHECK-LABEL: test_all_v8i8:
+; CHECK: .functype test_all_v8i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0
+; CHECK-NEXT: i32.const $push1=, 15
+; CHECK-NEXT: i16x8.shl $push2=, $pop0, $pop1
+; CHECK-NEXT: i32.const $push5=, 15
+; CHECK-NEXT: i16x8.shr_s $push3=, $pop2, $pop5
+; CHECK-NEXT: i16x8.all_true $push4=, $pop3
+; CHECK-NEXT: return $pop4
+ %bits = trunc <8 x i8> %x to <8 x i1>
+ %ret = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> %bits)
+ ret i1 %ret
+}
+
+;; =====================
+;; Test reduce after cmp
+;; =====================
+
+define i1 @test_cmp_v16i8(<16 x i8> %x) {
+; CHECK-LABEL: test_cmp_v16i8:
+; CHECK: .functype test_cmp_v16i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i8x16.eq $push1=, $0, $pop0
+; CHECK-NEXT: v128.any_true $push2=, $pop1
+; CHECK-NEXT: return $pop2
+ %zero = icmp eq <16 x i8> %x, zeroinitializer
+ %ret = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %zero)
+ ret i1 %ret
+}
More information about the llvm-commits
mailing list