[llvm] [AArch64] optimize vselect of bitcast (PR #180375)
Folkert de Vries via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 11 06:04:16 PST 2026
https://github.com/folkertdev updated https://github.com/llvm/llvm-project/pull/180375
>From 6f6392a26609ef4ac08b7b5fc36399eea87c6018 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Sat, 7 Feb 2026 23:28:46 +0100
Subject: [PATCH 1/2] [AArch64] optimize vselect of bitcast
---
.../Target/AArch64/AArch64ISelLowering.cpp | 108 +++++++++++++++++-
1 file changed, 106 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b2e057c80373..178fd40d94be0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27264,12 +27264,98 @@ static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
{InverseSetCC, SelectB, SelectA});
}
+// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
+static SDValue combineToExtendBoolVectorInReg(
+ unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget &Subtarget) {
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
+ Opcode != ISD::ANY_EXTEND)
+ return SDValue();
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+ if (!Subtarget.hasNEON())
+ return SDValue();
+
+ EVT SVT = VT.getScalarType();
+ EVT InSVT = N0.getValueType().getScalarType();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
+ // Input type must be extending a bool vector (bit-casted from a scalar
+ // integer) to legal integer types.
+ if (!VT.isVector())
+ return SDValue();
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
+ return SDValue();
+ if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ EVT SclVT = N00.getValueType();
+ if (!SclVT.isScalarInteger())
+ return SDValue();
+
+ SDValue Vec;
+ SmallVector<int> ShuffleMask;
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
+
+ // Broadcast the scalar integer to the vector elements.
+ if (NumElts > EltSizeInBits) {
+ // If the scalar integer is greater than the vector element size, then we
+ // must split it down into sub-sections for broadcasting. For example:
+ // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
+ // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
+ assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
+ unsigned Scale = NumElts / EltSizeInBits;
+ EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ Vec = DAG.getBitcast(VT, Vec);
+
+    // Sub-section i of the scalar covers lanes [i*EltSizeInBits,
+    // (i+1)*EltSizeInBits), so broadcast element i across those lanes.
+    for (unsigned i = 0; i != Scale; ++i)
+      ShuffleMask.append(EltSizeInBits, i);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ } else {
+ // For smaller scalar integers, we can simply any-extend it to the vector
+ // element size (we don't care about the upper bits) and broadcast it to all
+ // elements.
+ Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
+ }
+
+ // Now, mask the relevant bit in each element.
+ SmallVector<SDValue, 32> Bits;
+ for (unsigned i = 0; i != NumElts; ++i) {
+    int BitIdx = (i % EltSizeInBits);
+    APInt Bit = APInt::getOneBitSet(EltSizeInBits, BitIdx);
+ Bits.push_back(DAG.getConstant(Bit, DL, SVT));
+ }
+ SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
+ Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
+
+ // Compare against the bitmask and extend the result.
+ EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
+ Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
+
+  // A SIGN_EXTEND is done at this point; for ZERO/ANY_EXTEND, shift the
+  // replicated sign bit down to bit 0 so the upper bits become zero.
+ if (Opcode == ISD::SIGN_EXTEND)
+ return Vec;
+ return DAG.getNode(ISD::SRL, DL, VT, Vec,
+ DAG.getConstant(EltSizeInBits - 1, DL, VT));
+}
+
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
-static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performVSelectCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+
if (auto SwapResult = trySwapVSelectOperands(N, DAG))
return SwapResult;
@@ -27333,6 +27419,24 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
}
}
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Attempt to convert a (vXi1 bitcast(iX N0)) selection mask before it might
+ // get split by legalization.
+ if (N0.getOpcode() == ISD::BITCAST && CCVT.isVector() &&
+ CCVT.getVectorElementType() == MVT::i1 &&
+ TLI.isTypeLegal(ResVT.getScalarType())) {
+
+ SDLoc DL(N);
+ EVT ExtCondVT = ResVT.changeVectorElementTypeToInteger();
+
+ if (SDValue ExtCond = combineToExtendBoolVectorInReg(
+ ISD::SIGN_EXTEND, DL, ExtCondVT, N0, DAG, DCI, *Subtarget)) {
+ ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CCVT, ExtCond);
+ return DAG.getSelect(DL, ResVT, ExtCond, IfTrue, IfFalse);
+ }
+ }
+
EVT CmpVT = N0.getOperand(0).getValueType();
if (N0.getOpcode() != ISD::SETCC ||
CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
@@ -28712,7 +28816,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::VSELECT:
- return performVSelectCombine(N, DCI.DAG);
+ return performVSelectCombine(N, DCI, Subtarget);
case ISD::SETCC:
return performSETCCCombine(N, DCI, DAG);
case ISD::LOAD:
>From 7b650f7c7d60f72661547c699971f0c31c7139be Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert at folkertdev.nl>
Date: Wed, 11 Feb 2026 15:03:57 +0100
Subject: [PATCH 2/2] add select + bitcast test
---
llvm/test/CodeGen/AArch64/select-bitcast.ll | 369 ++++++++++++++++++++
1 file changed, 369 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/select-bitcast.ll
diff --git a/llvm/test/CodeGen/AArch64/select-bitcast.ll b/llvm/test/CodeGen/AArch64/select-bitcast.ll
new file mode 100644
index 0000000000000..06f2afa002cfb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/select-bitcast.ll
@@ -0,0 +1,369 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-elf %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+
+define void @if_then_else8(ptr %out, i8 %mask, ptr %if_true, ptr %if_false) {
+; CHECK-LE-LABEL: if_then_else8:
+; CHECK-LE: // %bb.0: // %start
+; CHECK-LE-NEXT: adrp x8, .LCPI0_1
+; CHECK-LE-NEXT: dup v0.4s, w1
+; CHECK-LE-NEXT: ldr q1, [x8, :lo12:.LCPI0_1]
+; CHECK-LE-NEXT: adrp x8, .LCPI0_0
+; CHECK-LE-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
+; CHECK-LE-NEXT: ldp q4, q3, [x2]
+; CHECK-LE-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-LE-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-LE-NEXT: ldp q5, q2, [x3]
+; CHECK-LE-NEXT: cmeq v1.4s, v1.4s, #0
+; CHECK-LE-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-LE-NEXT: bsl v1.16b, v2.16b, v3.16b
+; CHECK-LE-NEXT: bsl v0.16b, v5.16b, v4.16b
+; CHECK-LE-NEXT: stp q0, q1, [x0]
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: if_then_else8:
+; CHECK-BE: // %bb.0: // %start
+; CHECK-BE-NEXT: adrp x8, .LCPI0_1
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_1
+; CHECK-BE-NEXT: dup v0.4s, w1
+; CHECK-BE-NEXT: ld1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: adrp x8, .LCPI0_0
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI0_0
+; CHECK-BE-NEXT: ld1 { v2.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #16
+; CHECK-BE-NEXT: add x9, x3, #16
+; CHECK-BE-NEXT: ld1 { v3.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v4.4s }, [x2]
+; CHECK-BE-NEXT: ld1 { v5.4s }, [x3]
+; CHECK-BE-NEXT: and v1.16b, v0.16b, v1.16b
+; CHECK-BE-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-BE-NEXT: ld1 { v2.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #16
+; CHECK-BE-NEXT: cmeq v1.4s, v1.4s, #0
+; CHECK-BE-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-BE-NEXT: bsl v1.16b, v3.16b, v2.16b
+; CHECK-BE-NEXT: bsl v0.16b, v5.16b, v4.16b
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: st1 { v0.4s }, [x0]
+; CHECK-BE-NEXT: ret
+start:
+ %t = load <8 x i32>, ptr %if_true, align 4
+ %f = load <8 x i32>, ptr %if_false, align 4
+ %m = bitcast i8 %mask to <8 x i1>
+ %s = select <8 x i1> %m, <8 x i32> %t, <8 x i32> %f
+ store <8 x i32> %s, ptr %out, align 4
+ ret void
+}
+
+define void @if_then_else64(ptr %out, i64 %mask, ptr %if_true, ptr %if_false) {
+; CHECK-LE-LABEL: if_then_else64:
+; CHECK-LE: // %bb.0: // %start
+; CHECK-LE-NEXT: sub sp, sp, #80
+; CHECK-LE-NEXT: stp d15, d14, [sp, #16] // 16-byte Folded Spill
+; CHECK-LE-NEXT: stp d13, d12, [sp, #32] // 16-byte Folded Spill
+; CHECK-LE-NEXT: stp d11, d10, [sp, #48] // 16-byte Folded Spill
+; CHECK-LE-NEXT: stp d9, d8, [sp, #64] // 16-byte Folded Spill
+; CHECK-LE-NEXT: .cfi_def_cfa_offset 80
+; CHECK-LE-NEXT: .cfi_offset b8, -8
+; CHECK-LE-NEXT: .cfi_offset b9, -16
+; CHECK-LE-NEXT: .cfi_offset b10, -24
+; CHECK-LE-NEXT: .cfi_offset b11, -32
+; CHECK-LE-NEXT: .cfi_offset b12, -40
+; CHECK-LE-NEXT: .cfi_offset b13, -48
+; CHECK-LE-NEXT: .cfi_offset b14, -56
+; CHECK-LE-NEXT: .cfi_offset b15, -64
+; CHECK-LE-NEXT: fmov d21, x1
+; CHECK-LE-NEXT: adrp x8, .LCPI1_6
+; CHECK-LE-NEXT: ldr q26, [x8, :lo12:.LCPI1_6]
+; CHECK-LE-NEXT: adrp x8, .LCPI1_7
+; CHECK-LE-NEXT: ldr q27, [x8, :lo12:.LCPI1_7]
+; CHECK-LE-NEXT: adrp x8, .LCPI1_0
+; CHECK-LE-NEXT: dup v5.4s, v21.s[0]
+; CHECK-LE-NEXT: dup v28.4s, v21.s[1]
+; CHECK-LE-NEXT: ldr q22, [x8, :lo12:.LCPI1_0]
+; CHECK-LE-NEXT: ldp q25, q24, [x2, #96]
+; CHECK-LE-NEXT: adrp x8, .LCPI1_3
+; CHECK-LE-NEXT: ldp q21, q31, [x3, #96]
+; CHECK-LE-NEXT: and v23.16b, v5.16b, v26.16b
+; CHECK-LE-NEXT: and v9.16b, v5.16b, v27.16b
+; CHECK-LE-NEXT: and v10.16b, v28.16b, v22.16b
+; CHECK-LE-NEXT: ldp q16, q0, [x2, #128]
+; CHECK-LE-NEXT: and v27.16b, v28.16b, v27.16b
+; CHECK-LE-NEXT: ldp q1, q7, [x2, #160]
+; CHECK-LE-NEXT: and v26.16b, v28.16b, v26.16b
+; CHECK-LE-NEXT: cmeq v8.4s, v23.4s, #0
+; CHECK-LE-NEXT: ldr q23, [x8, :lo12:.LCPI1_3]
+; CHECK-LE-NEXT: adrp x8, .LCPI1_4
+; CHECK-LE-NEXT: ldr q13, [x8, :lo12:.LCPI1_4]
+; CHECK-LE-NEXT: adrp x8, .LCPI1_5
+; CHECK-LE-NEXT: cmeq v27.4s, v27.4s, #0
+; CHECK-LE-NEXT: ldp q6, q2, [x2, #192]
+; CHECK-LE-NEXT: cmeq v26.4s, v26.4s, #0
+; CHECK-LE-NEXT: bif v21.16b, v25.16b, v8.16b
+; CHECK-LE-NEXT: cmeq v25.4s, v9.4s, #0
+; CHECK-LE-NEXT: cmeq v8.4s, v10.4s, #0
+; CHECK-LE-NEXT: and v9.16b, v28.16b, v23.16b
+; CHECK-LE-NEXT: ldp q14, q10, [x3, #128]
+; CHECK-LE-NEXT: and v15.16b, v28.16b, v13.16b
+; CHECK-LE-NEXT: ldp q12, q11, [x3, #192]
+; CHECK-LE-NEXT: bit v24.16b, v31.16b, v25.16b
+; CHECK-LE-NEXT: ldr q25, [x8, :lo12:.LCPI1_5]
+; CHECK-LE-NEXT: adrp x8, .LCPI1_2
+; CHECK-LE-NEXT: bit v16.16b, v14.16b, v8.16b
+; CHECK-LE-NEXT: cmeq v31.4s, v9.4s, #0
+; CHECK-LE-NEXT: str q0, [sp] // 16-byte Spill
+; CHECK-LE-NEXT: ldp q9, q8, [x3, #160]
+; CHECK-LE-NEXT: cmeq v14.4s, v15.4s, #0
+; CHECK-LE-NEXT: ldp q3, q4, [x2, #224]
+; CHECK-LE-NEXT: and v15.16b, v28.16b, v25.16b
+; CHECK-LE-NEXT: ldp q30, q29, [x3, #224]
+; CHECK-LE-NEXT: bit v7.16b, v8.16b, v31.16b
+; CHECK-LE-NEXT: ldr q31, [x8, :lo12:.LCPI1_2]
+; CHECK-LE-NEXT: bit v6.16b, v12.16b, v14.16b
+; CHECK-LE-NEXT: cmeq v14.4s, v15.4s, #0
+; CHECK-LE-NEXT: ldp q17, q19, [x2, #64]
+; CHECK-LE-NEXT: and v12.16b, v28.16b, v31.16b
+; CHECK-LE-NEXT: bit v4.16b, v29.16b, v27.16b
+; CHECK-LE-NEXT: bit v3.16b, v30.16b, v26.16b
+; CHECK-LE-NEXT: ldp q18, q20, [x2, #32]
+; CHECK-LE-NEXT: adrp x8, .LCPI1_1
+; CHECK-LE-NEXT: bit v2.16b, v11.16b, v14.16b
+; CHECK-LE-NEXT: ldp q29, q27, [x2]
+; CHECK-LE-NEXT: cmeq v12.4s, v12.4s, #0
+; CHECK-LE-NEXT: ldp q30, q26, [x3, #64]
+; CHECK-LE-NEXT: ldp q14, q11, [x3, #32]
+; CHECK-LE-NEXT: ldr q8, [x8, :lo12:.LCPI1_1]
+; CHECK-LE-NEXT: ldp q0, q15, [x3]
+; CHECK-LE-NEXT: stp q21, q24, [x0, #96]
+; CHECK-LE-NEXT: bit v1.16b, v9.16b, v12.16b
+; CHECK-LE-NEXT: stp q6, q2, [x0, #192]
+; CHECK-LE-NEXT: and v28.16b, v28.16b, v8.16b
+; CHECK-LE-NEXT: stp q3, q4, [x0, #224]
+; CHECK-LE-NEXT: and v4.16b, v5.16b, v25.16b
+; CHECK-LE-NEXT: and v3.16b, v5.16b, v13.16b
+; CHECK-LE-NEXT: and v2.16b, v5.16b, v23.16b
+; CHECK-LE-NEXT: and v6.16b, v5.16b, v31.16b
+; CHECK-LE-NEXT: ldr q25, [sp] // 16-byte Reload
+; CHECK-LE-NEXT: cmeq v28.4s, v28.4s, #0
+; CHECK-LE-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload
+; CHECK-LE-NEXT: cmeq v4.4s, v4.4s, #0
+; CHECK-LE-NEXT: stp q1, q7, [x0, #160]
+; CHECK-LE-NEXT: and v7.16b, v5.16b, v8.16b
+; CHECK-LE-NEXT: and v5.16b, v5.16b, v22.16b
+; CHECK-LE-NEXT: cmeq v3.4s, v3.4s, #0
+; CHECK-LE-NEXT: cmeq v1.4s, v2.4s, #0
+; CHECK-LE-NEXT: bit v25.16b, v10.16b, v28.16b
+; CHECK-LE-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload
+; CHECK-LE-NEXT: mov v2.16b, v4.16b
+; CHECK-LE-NEXT: cmeq v4.4s, v6.4s, #0
+; CHECK-LE-NEXT: cmeq v6.4s, v7.4s, #0
+; CHECK-LE-NEXT: cmeq v5.4s, v5.4s, #0
+; CHECK-LE-NEXT: bsl v3.16b, v30.16b, v17.16b
+; CHECK-LE-NEXT: bsl v1.16b, v11.16b, v20.16b
+; CHECK-LE-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload
+; CHECK-LE-NEXT: bsl v2.16b, v26.16b, v19.16b
+; CHECK-LE-NEXT: bsl v4.16b, v14.16b, v18.16b
+; CHECK-LE-NEXT: bsl v6.16b, v15.16b, v27.16b
+; CHECK-LE-NEXT: bif v0.16b, v29.16b, v5.16b
+; CHECK-LE-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload
+; CHECK-LE-NEXT: stp q16, q25, [x0, #128]
+; CHECK-LE-NEXT: stp q4, q1, [x0, #32]
+; CHECK-LE-NEXT: stp q0, q6, [x0]
+; CHECK-LE-NEXT: stp q3, q2, [x0, #64]
+; CHECK-LE-NEXT: add sp, sp, #80
+; CHECK-LE-NEXT: ret
+;
+; CHECK-BE-LABEL: if_then_else64:
+; CHECK-BE: // %bb.0: // %start
+; CHECK-BE-NEXT: stp d11, d10, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-BE-NEXT: stp d9, d8, [sp, #16] // 16-byte Folded Spill
+; CHECK-BE-NEXT: .cfi_def_cfa_offset 32
+; CHECK-BE-NEXT: .cfi_offset b8, -8
+; CHECK-BE-NEXT: .cfi_offset b9, -16
+; CHECK-BE-NEXT: .cfi_offset b10, -24
+; CHECK-BE-NEXT: .cfi_offset b11, -32
+; CHECK-BE-NEXT: fmov d4, x1
+; CHECK-BE-NEXT: add x9, x2, #224
+; CHECK-BE-NEXT: add x8, x2, #240
+; CHECK-BE-NEXT: ld1 { v1.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x2, #128
+; CHECK-BE-NEXT: ld1 { v2.4s }, [x8]
+; CHECK-BE-NEXT: ld1 { v18.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x2, #80
+; CHECK-BE-NEXT: add x8, x2, #208
+; CHECK-BE-NEXT: rev64 v17.4s, v4.4s
+; CHECK-BE-NEXT: adrp x10, .LCPI1_3
+; CHECK-BE-NEXT: add x10, x10, :lo12:.LCPI1_3
+; CHECK-BE-NEXT: ld1 { v19.4s }, [x9]
+; CHECK-BE-NEXT: adrp x9, .LCPI1_4
+; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI1_4
+; CHECK-BE-NEXT: ld1 { v0.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #192
+; CHECK-BE-NEXT: ld1 { v24.4s }, [x10]
+; CHECK-BE-NEXT: ld1 { v28.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x3, #96
+; CHECK-BE-NEXT: ld1 { v3.4s }, [x8]
+; CHECK-BE-NEXT: dup v6.4s, v17.s[0]
+; CHECK-BE-NEXT: add x8, x2, #176
+; CHECK-BE-NEXT: ld1 { v23.4s }, [x9]
+; CHECK-BE-NEXT: add x9, x3, #48
+; CHECK-BE-NEXT: ld1 { v7.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #160
+; CHECK-BE-NEXT: ld1 { v29.4s }, [x9]
+; CHECK-BE-NEXT: adrp x9, .LCPI1_5
+; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI1_5
+; CHECK-BE-NEXT: ld1 { v4.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #144
+; CHECK-BE-NEXT: and v16.16b, v6.16b, v24.16b
+; CHECK-BE-NEXT: and v30.16b, v6.16b, v28.16b
+; CHECK-BE-NEXT: ld1 { v25.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v5.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #96
+; CHECK-BE-NEXT: adrp x9, .LCPI1_6
+; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI1_6
+; CHECK-BE-NEXT: ld1 { v22.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #48
+; CHECK-BE-NEXT: ld1 { v27.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v20.4s }, [x8]
+; CHECK-BE-NEXT: cmeq v16.4s, v16.4s, #0
+; CHECK-BE-NEXT: and v8.16b, v6.16b, v25.16b
+; CHECK-BE-NEXT: cmeq v30.4s, v30.4s, #0
+; CHECK-BE-NEXT: add x8, x3, #80
+; CHECK-BE-NEXT: adrp x9, .LCPI1_7
+; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI1_7
+; CHECK-BE-NEXT: add x10, x2, #64
+; CHECK-BE-NEXT: ld1 { v26.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x3, #64
+; CHECK-BE-NEXT: ld1 { v31.4s }, [x9]
+; CHECK-BE-NEXT: adrp x9, .LCPI1_0
+; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI1_0
+; CHECK-BE-NEXT: and v10.16b, v6.16b, v27.16b
+; CHECK-BE-NEXT: ld1 { v21.4s }, [x10]
+; CHECK-BE-NEXT: ld1 { v9.4s }, [x8]
+; CHECK-BE-NEXT: bsl v16.16b, v29.16b, v20.16b
+; CHECK-BE-NEXT: ld1 { v20.4s }, [x9]
+; CHECK-BE-NEXT: cmeq v8.4s, v8.4s, #0
+; CHECK-BE-NEXT: dup v29.4s, v17.s[1]
+; CHECK-BE-NEXT: mov v17.16b, v30.16b
+; CHECK-BE-NEXT: add x8, x2, #112
+; CHECK-BE-NEXT: and v11.16b, v6.16b, v31.16b
+; CHECK-BE-NEXT: cmeq v30.4s, v10.4s, #0
+; CHECK-BE-NEXT: add x9, x3, #112
+; CHECK-BE-NEXT: bsl v17.16b, v9.16b, v21.16b
+; CHECK-BE-NEXT: ld1 { v9.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #32
+; CHECK-BE-NEXT: bit v19.16b, v26.16b, v8.16b
+; CHECK-BE-NEXT: and v8.16b, v29.16b, v20.16b
+; CHECK-BE-NEXT: ld1 { v21.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x2, #16
+; CHECK-BE-NEXT: ld1 { v10.4s }, [x9]
+; CHECK-BE-NEXT: cmeq v26.4s, v11.4s, #0
+; CHECK-BE-NEXT: bif v23.16b, v22.16b, v30.16b
+; CHECK-BE-NEXT: ld1 { v22.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x3, #128
+; CHECK-BE-NEXT: and v30.16b, v29.16b, v24.16b
+; CHECK-BE-NEXT: ld1 { v11.4s }, [x8]
+; CHECK-BE-NEXT: cmeq v8.4s, v8.4s, #0
+; CHECK-BE-NEXT: add x9, x3, #176
+; CHECK-BE-NEXT: bsl v26.16b, v10.16b, v9.16b
+; CHECK-BE-NEXT: add x8, x3, #240
+; CHECK-BE-NEXT: ld1 { v9.4s }, [x9]
+; CHECK-BE-NEXT: and v28.16b, v29.16b, v28.16b
+; CHECK-BE-NEXT: and v31.16b, v29.16b, v31.16b
+; CHECK-BE-NEXT: cmeq v30.4s, v30.4s, #0
+; CHECK-BE-NEXT: bit v18.16b, v11.16b, v8.16b
+; CHECK-BE-NEXT: ld1 { v8.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x3, #224
+; CHECK-BE-NEXT: adrp x9, .LCPI1_2
+; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI1_2
+; CHECK-BE-NEXT: ld1 { v10.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x3, #192
+; CHECK-BE-NEXT: cmeq v28.4s, v28.4s, #0
+; CHECK-BE-NEXT: bit v7.16b, v9.16b, v30.16b
+; CHECK-BE-NEXT: ld1 { v30.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x3, #208
+; CHECK-BE-NEXT: ld1 { v9.4s }, [x8]
+; CHECK-BE-NEXT: and v27.16b, v29.16b, v27.16b
+; CHECK-BE-NEXT: cmeq v31.4s, v31.4s, #0
+; CHECK-BE-NEXT: adrp x8, .LCPI1_1
+; CHECK-BE-NEXT: add x8, x8, :lo12:.LCPI1_1
+; CHECK-BE-NEXT: and v25.16b, v29.16b, v25.16b
+; CHECK-BE-NEXT: bit v3.16b, v30.16b, v28.16b
+; CHECK-BE-NEXT: ld1 { v28.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v30.4s }, [x8]
+; CHECK-BE-NEXT: cmeq v27.4s, v27.4s, #0
+; CHECK-BE-NEXT: bit v2.16b, v8.16b, v31.16b
+; CHECK-BE-NEXT: add x8, x3, #160
+; CHECK-BE-NEXT: cmeq v25.4s, v25.4s, #0
+; CHECK-BE-NEXT: ld1 { v31.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x3, #144
+; CHECK-BE-NEXT: and v8.16b, v29.16b, v28.16b
+; CHECK-BE-NEXT: and v29.16b, v29.16b, v30.16b
+; CHECK-BE-NEXT: add x9, x3, #32
+; CHECK-BE-NEXT: bit v1.16b, v10.16b, v27.16b
+; CHECK-BE-NEXT: ld1 { v27.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x3, #16
+; CHECK-BE-NEXT: bit v0.16b, v9.16b, v25.16b
+; CHECK-BE-NEXT: ld1 { v25.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #240
+; CHECK-BE-NEXT: cmeq v8.4s, v8.4s, #0
+; CHECK-BE-NEXT: cmeq v29.4s, v29.4s, #0
+; CHECK-BE-NEXT: ld1 { v24.4s }, [x2]
+; CHECK-BE-NEXT: ld1 { v10.4s }, [x9]
+; CHECK-BE-NEXT: ld1 { v9.4s }, [x3]
+; CHECK-BE-NEXT: st1 { v2.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #224
+; CHECK-BE-NEXT: mov v2.16b, v8.16b
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: mov v1.16b, v29.16b
+; CHECK-BE-NEXT: add x8, x0, #208
+; CHECK-BE-NEXT: st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #192
+; CHECK-BE-NEXT: and v0.16b, v6.16b, v28.16b
+; CHECK-BE-NEXT: bsl v2.16b, v31.16b, v4.16b
+; CHECK-BE-NEXT: bsl v1.16b, v27.16b, v5.16b
+; CHECK-BE-NEXT: st1 { v3.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #176
+; CHECK-BE-NEXT: and v3.16b, v6.16b, v30.16b
+; CHECK-BE-NEXT: and v4.16b, v6.16b, v20.16b
+; CHECK-BE-NEXT: st1 { v7.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #160
+; CHECK-BE-NEXT: cmeq v0.4s, v0.4s, #0
+; CHECK-BE-NEXT: st1 { v2.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #144
+; CHECK-BE-NEXT: cmeq v2.4s, v4.4s, #0
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #128
+; CHECK-BE-NEXT: cmeq v1.4s, v3.4s, #0
+; CHECK-BE-NEXT: st1 { v18.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #112
+; CHECK-BE-NEXT: bsl v0.16b, v10.16b, v21.16b
+; CHECK-BE-NEXT: st1 { v26.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #96
+; CHECK-BE-NEXT: bsl v2.16b, v9.16b, v24.16b
+; CHECK-BE-NEXT: st1 { v23.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #80
+; CHECK-BE-NEXT: bsl v1.16b, v25.16b, v22.16b
+; CHECK-BE-NEXT: st1 { v19.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #64
+; CHECK-BE-NEXT: st1 { v17.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #48
+; CHECK-BE-NEXT: st1 { v16.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #32
+; CHECK-BE-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
+; CHECK-BE-NEXT: st1 { v0.4s }, [x8]
+; CHECK-BE-NEXT: add x8, x0, #16
+; CHECK-BE-NEXT: st1 { v2.4s }, [x0]
+; CHECK-BE-NEXT: st1 { v1.4s }, [x8]
+; CHECK-BE-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
+; CHECK-BE-NEXT: ret
+start:
+ %if_true.val = load <64 x i32>, ptr %if_true, align 4
+ %if_false.val = load <64 x i32>, ptr %if_false, align 4
+ %0 = bitcast i64 %mask to <64 x i1>
+ %1 = select <64 x i1> %0, <64 x i32> %if_true.val, <64 x i32> %if_false.val
+ store <64 x i32> %1, ptr %out, align 4
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
More information about the llvm-commits
mailing list