[llvm] [AArch64] Support MI and PL (PR #150314)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 8 05:25:21 PDT 2025
https://github.com/AZero13 updated https://github.com/llvm/llvm-project/pull/150314
From 69063005e21286e83a6544c45de6450babb9392f Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Tue, 5 Aug 2025 11:08:55 -0400
Subject: [PATCH 1/2] [AArch64] Support MI and PL
Now, why would we want to do this?
When the right-hand side of a signed compare is zero, the subtraction cannot overflow, so GE/LT are equivalent to PL/MI, which read only the N flag. There are a few places where this helps:
1. It helps the peephole optimizer, since MI and PL depend on fewer flags.
2. It allows comparisons such as x - 0x80000000 < 0 to be folded into a cmp against a register holding that value.
3. We can refine the other passes over time to take advantage of this.
A minimal before/after example of the resulting codegen is shown below.
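
For illustration (a minimal sketch in the spirit of the updated tests; the function name is made up), a signed compare against zero feeding a select now selects on the N flag alone:

    define i32 @select_slt_zero(i32 %x, i32 %a, i32 %b) {
      %c = icmp slt i32 %x, 0          ; signed x < 0, i.e. sign bit set
      %r = select i1 %c, i32 %a, i32 %b
      ret i32 %r
    }

    ; Before: cmp  w0, #0
    ;         csel w0, w1, w2, lt
    ; After:  cmp  w0, #0
    ;         csel w0, w1, w2, mi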
---
.../Target/AArch64/AArch64ISelLowering.cpp | 17 +-
.../GISel/AArch64InstructionSelector.cpp | 30 +++-
.../16bit-float-promotion-with-nofp.ll | 8 +-
.../AArch64/GlobalISel/opt-and-tbnz-tbz.mir | 2 +-
...postlegalizer-lowering-adjust-icmp-imm.mir | 4 +-
.../GlobalISel/select-tbnz-from-cmp.mir | 2 +-
llvm/test/CodeGen/AArch64/arm64-ccmp.ll | 14 +-
llvm/test/CodeGen/AArch64/arm64-fmax.ll | 2 +-
llvm/test/CodeGen/AArch64/arm64-fp128.ll | 4 +-
llvm/test/CodeGen/AArch64/arm64-vabs.ll | 4 +-
.../check-sign-bit-before-extension.ll | 8 +-
llvm/test/CodeGen/AArch64/combine-sdiv.ll | 8 +-
llvm/test/CodeGen/AArch64/csel-cmp-cse.ll | 2 +-
llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll | 52 +++++-
llvm/test/CodeGen/AArch64/fcmp-fp128.ll | 12 +-
llvm/test/CodeGen/AArch64/fcmp.ll | 82 ++++-----
llvm/test/CodeGen/AArch64/fp-intrinsics.ll | 16 +-
llvm/test/CodeGen/AArch64/fpclamptosat.ll | 6 +-
llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll | 12 +-
.../test/CodeGen/AArch64/fptosi-sat-scalar.ll | 30 ++--
.../test/CodeGen/AArch64/fptosi-sat-vector.ll | 76 ++++-----
.../test/CodeGen/AArch64/fptoui-sat-scalar.ll | 6 +-
.../test/CodeGen/AArch64/fptoui-sat-vector.ll | 72 ++++----
.../CodeGen/AArch64/logical_shifted_reg.ll | 4 +-
llvm/test/CodeGen/AArch64/min-max-combine.ll | 8 +-
llvm/test/CodeGen/AArch64/pr72777.ll | 17 +-
llvm/test/CodeGen/AArch64/sdivpow2.ll | 159 ++++++++++++------
.../CodeGen/AArch64/select-constant-xor.ll | 34 ++--
.../CodeGen/AArch64/selectcc-to-shiftand.ll | 12 +-
llvm/test/CodeGen/AArch64/signbit-shift.ll | 4 +-
llvm/test/CodeGen/AArch64/smul_fix_sat.ll | 18 +-
llvm/test/CodeGen/AArch64/srem-pow2.ll | 4 +-
llvm/test/CodeGen/AArch64/sshl_sat.ll | 2 +-
llvm/test/CodeGen/AArch64/stack-hazard.ll | 12 +-
llvm/test/CodeGen/AArch64/tbz-tbnz.ll | 14 +-
llvm/test/CodeGen/AArch64/vecreduce-bool.ll | 24 +--
.../AArch64/vecreduce-fmin-legalization.ll | 2 +-
37 files changed, 449 insertions(+), 334 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2b6ea86ee1af5..2c02630174ad7 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3320,7 +3320,8 @@ static bool isZerosVector(const SDNode *N) {
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
-static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
+static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC,
+ SDValue RHS = {}) {
switch (CC) {
default:
llvm_unreachable("Unknown condition code!");
@@ -3331,9 +3332,9 @@ static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
case ISD::SETGT:
return AArch64CC::GT;
case ISD::SETGE:
- return AArch64CC::GE;
+ return (RHS && isNullConstant(RHS)) ? AArch64CC::PL : AArch64CC::GE;
case ISD::SETLT:
- return AArch64CC::LT;
+ return (RHS && isNullConstant(RHS)) ? AArch64CC::MI : AArch64CC::LT;
case ISD::SETLE:
return AArch64CC::LE;
case ISD::SETUGT:
@@ -3782,7 +3783,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
- OutCC = changeIntCCToAArch64CC(CC);
+ OutCC = changeIntCCToAArch64CC(CC, RHS);
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
@@ -4065,7 +4066,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
Cmp = emitComparison(
SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
DL, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64CC = changeIntCCToAArch64CC(CC, RHS);
}
}
@@ -4079,7 +4080,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
if (!Cmp) {
Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64CC = changeIntCCToAArch64CC(CC, RHS);
}
AArch64cc = getCondCode(DAG, AArch64CC);
return Cmp;
@@ -11492,7 +11493,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
!RHSVal->isZero() && !RHSVal->isAllOnes()) {
- AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC, RHS);
// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
// "a != C ? x : a" to avoid materializing C.
if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
@@ -11503,7 +11504,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
- AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC, RHS);
if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
Opcode = AArch64ISD::CSINV;
TVal = LHS;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index d9056926ff249..8e8d68acdd5f9 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1349,7 +1349,9 @@ AArch64InstructionSelector::emitSelect(Register Dst, Register True,
return &*SelectInst;
}
-static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
+static AArch64CC::CondCode
+changeICMPPredToAArch64CC(CmpInst::Predicate P, Register RHS = {},
+ MachineRegisterInfo *MRI = nullptr) {
switch (P) {
default:
llvm_unreachable("Unknown condition code!");
@@ -1360,8 +1362,18 @@ static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
case CmpInst::ICMP_SGT:
return AArch64CC::GT;
case CmpInst::ICMP_SGE:
+ if (RHS && MRI) {
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
+ if (ValAndVReg && ValAndVReg->Value == 0)
+ return AArch64CC::PL;
+ }
return AArch64CC::GE;
case CmpInst::ICMP_SLT:
+ if (RHS && MRI) {
+ auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
+ if (ValAndVReg && ValAndVReg->Value == 0)
+ return AArch64CC::MI;
+ }
return AArch64CC::LT;
case CmpInst::ICMP_SLE:
return AArch64CC::LE;
@@ -1813,7 +1825,8 @@ bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
auto &PredOp = ICmp.getOperand(1);
emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
- static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
+ static_cast<CmpInst::Predicate>(PredOp.getPredicate()),
+ ICmp.getOperand(3).getReg(), MIB.getMRI());
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
I.eraseFromParent();
return true;
@@ -2510,8 +2523,8 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
/*RHS=*/Cmp->getOperand(3), PredOp, MIB);
auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- const AArch64CC::CondCode InvCC =
- changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
+ const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
+ CmpInst::getInversePredicate(Pred), Cmp->getOperand(3).getReg(), &MRI);
emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
I.eraseFromParent();
return true;
@@ -3577,8 +3590,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
auto &PredOp = I.getOperand(1);
emitIntegerCompare(I.getOperand(2), I.getOperand(3), PredOp, MIB);
auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- const AArch64CC::CondCode InvCC =
- changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
+ const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
+ CmpInst::getInversePredicate(Pred), I.getOperand(3).getReg(), &MRI);
emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
/*Src2=*/AArch64::WZR, InvCC, MIB);
I.eraseFromParent();
@@ -4931,7 +4944,7 @@ MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
if (Negate)
CC = CmpInst::getInversePredicate(CC);
if (isa<GICmp>(Cmp)) {
- OutCC = changeICMPPredToAArch64CC(CC);
+ OutCC = changeICMPPredToAArch64CC(CC, RHS, MIB.getMRI());
} else {
// Handle special FP cases.
AArch64CC::CondCode ExtraCC;
@@ -5101,7 +5114,8 @@ bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3), PredOp,
MIB);
auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
- CondCode = changeICMPPredToAArch64CC(Pred);
+ CondCode =
+ changeICMPPredToAArch64CC(Pred, CondDef->getOperand(3).getReg(), &MRI);
} else {
// Get the condition code for the select.
auto Pred =
diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
index 5d4f9204e7063..c9560e705280b 100644
--- a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
+++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
@@ -77,7 +77,7 @@ define double @selectcc_f64(double %a, double %b, i32 %d) {
; CHECK-LABEL: selectcc_f64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w2, #0
-; CHECK-NEXT: csel x0, x0, x1, lt
+; CHECK-NEXT: csel x0, x0, x1, mi
; CHECK-NEXT: ret
entry:
%c = icmp slt i32 %d, 0
@@ -89,7 +89,7 @@ define float @selectcc_f32(float %a, float %b, i32 %d) {
; CHECK-LABEL: selectcc_f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w2, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
entry:
%c = icmp slt i32 %d, 0
@@ -101,7 +101,7 @@ define half @selectcc_f16(half %a, half %b, i32 %d) {
; CHECK-LABEL: selectcc_f16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w2, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
entry:
%c = icmp slt i32 %d, 0
@@ -113,7 +113,7 @@ define bfloat @selectcc_bf16(bfloat %a, bfloat %b, i32 %d) {
; CHECK-LABEL: selectcc_bf16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w2, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
entry:
%c = icmp slt i32 %d, 0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir
index 95ae12f6d59db..a5b6ea487aac4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/opt-and-tbnz-tbz.mir
@@ -149,7 +149,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0
; CHECK-NEXT: [[ANDSWri:%[0-9]+]]:gpr32 = ANDSWri [[COPY]], 0, implicit-def $nzcv
- ; CHECK-NEXT: Bcc 11, %bb.1, implicit $nzcv
+ ; CHECK-NEXT: Bcc 4, %bb.1, implicit $nzcv
; CHECK-NEXT: B %bb.0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-adjust-icmp-imm.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-adjust-icmp-imm.mir
index edc33e340c9b6..3b991c3d910d5 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-adjust-icmp-imm.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-adjust-icmp-imm.mir
@@ -661,7 +661,7 @@ body: |
; SELECT-NEXT: %reg0:gpr32common = COPY $w0
; SELECT-NEXT: %reg1:gpr32 = COPY $w1
; SELECT-NEXT: [[SUBSWri:%[0-9]+]]:gpr32 = SUBSWri %reg0, 0, 0, implicit-def $nzcv
- ; SELECT-NEXT: %select:gpr32 = CSELWr %reg0, %reg1, 11, implicit $nzcv
+ ; SELECT-NEXT: %select:gpr32 = CSELWr %reg0, %reg1, 4, implicit $nzcv
; SELECT-NEXT: $w0 = COPY %select
; SELECT-NEXT: RET_ReallyLR implicit $w0
%reg0:_(s32) = COPY $w0
@@ -699,7 +699,7 @@ body: |
; SELECT-NEXT: {{ $}}
; SELECT-NEXT: %reg0:gpr64 = COPY $x0
; SELECT-NEXT: [[ANDSXri:%[0-9]+]]:gpr64 = ANDSXri %reg0, 8000, implicit-def $nzcv
- ; SELECT-NEXT: %cmp:gpr32 = CSINCWr $wzr, $wzr, 11, implicit $nzcv
+ ; SELECT-NEXT: %cmp:gpr32 = CSINCWr $wzr, $wzr, 4, implicit $nzcv
; SELECT-NEXT: $w0 = COPY %cmp
; SELECT-NEXT: RET_ReallyLR implicit $w0
%reg0:gpr(s64) = COPY $x0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-tbnz-from-cmp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-tbnz-from-cmp.mir
index 30db00aa97813..67262c27e2059 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-tbnz-from-cmp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-tbnz-from-cmp.mir
@@ -166,7 +166,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: %copy:gpr64 = COPY $x0
; CHECK-NEXT: [[ANDSXri:%[0-9]+]]:gpr64 = ANDSXri %copy, 8000, implicit-def $nzcv
- ; CHECK-NEXT: Bcc 11, %bb.1, implicit $nzcv
+ ; CHECK-NEXT: Bcc 4, %bb.1, implicit $nzcv
; CHECK-NEXT: B %bb.0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
index a546ffd7143ad..4fe01e838771d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll
@@ -600,7 +600,7 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
; CHECK-SD-LABEL: select_noccmp1:
; CHECK-SD: ; %bb.0:
; CHECK-SD-NEXT: cmp x0, #0
-; CHECK-SD-NEXT: ccmp x0, #13, #4, lt
+; CHECK-SD-NEXT: ccmp x0, #13, #4, mi
; CHECK-SD-NEXT: cset w8, gt
; CHECK-SD-NEXT: cmp x2, #2
; CHECK-SD-NEXT: ccmp x2, #4, #4, lt
@@ -630,7 +630,7 @@ define i64 @select_noccmp2(i64 %v1, i64 %v2, i64 %v3, i64 %r) {
; CHECK-SD-LABEL: select_noccmp2:
; CHECK-SD: ; %bb.0:
; CHECK-SD-NEXT: cmp x0, #0
-; CHECK-SD-NEXT: ccmp x0, #13, #0, ge
+; CHECK-SD-NEXT: ccmp x0, #13, #0, pl
; CHECK-SD-NEXT: cset w8, gt
; CHECK-SD-NEXT: cmp w8, #0
; CHECK-SD-NEXT: csel x0, xzr, x3, ne
@@ -664,7 +664,7 @@ define i32 @select_noccmp3(i32 %v0, i32 %v1, i32 %v2) {
; CHECK-SD-LABEL: select_noccmp3:
; CHECK-SD: ; %bb.0:
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: ccmp w0, #13, #0, ge
+; CHECK-SD-NEXT: ccmp w0, #13, #0, pl
; CHECK-SD-NEXT: cset w8, gt
; CHECK-SD-NEXT: cmp w0, #22
; CHECK-SD-NEXT: mov w9, #44 ; =0x2c
@@ -937,11 +937,11 @@ define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3,
; CHECK-SD-NEXT: stp q2, q3, [sp] ; 32-byte Folded Spill
; CHECK-SD-NEXT: bl ___lttf2
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: cset w21, lt
+; CHECK-SD-NEXT: cset w21, mi
; CHECK-SD-NEXT: ldp q0, q1, [sp] ; 32-byte Folded Reload
; CHECK-SD-NEXT: bl ___getf2
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: cset w8, ge
+; CHECK-SD-NEXT: cset w8, pl
; CHECK-SD-NEXT: tst w8, w21
; CHECK-SD-NEXT: csel w0, w20, w19, ne
; CHECK-SD-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
@@ -964,8 +964,8 @@ define i32 @f128_select_and_olt_oge(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3,
; CHECK-GI-NEXT: ldp q1, q0, [sp] ; 32-byte Folded Reload
; CHECK-GI-NEXT: bl ___getf2
; CHECK-GI-NEXT: cmp w21, #0
-; CHECK-GI-NEXT: ccmp w0, #0, #8, lt
-; CHECK-GI-NEXT: csel w0, w19, w20, ge
+; CHECK-GI-NEXT: ccmp w0, #0, #8, mi
+; CHECK-GI-NEXT: csel w0, w19, w20, pl
; CHECK-GI-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/arm64-fmax.ll b/llvm/test/CodeGen/AArch64/arm64-fmax.ll
index d7d54a6e48a92..85104775339b6 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fmax.ll
@@ -60,7 +60,7 @@ define i64 @test_integer(i64 %in) {
; CHECK-LABEL: test_integer:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: csel x0, xzr, x0, lt
+; CHECK-NEXT: csel x0, xzr, x0, mi
; CHECK-NEXT: ret
%cmp = icmp slt i64 %in, 0
%val = select i1 %cmp, i64 0, i64 %in
diff --git a/llvm/test/CodeGen/AArch64/arm64-fp128.ll b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
index a75f6419d5a5a..3e4b887fed55d 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fp128.ll
@@ -258,7 +258,7 @@ define i32 @test_br_cc(fp128 %lhs, fp128 %rhs) {
; CHECK-SD-NEXT: mov w8, #29 // =0x1d
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: mov w9, #42 // =0x2a
-; CHECK-SD-NEXT: csel w0, w9, w8, lt
+; CHECK-SD-NEXT: csel w0, w9, w8, mi
; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-SD-NEXT: ret
;
@@ -271,7 +271,7 @@ define i32 @test_br_cc(fp128 %lhs, fp128 %rhs) {
; CHECK-GI-NEXT: mov w8, #29 // =0x1d
; CHECK-GI-NEXT: mov w9, #42 // =0x2a
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w9, w8, lt
+; CHECK-GI-NEXT: csel w0, w9, w8, mi
; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
%cond = fcmp olt fp128 %lhs, %rhs
diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
index 78881c80ccc10..00e9a1baec727 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll
@@ -1830,10 +1830,10 @@ define <2 x i128> @uabd_i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-GI-NEXT: subs x10, x11, x13
; CHECK-GI-NEXT: sbc x11, x14, x15
; CHECK-GI-NEXT: cmp x9, #0
-; CHECK-GI-NEXT: cset w12, lt
+; CHECK-GI-NEXT: cset w12, mi
; CHECK-GI-NEXT: csel w12, wzr, w12, eq
; CHECK-GI-NEXT: cmp x11, #0
-; CHECK-GI-NEXT: cset w13, lt
+; CHECK-GI-NEXT: cset w13, mi
; CHECK-GI-NEXT: csel w13, wzr, w13, eq
; CHECK-GI-NEXT: negs x14, x8
; CHECK-GI-NEXT: ngc x15, x9
diff --git a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
index 1d60929f2b94c..0960c4c2a3342 100644
--- a/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
+++ b/llvm/test/CodeGen/AArch64/check-sign-bit-before-extension.ll
@@ -80,7 +80,7 @@ define i32 @g_i8_sign_extend_inreg(i8 %in, i32 %a, i32 %b) nounwind {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sxtb w8, w0
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w8, w1, w2, lt
+; CHECK-NEXT: csel w8, w1, w2, mi
; CHECK-NEXT: add w0, w8, w0, uxtb
; CHECK-NEXT: ret
entry:
@@ -102,7 +102,7 @@ define i32 @g_i16_sign_extend_inreg(i16 %in, i32 %a, i32 %b) nounwind {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sxth w8, w0
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w8, w1, w2, lt
+; CHECK-NEXT: csel w8, w1, w2, mi
; CHECK-NEXT: add w0, w8, w0, uxth
; CHECK-NEXT: ret
entry:
@@ -123,7 +123,7 @@ define i64 @g_i32_sign_extend_inreg(i32 %in, i64 %a, i64 %b) nounwind {
; CHECK-LABEL: g_i32_sign_extend_inreg:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: csel x8, x1, x2, lt
+; CHECK-NEXT: csel x8, x1, x2, mi
; CHECK-NEXT: add x0, x8, w0, uxtw
; CHECK-NEXT: ret
entry:
@@ -170,7 +170,7 @@ define i64 @g_i32_sign_extend_i64(i32 %in, i64 %a, i64 %b) nounwind {
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: sxtw x8, w0
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x8, x1, x2, lt
+; CHECK-NEXT: csel x8, x1, x2, mi
; CHECK-NEXT: add x0, x8, w0, uxtw
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index e1ba0e98a6c01..6208a697cab11 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -1464,7 +1464,7 @@ define i32 @combine_i32_sdiv_pow2(i32 %x) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add w8, w0, #15
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: csel w8, w8, w0, lt
+; CHECK-SD-NEXT: csel w8, w8, w0, mi
; CHECK-SD-NEXT: asr w0, w8, #4
; CHECK-SD-NEXT: ret
;
@@ -1483,7 +1483,7 @@ define i32 @combine_i32_sdiv_negpow2(i32 %x) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add w8, w0, #255
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: csel w8, w8, w0, lt
+; CHECK-SD-NEXT: csel w8, w8, w0, mi
; CHECK-SD-NEXT: neg w0, w8, asr #8
; CHECK-SD-NEXT: ret
;
@@ -1502,7 +1502,7 @@ define i64 @combine_i64_sdiv_pow2(i64 %x) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add x8, x0, #15
; CHECK-SD-NEXT: cmp x0, #0
-; CHECK-SD-NEXT: csel x8, x8, x0, lt
+; CHECK-SD-NEXT: csel x8, x8, x0, mi
; CHECK-SD-NEXT: asr x0, x8, #4
; CHECK-SD-NEXT: ret
;
@@ -1521,7 +1521,7 @@ define i64 @combine_i64_sdiv_negpow2(i64 %x) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: add x8, x0, #255
; CHECK-SD-NEXT: cmp x0, #0
-; CHECK-SD-NEXT: csel x8, x8, x0, lt
+; CHECK-SD-NEXT: csel x8, x8, x0, mi
; CHECK-SD-NEXT: neg x0, x8, asr #8
; CHECK-SD-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/csel-cmp-cse.ll b/llvm/test/CodeGen/AArch64/csel-cmp-cse.ll
index e745326323329..07b32ef0830c2 100644
--- a/llvm/test/CodeGen/AArch64/csel-cmp-cse.ll
+++ b/llvm/test/CodeGen/AArch64/csel-cmp-cse.ll
@@ -706,7 +706,7 @@ define i32 @test_ugtsmax_sub_add_i32(i32 %x0, i32 %x1) {
; CHECK-NEXT: add w9, w0, w1
; CHECK-NEXT: cmp w1, #0
; CHECK-NEXT: add w8, w9, w8
-; CHECK-NEXT: csel w0, wzr, w8, lt
+; CHECK-NEXT: csel w0, wzr, w8, mi
; CHECK-NEXT: ret
%cmp = icmp ugt i32 %x1, 2147483647
%add = add i32 %x0, %x1
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll b/llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll
index 539fe7e7d3c83..7b6780d0e1e1f 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-sdiv.ll
@@ -1,12 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc -mtriple=aarch64-linux-gnu -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -fast-isel -fast-isel-abort=1 -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-FAST-ISEL
define i32 @sdiv_i32_exact(i32 %a) {
; CHECK-LABEL: sdiv_i32_exact:
; CHECK: // %bb.0:
; CHECK-NEXT: asr w0, w0, #3
; CHECK-NEXT: ret
+;
+; CHECK-FAST-ISEL-LABEL: sdiv_i32_exact:
+; CHECK-FAST-ISEL: // %bb.0:
+; CHECK-FAST-ISEL-NEXT: asr w0, w0, #3
+; CHECK-FAST-ISEL-NEXT: ret
%1 = sdiv exact i32 %a, 8
ret i32 %1
}
@@ -16,9 +21,17 @@ define i32 @sdiv_i32_pos(i32 %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #7
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: csel w8, w8, w0, lt
+; CHECK-NEXT: csel w8, w8, w0, mi
; CHECK-NEXT: asr w0, w8, #3
; CHECK-NEXT: ret
+;
+; CHECK-FAST-ISEL-LABEL: sdiv_i32_pos:
+; CHECK-FAST-ISEL: // %bb.0:
+; CHECK-FAST-ISEL-NEXT: add w8, w0, #7
+; CHECK-FAST-ISEL-NEXT: cmp w0, #0
+; CHECK-FAST-ISEL-NEXT: csel w8, w8, w0, lt
+; CHECK-FAST-ISEL-NEXT: asr w0, w8, #3
+; CHECK-FAST-ISEL-NEXT: ret
%1 = sdiv i32 %a, 8
ret i32 %1
}
@@ -28,9 +41,17 @@ define i32 @sdiv_i32_neg(i32 %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, #7
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: csel w8, w8, w0, lt
+; CHECK-NEXT: csel w8, w8, w0, mi
; CHECK-NEXT: neg w0, w8, asr #3
; CHECK-NEXT: ret
+;
+; CHECK-FAST-ISEL-LABEL: sdiv_i32_neg:
+; CHECK-FAST-ISEL: // %bb.0:
+; CHECK-FAST-ISEL-NEXT: add w8, w0, #7
+; CHECK-FAST-ISEL-NEXT: cmp w0, #0
+; CHECK-FAST-ISEL-NEXT: csel w8, w8, w0, lt
+; CHECK-FAST-ISEL-NEXT: neg w0, w8, asr #3
+; CHECK-FAST-ISEL-NEXT: ret
%1 = sdiv i32 %a, -8
ret i32 %1
}
@@ -40,6 +61,11 @@ define i64 @sdiv_i64_exact(i64 %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: asr x0, x0, #4
; CHECK-NEXT: ret
+;
+; CHECK-FAST-ISEL-LABEL: sdiv_i64_exact:
+; CHECK-FAST-ISEL: // %bb.0:
+; CHECK-FAST-ISEL-NEXT: asr x0, x0, #4
+; CHECK-FAST-ISEL-NEXT: ret
%1 = sdiv exact i64 %a, 16
ret i64 %1
}
@@ -49,9 +75,17 @@ define i64 @sdiv_i64_pos(i64 %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x0, #15
; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: csel x8, x8, x0, lt
+; CHECK-NEXT: csel x8, x8, x0, mi
; CHECK-NEXT: asr x0, x8, #4
; CHECK-NEXT: ret
+;
+; CHECK-FAST-ISEL-LABEL: sdiv_i64_pos:
+; CHECK-FAST-ISEL: // %bb.0:
+; CHECK-FAST-ISEL-NEXT: add x8, x0, #15
+; CHECK-FAST-ISEL-NEXT: cmp x0, #0
+; CHECK-FAST-ISEL-NEXT: csel x8, x8, x0, lt
+; CHECK-FAST-ISEL-NEXT: asr x0, x8, #4
+; CHECK-FAST-ISEL-NEXT: ret
%1 = sdiv i64 %a, 16
ret i64 %1
}
@@ -61,9 +95,17 @@ define i64 @sdiv_i64_neg(i64 %a) {
; CHECK: // %bb.0:
; CHECK-NEXT: add x8, x0, #15
; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: csel x8, x8, x0, lt
+; CHECK-NEXT: csel x8, x8, x0, mi
; CHECK-NEXT: neg x0, x8, asr #4
; CHECK-NEXT: ret
+;
+; CHECK-FAST-ISEL-LABEL: sdiv_i64_neg:
+; CHECK-FAST-ISEL: // %bb.0:
+; CHECK-FAST-ISEL-NEXT: add x8, x0, #15
+; CHECK-FAST-ISEL-NEXT: cmp x0, #0
+; CHECK-FAST-ISEL-NEXT: csel x8, x8, x0, lt
+; CHECK-FAST-ISEL-NEXT: neg x0, x8, asr #4
+; CHECK-FAST-ISEL-NEXT: ret
%1 = sdiv i64 %a, -16
ret i64 %1
}
diff --git a/llvm/test/CodeGen/AArch64/fcmp-fp128.ll b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
index 503cb8c533bab..a2b4b61864741 100644
--- a/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp-fp128.ll
@@ -98,7 +98,7 @@ define double @olt(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT: fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT: fcsel d0, d9, d8, mi
; CHECK-SD-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NEXT: ret
;
@@ -115,7 +115,7 @@ define double @olt(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT: fcsel d0, d8, d9, mi
; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
@@ -412,7 +412,7 @@ define double @uge(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT: fcsel d0, d9, d8, ge
+; CHECK-SD-NEXT: fcsel d0, d9, d8, pl
; CHECK-SD-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NEXT: ret
;
@@ -429,7 +429,7 @@ define double @uge(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fcsel d0, d8, d9, ge
+; CHECK-GI-NEXT: fcsel d0, d8, d9, pl
; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
@@ -452,7 +452,7 @@ define double @ult(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-SD-NEXT: bl __getf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT: fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT: fcsel d0, d9, d8, mi
; CHECK-SD-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NEXT: ret
;
@@ -469,7 +469,7 @@ define double @ult(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-GI-NEXT: bl __getf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT: fcsel d0, d8, d9, mi
; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index f33f57e8751ca..6d673f1204c7f 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -15,7 +15,7 @@ define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) {
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: b.ge .LBB0_2
+; CHECK-SD-NEXT: b.pl .LBB0_2
; CHECK-SD-NEXT: // %bb.1: // %entry
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: .LBB0_2: // %entry
@@ -36,9 +36,9 @@ define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) {
; CHECK-GI-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-GI-NEXT: mov d0, v2.d[1]
; CHECK-GI-NEXT: mov d1, v3.d[1]
-; CHECK-GI-NEXT: fcsel d2, d2, d3, lt
+; CHECK-GI-NEXT: fcsel d2, d2, d3, mi
; CHECK-GI-NEXT: fmov x8, d2
-; CHECK-GI-NEXT: fcsel d1, d0, d1, lt
+; CHECK-GI-NEXT: fcsel d1, d0, d1, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: fmov x8, d1
; CHECK-GI-NEXT: mov v0.d[1], x8
@@ -71,13 +71,13 @@ define i128 @f128_i128(fp128 %a, fp128 %b, i128 %d, i128 %e) {
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: csel x20, x22, x20, lt
+; CHECK-SD-NEXT: csel x20, x22, x20, mi
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: mov w8, w0
; CHECK-SD-NEXT: mov x0, x20
; CHECK-SD-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
; CHECK-SD-NEXT: cmp w8, #0
-; CHECK-SD-NEXT: csel x1, x21, x19, lt
+; CHECK-SD-NEXT: csel x1, x21, x19, mi
; CHECK-SD-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: add sp, sp, #80
@@ -100,8 +100,8 @@ define i128 @f128_i128(fp128 %a, fp128 %b, i128 %d, i128 %e) {
; CHECK-GI-NEXT: mov x22, x3
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x0, x19, x21, lt
-; CHECK-GI-NEXT: csel x1, x20, x22, lt
+; CHECK-GI-NEXT: csel x0, x19, x21, mi
+; CHECK-GI-NEXT: csel x1, x20, x22, mi
; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
@@ -126,7 +126,7 @@ define double @f128_double(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT: fcsel d0, d9, d8, lt
+; CHECK-SD-NEXT: fcsel d0, d9, d8, mi
; CHECK-SD-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NEXT: ret
;
@@ -143,7 +143,7 @@ define double @f128_double(fp128 %a, fp128 %b, double %d, double %e) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fcsel d0, d8, d9, lt
+; CHECK-GI-NEXT: fcsel d0, d8, d9, mi
; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
@@ -166,7 +166,7 @@ define float @f128_float(fp128 %a, fp128 %b, float %d, float %e) {
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NEXT: fcsel s0, s9, s8, lt
+; CHECK-SD-NEXT: fcsel s0, s9, s8, mi
; CHECK-SD-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NEXT: ret
;
@@ -183,7 +183,7 @@ define float @f128_float(fp128 %a, fp128 %b, float %d, float %e) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: fcsel s0, s8, s9, lt
+; CHECK-GI-NEXT: fcsel s0, s8, s9, mi
; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
@@ -205,7 +205,7 @@ define i32 @f128_i32(fp128 %a, fp128 %b, i32 %d, i32 %e) {
; CHECK-SD-NEXT: mov w20, w0
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: csel w0, w20, w19, lt
+; CHECK-SD-NEXT: csel w0, w20, w19, mi
; CHECK-SD-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
; CHECK-SD-NEXT: ret
@@ -222,7 +222,7 @@ define i32 @f128_i32(fp128 %a, fp128 %b, i32 %d, i32 %e) {
; CHECK-GI-NEXT: mov w20, w1
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w19, w20, lt
+; CHECK-GI-NEXT: csel w0, w19, w20, mi
; CHECK-GI-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
; CHECK-GI-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
@@ -246,7 +246,7 @@ define half @f128_half(fp128 %a, fp128 %b, half %d, half %e) {
; CHECK-SD-NOFP16-NEXT: bl __lttf2
; CHECK-SD-NOFP16-NEXT: cmp w0, #0
; CHECK-SD-NOFP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-NOFP16-NEXT: fcsel s0, s9, s8, lt
+; CHECK-SD-NOFP16-NEXT: fcsel s0, s9, s8, mi
; CHECK-SD-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-SD-NOFP16-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-NOFP16-NEXT: ret
@@ -264,7 +264,7 @@ define half @f128_half(fp128 %a, fp128 %b, half %d, half %e) {
; CHECK-SD-FP16-NEXT: bl __lttf2
; CHECK-SD-FP16-NEXT: cmp w0, #0
; CHECK-SD-FP16-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-SD-FP16-NEXT: fcsel h0, h9, h8, lt
+; CHECK-SD-FP16-NEXT: fcsel h0, h9, h8, mi
; CHECK-SD-FP16-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
; CHECK-SD-FP16-NEXT: ret
;
@@ -283,7 +283,7 @@ define half @f128_half(fp128 %a, fp128 %b, half %d, half %e) {
; CHECK-GI-NEXT: fmov w9, s9
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: csel w8, w8, w9, lt
+; CHECK-GI-NEXT: csel w8, w8, w9, mi
; CHECK-GI-NEXT: fmov s0, w8
; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-GI-NEXT: ldp d9, d8, [sp], #32 // 16-byte Folded Reload
@@ -438,7 +438,7 @@ define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d,
; CHECK-SD-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: b.ge .LBB12_2
+; CHECK-SD-NEXT: b.pl .LBB12_2
; CHECK-SD-NEXT: // %bb.1: // %entry
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
@@ -447,7 +447,7 @@ define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d,
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: b.ge .LBB12_4
+; CHECK-SD-NEXT: b.pl .LBB12_4
; CHECK-SD-NEXT: // %bb.3: // %entry
; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: .LBB12_4: // %entry
@@ -476,18 +476,18 @@ define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d,
; CHECK-GI-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov d0, v2.d[1]
; CHECK-GI-NEXT: mov d1, v3.d[1]
-; CHECK-GI-NEXT: fcsel d2, d2, d3, lt
+; CHECK-GI-NEXT: fcsel d2, d2, d3, mi
; CHECK-GI-NEXT: fmov x8, d2
-; CHECK-GI-NEXT: fcsel d3, d0, d1, lt
+; CHECK-GI-NEXT: fcsel d3, d0, d1, mi
; CHECK-GI-NEXT: ldp q5, q0, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov d1, v0.d[1]
; CHECK-GI-NEXT: mov d4, v5.d[1]
-; CHECK-GI-NEXT: fcsel d0, d0, d5, lt
+; CHECK-GI-NEXT: fcsel d0, d0, d5, mi
; CHECK-GI-NEXT: fmov x9, d0
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: fmov x8, d3
-; CHECK-GI-NEXT: fcsel d2, d1, d4, lt
+; CHECK-GI-NEXT: fcsel d2, d1, d4, mi
; CHECK-GI-NEXT: mov v1.d[0], x9
; CHECK-GI-NEXT: fmov x9, d2
; CHECK-GI-NEXT: mov v0.d[1], x8
@@ -513,7 +513,7 @@ define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d,
; CHECK-SD-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: b.lt .LBB13_2
+; CHECK-SD-NEXT: b.mi .LBB13_2
; CHECK-SD-NEXT: // %bb.1:
; CHECK-SD-NEXT: ldr q0, [sp, #128]
; CHECK-SD-NEXT: str q0, [sp, #64] // 16-byte Folded Spill
@@ -521,7 +521,7 @@ define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d,
; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: b.lt .LBB13_4
+; CHECK-SD-NEXT: b.mi .LBB13_4
; CHECK-SD-NEXT: // %bb.3:
; CHECK-SD-NEXT: ldr q0, [sp, #144]
; CHECK-SD-NEXT: str q0, [sp, #80] // 16-byte Folded Spill
@@ -531,7 +531,7 @@ define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d,
; CHECK-SD-NEXT: add x8, sp, #160
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: add x9, sp, #112
-; CHECK-SD-NEXT: csel x8, x9, x8, lt
+; CHECK-SD-NEXT: csel x8, x9, x8, mi
; CHECK-SD-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload
; CHECK-SD-NEXT: ldr q2, [x8]
; CHECK-SD-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
@@ -571,24 +571,24 @@ define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d,
; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload
; CHECK-GI-NEXT: mov d0, v4.d[1]
; CHECK-GI-NEXT: mov d1, v5.d[1]
-; CHECK-GI-NEXT: fcsel d4, d4, d5, lt
+; CHECK-GI-NEXT: fcsel d4, d4, d5, mi
; CHECK-GI-NEXT: mov d2, v7.d[1]
; CHECK-GI-NEXT: mov d3, v6.d[1]
; CHECK-GI-NEXT: fmov x8, d4
-; CHECK-GI-NEXT: fcsel d5, d0, d1, lt
+; CHECK-GI-NEXT: fcsel d5, d0, d1, mi
; CHECK-GI-NEXT: cmp w20, #0
-; CHECK-GI-NEXT: fcsel d1, d7, d6, lt
+; CHECK-GI-NEXT: fcsel d1, d7, d6, mi
; CHECK-GI-NEXT: ldp q7, q0, [sp, #128] // 32-byte Folded Reload
-; CHECK-GI-NEXT: fcsel d3, d2, d3, lt
+; CHECK-GI-NEXT: fcsel d3, d2, d3, mi
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: mov d6, v7.d[1]
-; CHECK-GI-NEXT: fcsel d7, d0, d7, lt
+; CHECK-GI-NEXT: fcsel d7, d0, d7, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: fmov x8, d1
; CHECK-GI-NEXT: fmov x9, d7
-; CHECK-GI-NEXT: fcsel d4, d2, d6, lt
+; CHECK-GI-NEXT: fcsel d4, d2, d6, mi
; CHECK-GI-NEXT: mov v1.d[0], x8
; CHECK-GI-NEXT: fmov x8, d5
; CHECK-GI-NEXT: mov v2.d[0], x9
@@ -621,7 +621,7 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double>
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
-; CHECK-SD-NEXT: cset w8, lt
+; CHECK-SD-NEXT: cset w8, mi
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
@@ -630,7 +630,7 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double>
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr x30, [sp, #80] // 8-byte Folded Reload
-; CHECK-SD-NEXT: cset w8, lt
+; CHECK-SD-NEXT: cset w8, mi
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: mov v0.d[1], v1.d[0]
@@ -652,11 +652,11 @@ define <2 x double> @v2f128_double(<2 x fp128> %a, <2 x fp128> %b, <2 x double>
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w19, lt
+; CHECK-GI-NEXT: cset w19, mi
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: fmov d0, x19
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cset w8, mi
; CHECK-GI-NEXT: ldp q2, q1, [sp, #32] // 32-byte Folded Reload
; CHECK-GI-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.d[1], x8
@@ -696,7 +696,7 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
-; CHECK-SD-NEXT: cset w8, lt
+; CHECK-SD-NEXT: cset w8, mi
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
; CHECK-SD-NEXT: fmov d0, x8
; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
@@ -704,7 +704,7 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-SD-NEXT: bl __lttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-SD-NEXT: cset w8, lt
+; CHECK-SD-NEXT: cset w8, mi
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
; CHECK-SD-NEXT: fmov d1, x8
; CHECK-SD-NEXT: mov v1.d[1], v0.d[0]
@@ -714,7 +714,7 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-SD-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldp q2, q4, [sp, #64] // 32-byte Folded Reload
-; CHECK-SD-NEXT: cset w8, lt
+; CHECK-SD-NEXT: cset w8, mi
; CHECK-SD-NEXT: sbfx x8, x8, #0, #1
; CHECK-SD-NEXT: ldr q3, [sp, #96] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr x30, [sp, #144] // 8-byte Folded Reload
@@ -755,11 +755,11 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w21, lt
+; CHECK-GI-NEXT: cset w21, mi
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w22, lt
+; CHECK-GI-NEXT: cset w22, mi
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: ldp q0, q2, [sp, #64] // 32-byte Folded Reload
; CHECK-GI-NEXT: sbfx x8, x21, #0, #1
@@ -770,7 +770,7 @@ define <3 x double> @v3f128_double(<3 x fp128> %a, <3 x fp128> %b, <3 x double>
; CHECK-GI-NEXT: ldr x30, [sp, #128] // 8-byte Folded Reload
; CHECK-GI-NEXT: mov v2.d[1], v0.d[0]
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cset w8, mi
; CHECK-GI-NEXT: mov v3.d[1], v4.d[0]
; CHECK-GI-NEXT: sbfx x8, x8, #0, #1
; CHECK-GI-NEXT: mov v1.d[1], x9
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
index f2a14a9b73fa1..919585a9826b9 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
@@ -2440,7 +2440,7 @@ define i32 @fcmp_olt_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __lttf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmp.f128(fp128 %a, fp128 %b, metadata !"olt", metadata !"fpexcept.strict") #0
@@ -2488,7 +2488,7 @@ define i32 @fcmp_oge_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: cset w0, pl
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmp.f128(fp128 %a, fp128 %b, metadata !"oge", metadata !"fpexcept.strict") #0
@@ -2544,7 +2544,7 @@ define i32 @fcmp_ult_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmp.f128(fp128 %a, fp128 %b, metadata !"ult", metadata !"fpexcept.strict") #0
@@ -2592,7 +2592,7 @@ define i32 @fcmp_uge_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __lttf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: cset w0, pl
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmp.f128(fp128 %a, fp128 %b, metadata !"uge", metadata !"fpexcept.strict") #0
@@ -2648,7 +2648,7 @@ define i32 @fcmps_olt_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __lttf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmps.f128(fp128 %a, fp128 %b, metadata !"olt", metadata !"fpexcept.strict") #0
@@ -2696,7 +2696,7 @@ define i32 @fcmps_oge_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: cset w0, pl
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmps.f128(fp128 %a, fp128 %b, metadata !"oge", metadata !"fpexcept.strict") #0
@@ -2752,7 +2752,7 @@ define i32 @fcmps_ult_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __getf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, lt
+; CHECK-NEXT: cset w0, mi
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmps.f128(fp128 %a, fp128 %b, metadata !"ult", metadata !"fpexcept.strict") #0
@@ -2800,7 +2800,7 @@ define i32 @fcmps_uge_f128(fp128 %a, fp128 %b) #0 {
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl __lttf2
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cset w0, ge
+; CHECK-NEXT: cset w0, pl
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
%cmp = call i1 @llvm.experimental.constrained.fcmps.f128(fp128 %a, fp128 %b, metadata !"uge", metadata !"fpexcept.strict") #0
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat.ll b/llvm/test/CodeGen/AArch64/fpclamptosat.ll
index 06dc11d413fae..00de1530fb72c 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat.ll
@@ -903,7 +903,7 @@ define i64 @ustest_f64i64_mm(double %x) {
; CHECK-NEXT: csinc x8, x1, xzr, lt
; CHECK-NEXT: csel x9, x0, xzr, lt
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x0, xzr, x9, lt
+; CHECK-NEXT: csel x0, xzr, x9, mi
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -956,7 +956,7 @@ define i64 @ustest_f32i64_mm(float %x) {
; CHECK-NEXT: csinc x8, x1, xzr, lt
; CHECK-NEXT: csel x9, x0, xzr, lt
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x0, xzr, x9, lt
+; CHECK-NEXT: csel x0, xzr, x9, mi
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
@@ -1015,7 +1015,7 @@ define i64 @ustest_f16i64_mm(half %x) {
; CHECK-NEXT: csinc x8, x1, xzr, lt
; CHECK-NEXT: csel x9, x0, xzr, lt
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel x0, xzr, x9, lt
+; CHECK-NEXT: csel x0, xzr, x9, mi
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
index 9157bcba59e9b..83ea72c865283 100644
--- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll
@@ -1009,9 +1009,9 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) {
; CHECK-NEXT: csel x11, x19, xzr, lt
; CHECK-NEXT: cmp x10, #0
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: csel x10, xzr, x11, lt
+; CHECK-NEXT: csel x10, xzr, x11, mi
; CHECK-NEXT: cmp x9, #0
-; CHECK-NEXT: csel x8, xzr, x8, lt
+; CHECK-NEXT: csel x8, xzr, x8, mi
; CHECK-NEXT: fmov d0, x10
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v0.d[1], v1.d[0]
@@ -1104,9 +1104,9 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) {
; CHECK-NEXT: csel x11, x19, xzr, lt
; CHECK-NEXT: cmp x10, #0
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: csel x10, xzr, x11, lt
+; CHECK-NEXT: csel x10, xzr, x11, mi
; CHECK-NEXT: cmp x9, #0
-; CHECK-NEXT: csel x8, xzr, x8, lt
+; CHECK-NEXT: csel x8, xzr, x8, mi
; CHECK-NEXT: fmov d0, x10
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v0.d[1], v1.d[0]
@@ -1215,9 +1215,9 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) {
; CHECK-NEXT: csel x11, x19, xzr, lt
; CHECK-NEXT: cmp x10, #0
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: csel x10, xzr, x11, lt
+; CHECK-NEXT: csel x10, xzr, x11, mi
; CHECK-NEXT: cmp x9, #0
-; CHECK-NEXT: csel x8, xzr, x8, lt
+; CHECK-NEXT: csel x8, xzr, x8, mi
; CHECK-NEXT: fmov d0, x10
; CHECK-NEXT: fmov d1, x8
; CHECK-NEXT: mov v0.d[1], v1.d[0]
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
index 39e2db3a52d2c..e3aef487890f9 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-scalar.ll
@@ -24,7 +24,7 @@ define i1 @test_signed_i1_f32(float %f) nounwind {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fcvtzs w8, s0
; CHECK-SD-NEXT: ands w8, w8, w8, asr #31
-; CHECK-SD-NEXT: csinv w8, w8, wzr, ge
+; CHECK-SD-NEXT: csinv w8, w8, wzr, pl
; CHECK-SD-NEXT: and w0, w8, #0x1
; CHECK-SD-NEXT: ret
;
@@ -32,9 +32,9 @@ define i1 @test_signed_i1_f32(float %f) nounwind {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fcvtzs w8, s0
; CHECK-GI-NEXT: cmp w8, #0
-; CHECK-GI-NEXT: csel w8, w8, wzr, lt
+; CHECK-GI-NEXT: csel w8, w8, wzr, mi
; CHECK-GI-NEXT: cmp w8, #0
-; CHECK-GI-NEXT: csinv w8, w8, wzr, ge
+; CHECK-GI-NEXT: csinv w8, w8, wzr, pl
; CHECK-GI-NEXT: and w0, w8, #0x1
; CHECK-GI-NEXT: ret
%x = call i1 @llvm.fptosi.sat.i1.f32(float %f)
@@ -269,7 +269,7 @@ define i1 @test_signed_i1_f64(double %f) nounwind {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: fcvtzs w8, d0
; CHECK-SD-NEXT: ands w8, w8, w8, asr #31
-; CHECK-SD-NEXT: csinv w8, w8, wzr, ge
+; CHECK-SD-NEXT: csinv w8, w8, wzr, pl
; CHECK-SD-NEXT: and w0, w8, #0x1
; CHECK-SD-NEXT: ret
;
@@ -277,9 +277,9 @@ define i1 @test_signed_i1_f64(double %f) nounwind {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fcvtzs w8, d0
; CHECK-GI-NEXT: cmp w8, #0
-; CHECK-GI-NEXT: csel w8, w8, wzr, lt
+; CHECK-GI-NEXT: csel w8, w8, wzr, mi
; CHECK-GI-NEXT: cmp w8, #0
-; CHECK-GI-NEXT: csinv w8, w8, wzr, ge
+; CHECK-GI-NEXT: csinv w8, w8, wzr, pl
; CHECK-GI-NEXT: and w0, w8, #0x1
; CHECK-GI-NEXT: ret
%x = call i1 @llvm.fptosi.sat.i1.f64(double %f)
@@ -519,7 +519,7 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
; CHECK-SD-CVT-NEXT: fcvt s0, h0
; CHECK-SD-CVT-NEXT: fcvtzs w8, s0
; CHECK-SD-CVT-NEXT: ands w8, w8, w8, asr #31
-; CHECK-SD-CVT-NEXT: csinv w8, w8, wzr, ge
+; CHECK-SD-CVT-NEXT: csinv w8, w8, wzr, pl
; CHECK-SD-CVT-NEXT: and w0, w8, #0x1
; CHECK-SD-CVT-NEXT: ret
;
@@ -527,7 +527,7 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w8, h0
; CHECK-SD-FP16-NEXT: ands w8, w8, w8, asr #31
-; CHECK-SD-FP16-NEXT: csinv w8, w8, wzr, ge
+; CHECK-SD-FP16-NEXT: csinv w8, w8, wzr, pl
; CHECK-SD-FP16-NEXT: and w0, w8, #0x1
; CHECK-SD-FP16-NEXT: ret
;
@@ -536,9 +536,9 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
; CHECK-GI-CVT-NEXT: fcvt s0, h0
; CHECK-GI-CVT-NEXT: fcvtzs w8, s0
; CHECK-GI-CVT-NEXT: cmp w8, #0
-; CHECK-GI-CVT-NEXT: csel w8, w8, wzr, lt
+; CHECK-GI-CVT-NEXT: csel w8, w8, wzr, mi
; CHECK-GI-CVT-NEXT: cmp w8, #0
-; CHECK-GI-CVT-NEXT: csinv w8, w8, wzr, ge
+; CHECK-GI-CVT-NEXT: csinv w8, w8, wzr, pl
; CHECK-GI-CVT-NEXT: and w0, w8, #0x1
; CHECK-GI-CVT-NEXT: ret
;
@@ -546,9 +546,9 @@ define i1 @test_signed_i1_f16(half %f) nounwind {
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: fcvtzs w8, h0
; CHECK-GI-FP16-NEXT: cmp w8, #0
-; CHECK-GI-FP16-NEXT: csel w8, w8, wzr, lt
+; CHECK-GI-FP16-NEXT: csel w8, w8, wzr, mi
; CHECK-GI-FP16-NEXT: cmp w8, #0
-; CHECK-GI-FP16-NEXT: csinv w8, w8, wzr, ge
+; CHECK-GI-FP16-NEXT: csinv w8, w8, wzr, pl
; CHECK-GI-FP16-NEXT: and w0, w8, #0x1
; CHECK-GI-FP16-NEXT: ret
%x = call i1 @llvm.fptosi.sat.i1.f16(half %f)
@@ -959,7 +959,7 @@ define i32 @test_signed_f128_i32(fp128 %f) {
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-SD-NEXT: csel w19, w8, w0, lt
+; CHECK-SD-NEXT: csel w19, w8, w0, mi
; CHECK-SD-NEXT: adrp x8, .LCPI30_1
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI30_1]
; CHECK-SD-NEXT: bl __gttf2
@@ -1001,11 +1001,11 @@ define i32 @test_signed_f128_i32(fp128 %f) {
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI30_0]
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov x8, #281474976448512 // =0xfffffffc0000
; CHECK-GI-NEXT: movk x8, #16413, lsl #48
-; CHECK-GI-NEXT: csel x8, x20, x8, lt
+; CHECK-GI-NEXT: csel x8, x20, x8, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index 9c21d2bf083a2..77dd6c6425207 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -499,7 +499,7 @@ define <1 x i32> @test_signed_v1f128_v1i32(<1 x fp128> %f) {
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: mov w8, #-2147483648 // =0x80000000
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-SD-NEXT: csel w19, w8, w0, lt
+; CHECK-SD-NEXT: csel w19, w8, w0, mi
; CHECK-SD-NEXT: adrp x8, .LCPI14_1
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI14_1]
; CHECK-SD-NEXT: bl __gttf2
@@ -542,11 +542,11 @@ define <1 x i32> @test_signed_v1f128_v1i32(<1 x fp128> %f) {
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov x8, #281474976448512 // =0xfffffffc0000
; CHECK-GI-NEXT: movk x8, #16413, lsl #48
-; CHECK-GI-NEXT: csel x8, x20, x8, lt
+; CHECK-GI-NEXT: csel x8, x20, x8, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -592,7 +592,7 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI15_1]
; CHECK-SD-NEXT: mov w20, #-2147483648 // =0x80000000
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -612,7 +612,7 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
@@ -664,10 +664,10 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x22, #281474976448512 // =0xfffffffc0000
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: movk x22, #16413, lsl #48
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x21, x22, lt
+; CHECK-GI-NEXT: csel x8, x21, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -690,9 +690,9 @@ define <2 x i32> @test_signed_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x20
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x22, lt
+; CHECK-GI-NEXT: csel x8, x20, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -742,7 +742,7 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI16_1]
; CHECK-SD-NEXT: mov w20, #-2147483648 // =0x80000000
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -762,7 +762,7 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
@@ -780,7 +780,7 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
@@ -833,10 +833,10 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x22, #281474976448512 // =0xfffffffc0000
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: movk x22, #16413, lsl #48
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x21, x22, lt
+; CHECK-GI-NEXT: csel x8, x21, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -858,9 +858,9 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x23
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x23, x22, lt
+; CHECK-GI-NEXT: csel x8, x23, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -883,9 +883,9 @@ define <3 x i32> @test_signed_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x20
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x22, lt
+; CHECK-GI-NEXT: csel x8, x20, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
@@ -936,7 +936,7 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI17_1]
; CHECK-SD-NEXT: mov w20, #-2147483648 // =0x80000000
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -955,7 +955,7 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
@@ -976,7 +976,7 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
@@ -996,7 +996,7 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, w20, w0, lt
+; CHECK-SD-NEXT: csel w19, w20, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
@@ -1049,10 +1049,10 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x22, #281474976448512 // =0xfffffffc0000
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: movk x22, #16413, lsl #48
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x21, x22, lt
+; CHECK-GI-NEXT: csel x8, x21, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -1075,9 +1075,9 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x23
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x23, x22, lt
+; CHECK-GI-NEXT: csel x8, x23, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -1099,9 +1099,9 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x24
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x24, x22, lt
+; CHECK-GI-NEXT: csel x8, x24, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -1123,9 +1123,9 @@ define <4 x i32> @test_signed_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x20
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x22, lt
+; CHECK-GI-NEXT: csel x8, x20, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfsi
; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
@@ -2359,9 +2359,9 @@ define <2 x i1> @test_signed_v2f64_v2i1(<2 x double> %f) {
; CHECK-SD-NEXT: fcvtzs w9, d0
; CHECK-SD-NEXT: fcvtzs w8, d1
; CHECK-SD-NEXT: ands w8, w8, w8, asr #31
-; CHECK-SD-NEXT: csinv w8, w8, wzr, ge
+; CHECK-SD-NEXT: csinv w8, w8, wzr, pl
; CHECK-SD-NEXT: ands w9, w9, w9, asr #31
-; CHECK-SD-NEXT: csinv w9, w9, wzr, ge
+; CHECK-SD-NEXT: csinv w9, w9, wzr, pl
; CHECK-SD-NEXT: fmov s0, w9
; CHECK-SD-NEXT: mov v0.s[1], w8
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
@@ -5425,7 +5425,7 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI86_1]
; CHECK-SD-NEXT: mov x20, #-9223372036854775808 // =0x8000000000000000
-; CHECK-SD-NEXT: csel x19, x20, x0, lt
+; CHECK-SD-NEXT: csel x19, x20, x0, mi
; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -5447,7 +5447,7 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel x19, x20, x0, lt
+; CHECK-SD-NEXT: csel x19, x20, x0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w0, #0
@@ -5500,9 +5500,9 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-GI-NEXT: mov x22, #-1125899906842624 // =0xfffc000000000000
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x23, #4629137466983448575 // =0x403dffffffffffff
-; CHECK-GI-NEXT: csel x8, x19, x22, lt
+; CHECK-GI-NEXT: csel x8, x19, x22, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x21, x23, lt
+; CHECK-GI-NEXT: csel x8, x21, x23, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfdi
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -5525,9 +5525,9 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x20
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, x22, lt
+; CHECK-GI-NEXT: csel x8, x19, x22, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x23, lt
+; CHECK-GI-NEXT: csel x8, x20, x23, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixtfdi
; CHECK-GI-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
index 46950e7a60349..07e49e331415e 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-scalar.ll
@@ -777,7 +777,7 @@ define i32 @test_unsigned_f128_i32(fp128 %f) {
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI30_1]
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: csinv w0, w19, wzr, le
@@ -811,11 +811,11 @@ define i32 @test_unsigned_f128_i32(fp128 %f) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov x8, #281474976579584 // =0xfffffffe0000
; CHECK-GI-NEXT: movk x8, #16414, lsl #48
-; CHECK-GI-NEXT: csel x8, x20, x8, lt
+; CHECK-GI-NEXT: csel x8, x20, x8, mi
; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: add sp, sp, #48
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 44847a41287d6..1b3a8a3b70e13 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -463,7 +463,7 @@ define <1 x i32> @test_unsigned_v1f128_v1i32(<1 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI14_1]
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: csinv w8, w19, wzr, le
@@ -497,11 +497,11 @@ define <1 x i32> @test_unsigned_v1f128_v1i32(<1 x fp128> %f) {
; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
; CHECK-GI-NEXT: mov x8, #281474976579584 // =0xfffffffe0000
; CHECK-GI-NEXT: movk x8, #16414, lsl #48
-; CHECK-GI-NEXT: csel x8, x20, x8, lt
+; CHECK-GI-NEXT: csel x8, x20, x8, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
@@ -537,7 +537,7 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI15_1]
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
@@ -551,7 +551,7 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload
@@ -595,10 +595,10 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x21, #281474976579584 // =0xfffffffe0000
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: movk x21, #16414, lsl #48
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x21, lt
+; CHECK-GI-NEXT: csel x8, x20, x21, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload
@@ -615,9 +615,9 @@ define <2 x i32> @test_unsigned_v2f128_v2i32(<2 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x22
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x20, xzr, lt
+; CHECK-GI-NEXT: csel x8, x20, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x22, x21, lt
+; CHECK-GI-NEXT: csel x8, x22, x21, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: fmov s0, w19
@@ -657,7 +657,7 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI16_1]
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
@@ -670,7 +670,7 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
@@ -683,7 +683,7 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: csinv w8, w19, wzr, le
@@ -727,10 +727,10 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x21, #281474976579584 // =0xfffffffe0000
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: movk x21, #16414, lsl #48
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x21, lt
+; CHECK-GI-NEXT: csel x8, x20, x21, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -747,9 +747,9 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x22
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x20, xzr, lt
+; CHECK-GI-NEXT: csel x8, x20, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x22, x21, lt
+; CHECK-GI-NEXT: csel x8, x22, x21, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload
@@ -766,9 +766,9 @@ define <3 x i32> @test_unsigned_v3f128_v3i32(<3 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x23
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x22, xzr, lt
+; CHECK-GI-NEXT: csel x8, x22, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x23, x21, lt
+; CHECK-GI-NEXT: csel x8, x23, x21, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: fmov s0, w19
@@ -809,7 +809,7 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI17_1]
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: str q1, [sp, #48] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -823,7 +823,7 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload
@@ -838,7 +838,7 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: bl __fixunstfsi
; CHECK-SD-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -853,7 +853,7 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel w19, wzr, w0, lt
+; CHECK-SD-NEXT: csel w19, wzr, w0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -900,10 +900,10 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x22, #281474976579584 // =0xfffffffe0000
-; CHECK-GI-NEXT: csel x8, x19, xzr, lt
+; CHECK-GI-NEXT: csel x8, x19, xzr, mi
; CHECK-GI-NEXT: movk x22, #16414, lsl #48
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x22, lt
+; CHECK-GI-NEXT: csel x8, x20, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: ldr q0, [sp] // 16-byte Folded Reload
@@ -921,9 +921,9 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x21
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x20, xzr, lt
+; CHECK-GI-NEXT: csel x8, x20, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x21, x22, lt
+; CHECK-GI-NEXT: csel x8, x21, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
@@ -941,9 +941,9 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x23
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x21, xzr, lt
+; CHECK-GI-NEXT: csel x8, x21, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x23, x22, lt
+; CHECK-GI-NEXT: csel x8, x23, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -960,9 +960,9 @@ define <4 x i32> @test_unsigned_v4f128_v4i32(<4 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x24
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x23, xzr, lt
+; CHECK-GI-NEXT: csel x8, x23, xzr, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x24, x22, lt
+; CHECK-GI-NEXT: csel x8, x24, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfsi
; CHECK-GI-NEXT: fmov s0, w19
@@ -4419,7 +4419,7 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
; CHECK-SD-NEXT: ldr q1, [x8, :lo12:.LCPI86_1]
-; CHECK-SD-NEXT: csel x19, xzr, x0, lt
+; CHECK-SD-NEXT: csel x19, xzr, x0, mi
; CHECK-SD-NEXT: str q1, [sp] // 16-byte Folded Spill
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
@@ -4435,7 +4435,7 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-SD-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
; CHECK-SD-NEXT: cmp w19, #0
-; CHECK-SD-NEXT: csel x19, xzr, x0, lt
+; CHECK-SD-NEXT: csel x19, xzr, x0, mi
; CHECK-SD-NEXT: bl __gttf2
; CHECK-SD-NEXT: cmp w0, #0
; CHECK-SD-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload
@@ -4480,9 +4480,9 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-GI-NEXT: mov x21, #-562949953421312 // =0xfffe000000000000
; CHECK-GI-NEXT: cmp w0, #0
; CHECK-GI-NEXT: mov x22, #4629418941960159231 // =0x403effffffffffff
-; CHECK-GI-NEXT: csel x8, x19, x21, lt
+; CHECK-GI-NEXT: csel x8, x19, x21, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x20, x22, lt
+; CHECK-GI-NEXT: csel x8, x20, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfdi
; CHECK-GI-NEXT: ldp q1, q0, [sp, #16] // 32-byte Folded Reload
@@ -4499,9 +4499,9 @@ define <2 x i64> @test_signed_v2f128_v2i64(<2 x fp128> %f) {
; CHECK-GI-NEXT: mov v0.d[1], x23
; CHECK-GI-NEXT: bl __lttf2
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x8, x20, x21, lt
+; CHECK-GI-NEXT: csel x8, x20, x21, mi
; CHECK-GI-NEXT: mov v0.d[0], x8
-; CHECK-GI-NEXT: csel x8, x23, x22, lt
+; CHECK-GI-NEXT: csel x8, x23, x22, mi
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: bl __fixunstfdi
; CHECK-GI-NEXT: fmov d0, x19
diff --git a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
index 81f13b8e7439a..3b43b4209b08f 100644
--- a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -256,7 +256,7 @@ define void @flag_setting() {
; CHECK-SD-NEXT: b.gt .LBB2_4
; CHECK-SD-NEXT: // %bb.1: // %test2
; CHECK-SD-NEXT: tst x9, x10, lsl #63
-; CHECK-SD-NEXT: b.lt .LBB2_4
+; CHECK-SD-NEXT: b.mi .LBB2_4
; CHECK-SD-NEXT: // %bb.2: // %test3
; CHECK-SD-NEXT: tst x9, x10, asr #12
; CHECK-SD-NEXT: b.gt .LBB2_4
@@ -277,7 +277,7 @@ define void @flag_setting() {
; CHECK-GI-NEXT: b.gt .LBB2_4
; CHECK-GI-NEXT: // %bb.1: // %test2
; CHECK-GI-NEXT: tst x9, x10, lsl #63
-; CHECK-GI-NEXT: b.lt .LBB2_4
+; CHECK-GI-NEXT: b.mi .LBB2_4
; CHECK-GI-NEXT: // %bb.2: // %test3
; CHECK-GI-NEXT: asr x10, x10, #12
; CHECK-GI-NEXT: tst x10, x9
diff --git a/llvm/test/CodeGen/AArch64/min-max-combine.ll b/llvm/test/CodeGen/AArch64/min-max-combine.ll
index 5111f838b73aa..558d4b8b499ba 100644
--- a/llvm/test/CodeGen/AArch64/min-max-combine.ll
+++ b/llvm/test/CodeGen/AArch64/min-max-combine.ll
@@ -123,7 +123,7 @@ define i8 @smini8_zero(i8 %a) {
; CHECK-GLOBAL: // %bb.0:
; CHECK-GLOBAL-NEXT: sxtb w8, w0
; CHECK-GLOBAL-NEXT: cmp w8, #0
-; CHECK-GLOBAL-NEXT: csel w0, w0, wzr, lt
+; CHECK-GLOBAL-NEXT: csel w0, w0, wzr, mi
; CHECK-GLOBAL-NEXT: ret
%c = call i8 @llvm.smin.i8(i8 %a, i8 0)
ret i8 %c
@@ -148,7 +148,7 @@ define i16 @smini16_zero(i16 %a) {
; CHECK-GLOBAL: // %bb.0:
; CHECK-GLOBAL-NEXT: sxth w8, w0
; CHECK-GLOBAL-NEXT: cmp w8, #0
-; CHECK-GLOBAL-NEXT: csel w0, w0, wzr, lt
+; CHECK-GLOBAL-NEXT: csel w0, w0, wzr, mi
; CHECK-GLOBAL-NEXT: ret
%c = call i16 @llvm.smin.i16(i16 %a, i16 0)
ret i16 %c
@@ -170,7 +170,7 @@ define i32 @smini32_zero(i32 %a) {
; CHECK-GLOBAL-LABEL: smini32_zero:
; CHECK-GLOBAL: // %bb.0:
; CHECK-GLOBAL-NEXT: cmp w0, #0
-; CHECK-GLOBAL-NEXT: csel w0, w0, wzr, lt
+; CHECK-GLOBAL-NEXT: csel w0, w0, wzr, mi
; CHECK-GLOBAL-NEXT: ret
%c = call i32 @llvm.smin.i32(i32 %a, i32 0)
ret i32 %c
@@ -192,7 +192,7 @@ define i64 @smini64_zero(i64 %a) {
; CHECK-GLOBAL-LABEL: smini64_zero:
; CHECK-GLOBAL: // %bb.0:
; CHECK-GLOBAL-NEXT: cmp x0, #0
-; CHECK-GLOBAL-NEXT: csel x0, x0, xzr, lt
+; CHECK-GLOBAL-NEXT: csel x0, x0, xzr, mi
; CHECK-GLOBAL-NEXT: ret
%c = call i64 @llvm.smin.i64(i64 %a, i64 0)
ret i64 %c
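(Aside for reviewers, not part of the patch: the smini*_zero expectations above switch from lt to mi because smin(a, 0) keeps a exactly when a is negative, and after a compare against zero the negative flag alone encodes that. A standalone C++ sketch of the scalar semantics these CHECK lines exercise follows; the helper name smin_zero is illustrative only.)
#include <cassert>
#include <cstdint>
// Scalar reference for the smin-with-zero pattern covered above:
// "cmp w0, #0; csel w0, w0, wzr, mi" keeps a when it is negative, else 0.
static int32_t smin_zero(int32_t a) { return a < 0 ? a : 0; }
int main() {
  assert(smin_zero(-7) == -7);
  assert(smin_zero(INT32_MIN) == INT32_MIN);
  assert(smin_zero(0) == 0);
  assert(smin_zero(42) == 0);
  return 0;
}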
diff --git a/llvm/test/CodeGen/AArch64/pr72777.ll b/llvm/test/CodeGen/AArch64/pr72777.ll
index e9021d605f1fe..fa9f82f8c93c2 100644
--- a/llvm/test/CodeGen/AArch64/pr72777.ll
+++ b/llvm/test/CodeGen/AArch64/pr72777.ll
@@ -4,15 +4,14 @@
define i64 @f(i64 %0, i64 %1) {
; CHECK-LABEL: f:
; CHECK: // %bb.0:
-; CHECK-NEXT: orr x9, x1, #0x1
-; CHECK-NEXT: add x10, x0, x0
-; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
-; CHECK-NEXT: add x9, x9, x10
-; CHECK-NEXT: lsl x10, x9, #1
-; CHECK-NEXT: cmp x9, #0
-; CHECK-NEXT: cinv x8, x8, ge
-; CHECK-NEXT: cmp x9, x10, asr #1
-; CHECK-NEXT: csel x0, x8, x10, ne
+; CHECK-NEXT: orr x8, x1, #0x1
+; CHECK-NEXT: add x9, x0, x0
+; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000
+; CHECK-NEXT: adds x8, x8, x9
+; CHECK-NEXT: lsl x9, x8, #1
+; CHECK-NEXT: cinv x10, x10, pl
+; CHECK-NEXT: cmp x8, x9, asr #1
+; CHECK-NEXT: csel x0, x10, x9, ne
; CHECK-NEXT: ret
%3 = or i64 1, %1
%4 = add i64 %3, %0
diff --git a/llvm/test/CodeGen/AArch64/sdivpow2.ll b/llvm/test/CodeGen/AArch64/sdivpow2.ll
index 2551be8555ce6..bb18ceb3fe69c 100644
--- a/llvm/test/CodeGen/AArch64/sdivpow2.ll
+++ b/llvm/test/CodeGen/AArch64/sdivpow2.ll
@@ -3,86 +3,143 @@
; RUN: llc -mtriple=aarch64-linux-gnu -fast-isel=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,FAST
define i32 @test1(i32 %x) {
-; CHECK-LABEL: test1:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, #7
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: csel w8, w8, w0, lt
-; CHECK-NEXT: asr w0, w8, #3
-; CHECK-NEXT: ret
+; ISEL-LABEL: test1:
+; ISEL: // %bb.0:
+; ISEL-NEXT: add w8, w0, #7
+; ISEL-NEXT: cmp w0, #0
+; ISEL-NEXT: csel w8, w8, w0, mi
+; ISEL-NEXT: asr w0, w8, #3
+; ISEL-NEXT: ret
+;
+; FAST-LABEL: test1:
+; FAST: // %bb.0:
+; FAST-NEXT: add w8, w0, #7
+; FAST-NEXT: cmp w0, #0
+; FAST-NEXT: csel w8, w8, w0, lt
+; FAST-NEXT: asr w0, w8, #3
+; FAST-NEXT: ret
%div = sdiv i32 %x, 8
ret i32 %div
}
define i32 @test2(i32 %x) {
-; CHECK-LABEL: test2:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, #7
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: csel w8, w8, w0, lt
-; CHECK-NEXT: neg w0, w8, asr #3
-; CHECK-NEXT: ret
+; ISEL-LABEL: test2:
+; ISEL: // %bb.0:
+; ISEL-NEXT: add w8, w0, #7
+; ISEL-NEXT: cmp w0, #0
+; ISEL-NEXT: csel w8, w8, w0, mi
+; ISEL-NEXT: neg w0, w8, asr #3
+; ISEL-NEXT: ret
+;
+; FAST-LABEL: test2:
+; FAST: // %bb.0:
+; FAST-NEXT: add w8, w0, #7
+; FAST-NEXT: cmp w0, #0
+; FAST-NEXT: csel w8, w8, w0, lt
+; FAST-NEXT: neg w0, w8, asr #3
+; FAST-NEXT: ret
%div = sdiv i32 %x, -8
ret i32 %div
}
define i32 @test3(i32 %x) {
-; CHECK-LABEL: test3:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, #31
-; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: csel w8, w8, w0, lt
-; CHECK-NEXT: asr w0, w8, #5
-; CHECK-NEXT: ret
+; ISEL-LABEL: test3:
+; ISEL: // %bb.0:
+; ISEL-NEXT: add w8, w0, #31
+; ISEL-NEXT: cmp w0, #0
+; ISEL-NEXT: csel w8, w8, w0, mi
+; ISEL-NEXT: asr w0, w8, #5
+; ISEL-NEXT: ret
+;
+; FAST-LABEL: test3:
+; FAST: // %bb.0:
+; FAST-NEXT: add w8, w0, #31
+; FAST-NEXT: cmp w0, #0
+; FAST-NEXT: csel w8, w8, w0, lt
+; FAST-NEXT: asr w0, w8, #5
+; FAST-NEXT: ret
%div = sdiv i32 %x, 32
ret i32 %div
}
define i64 @test4(i64 %x) {
-; CHECK-LABEL: test4:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #7
-; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: csel x8, x8, x0, lt
-; CHECK-NEXT: asr x0, x8, #3
-; CHECK-NEXT: ret
+; ISEL-LABEL: test4:
+; ISEL: // %bb.0:
+; ISEL-NEXT: add x8, x0, #7
+; ISEL-NEXT: cmp x0, #0
+; ISEL-NEXT: csel x8, x8, x0, mi
+; ISEL-NEXT: asr x0, x8, #3
+; ISEL-NEXT: ret
+;
+; FAST-LABEL: test4:
+; FAST: // %bb.0:
+; FAST-NEXT: add x8, x0, #7
+; FAST-NEXT: cmp x0, #0
+; FAST-NEXT: csel x8, x8, x0, lt
+; FAST-NEXT: asr x0, x8, #3
+; FAST-NEXT: ret
%div = sdiv i64 %x, 8
ret i64 %div
}
define i64 @test5(i64 %x) {
-; CHECK-LABEL: test5:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #7
-; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: csel x8, x8, x0, lt
-; CHECK-NEXT: neg x0, x8, asr #3
-; CHECK-NEXT: ret
+; ISEL-LABEL: test5:
+; ISEL: // %bb.0:
+; ISEL-NEXT: add x8, x0, #7
+; ISEL-NEXT: cmp x0, #0
+; ISEL-NEXT: csel x8, x8, x0, mi
+; ISEL-NEXT: neg x0, x8, asr #3
+; ISEL-NEXT: ret
+;
+; FAST-LABEL: test5:
+; FAST: // %bb.0:
+; FAST-NEXT: add x8, x0, #7
+; FAST-NEXT: cmp x0, #0
+; FAST-NEXT: csel x8, x8, x0, lt
+; FAST-NEXT: neg x0, x8, asr #3
+; FAST-NEXT: ret
%div = sdiv i64 %x, -8
ret i64 %div
}
define i64 @test6(i64 %x) {
-; CHECK-LABEL: test6:
-; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, #63
-; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: csel x8, x8, x0, lt
-; CHECK-NEXT: asr x0, x8, #6
-; CHECK-NEXT: ret
+; ISEL-LABEL: test6:
+; ISEL: // %bb.0:
+; ISEL-NEXT: add x8, x0, #63
+; ISEL-NEXT: cmp x0, #0
+; ISEL-NEXT: csel x8, x8, x0, mi
+; ISEL-NEXT: asr x0, x8, #6
+; ISEL-NEXT: ret
+;
+; FAST-LABEL: test6:
+; FAST: // %bb.0:
+; FAST-NEXT: add x8, x0, #63
+; FAST-NEXT: cmp x0, #0
+; FAST-NEXT: csel x8, x8, x0, lt
+; FAST-NEXT: asr x0, x8, #6
+; FAST-NEXT: ret
%div = sdiv i64 %x, 64
ret i64 %div
}
define i64 @test7(i64 %x) {
-; CHECK-LABEL: test7:
-; CHECK: // %bb.0:
-; CHECK-NEXT: mov x8, #281474976710655 // =0xffffffffffff
-; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: add x8, x0, x8
-; CHECK-NEXT: csel x8, x8, x0, lt
-; CHECK-NEXT: asr x0, x8, #48
-; CHECK-NEXT: ret
+; ISEL-LABEL: test7:
+; ISEL: // %bb.0:
+; ISEL-NEXT: mov x8, #281474976710655 // =0xffffffffffff
+; ISEL-NEXT: cmp x0, #0
+; ISEL-NEXT: add x8, x0, x8
+; ISEL-NEXT: csel x8, x8, x0, mi
+; ISEL-NEXT: asr x0, x8, #48
+; ISEL-NEXT: ret
+;
+; FAST-LABEL: test7:
+; FAST: // %bb.0:
+; FAST-NEXT: mov x8, #281474976710655 // =0xffffffffffff
+; FAST-NEXT: cmp x0, #0
+; FAST-NEXT: add x8, x0, x8
+; FAST-NEXT: csel x8, x8, x0, lt
+; FAST-NEXT: asr x0, x8, #48
+; FAST-NEXT: ret
%div = sdiv i64 %x, 281474976710656
ret i64 %div
}
@@ -132,3 +189,5 @@ define i32 @sdiv_int(i32 %begin, i32 %first) #0 {
}
attributes #0 = { "target-features"="+sve" vscale_range(2,2) }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
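(Aside, not part of the patch: the sdivpow2 expectations above pair "cmp x, #0" with mi instead of lt. Subtracting zero can neither borrow nor signed-overflow, so N is just the sign bit and V is clear, which means LT (N != V) collapses to MI (N set) and GE collapses to PL. A standalone C++ sketch checking that equivalence follows; the Flags struct and cmpZero helper are illustrative models, not LLVM APIs.)
#include <cassert>
#include <cstdint>
// Model of the NZCV flags produced by "cmp x, #0" (i.e. SUBS xzr, x, #0).
// Illustrative only; not an LLVM data structure.
struct Flags { bool N, Z, C, V; };
static Flags cmpZero(int64_t x) {
  // x - 0 == x: N is the sign bit, Z tests for zero, no borrow (C = 1),
  // no signed overflow (V = 0).
  return {x < 0, x == 0, true, false};
}
int main() {
  const int64_t vals[] = {INT64_MIN, -8, -1, 0, 1, 7, INT64_MAX};
  for (int64_t x : vals) {
    Flags f = cmpZero(x);
    bool lt = f.N != f.V; // AArch64 LT condition: N != V
    bool mi = f.N;        // AArch64 MI condition: N set
    assert(lt == mi);     // identical when the RHS is zero
    assert(!lt == !f.N);  // GE (N == V) likewise matches PL (N clear)
  }
  return 0;
}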
diff --git a/llvm/test/CodeGen/AArch64/select-constant-xor.ll b/llvm/test/CodeGen/AArch64/select-constant-xor.ll
index fe9a2c0fad830..97ad579a39f78 100644
--- a/llvm/test/CodeGen/AArch64/select-constant-xor.ll
+++ b/llvm/test/CodeGen/AArch64/select-constant-xor.ll
@@ -26,7 +26,7 @@ define i64 @selecti64i64(i64 %a) {
; CHECK-GI-NEXT: mov x8, #-2147483648 // =0xffffffff80000000
; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff
; CHECK-GI-NEXT: cmp x0, #0
-; CHECK-GI-NEXT: csel x0, x9, x8, ge
+; CHECK-GI-NEXT: csel x0, x9, x8, pl
; CHECK-GI-NEXT: ret
%c = icmp sgt i64 %a, -1
%s = select i1 %c, i64 2147483647, i64 -2147483648
@@ -44,7 +44,7 @@ define i32 @selecti64i32(i64 %a) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp x0, #0
; CHECK-GI-NEXT: mov w9, #-2147483648 // =0x80000000
-; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cset w8, pl
; CHECK-GI-NEXT: sbfx w8, w8, #0, #1
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
@@ -66,7 +66,7 @@ define i64 @selecti32i64(i32 %a) {
; CHECK-GI-NEXT: mov x8, #-2147483648 // =0xffffffff80000000
; CHECK-GI-NEXT: mov w9, #2147483647 // =0x7fffffff
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel x0, x9, x8, ge
+; CHECK-GI-NEXT: csel x0, x9, x8, pl
; CHECK-GI-NEXT: ret
%c = icmp sgt i32 %a, -1
%s = select i1 %c, i64 2147483647, i64 -2147483648
@@ -99,7 +99,7 @@ define i32 @selecti32i32(i32 %a) {
; CHECK-GI-NEXT: mov w8, #-85 // =0xffffffab
; CHECK-GI-NEXT: mov w9, #84 // =0x54
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w9, w8, ge
+; CHECK-GI-NEXT: csel w0, w9, w8, pl
; CHECK-GI-NEXT: ret
%c = icmp sgt i32 %a, -1
%s = select i1 %c, i32 84, i32 -85
@@ -118,7 +118,7 @@ define i8 @selecti32i8(i32 %a) {
; CHECK-GI-NEXT: mov w8, #84 // =0x54
; CHECK-GI-NEXT: mov w9, #-85 // =0xffffffab
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w8, w9, ge
+; CHECK-GI-NEXT: csel w0, w8, w9, pl
; CHECK-GI-NEXT: ret
%c = icmp sgt i32 %a, -1
%s = select i1 %c, i8 84, i8 -85
@@ -139,7 +139,7 @@ define i32 @selecti8i32(i8 %a) {
; CHECK-GI-NEXT: mov w9, #-85 // =0xffffffab
; CHECK-GI-NEXT: mov w10, #84 // =0x54
; CHECK-GI-NEXT: cmp w8, #0
-; CHECK-GI-NEXT: csel w0, w10, w9, ge
+; CHECK-GI-NEXT: csel w0, w10, w9, pl
; CHECK-GI-NEXT: ret
%c = icmp sgt i8 %a, -1
%s = select i1 %c, i32 84, i32 -85
@@ -150,7 +150,7 @@ define i32 @icmpasreq(i32 %input, i32 %a, i32 %b) {
; CHECK-SD-LABEL: icmpasreq:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: csel w0, w1, w2, lt
+; CHECK-SD-NEXT: csel w0, w1, w2, mi
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: icmpasreq:
@@ -193,7 +193,7 @@ define i32 @selecti32i32_0(i32 %a) {
; CHECK-GI-LABEL: selecti32i32_0:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cset w8, mi
; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
; CHECK-GI-NEXT: ret
%c = icmp sgt i32 %a, -1
@@ -211,7 +211,7 @@ define i32 @selecti32i32_m1(i32 %a) {
; CHECK-GI-LABEL: selecti32i32_m1:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cset w8, pl
; CHECK-GI-NEXT: sbfx w0, w8, #0, #1
; CHECK-GI-NEXT: ret
%c = icmp sgt i32 %a, -1
@@ -230,7 +230,7 @@ define i32 @selecti32i32_1(i32 %a) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #-2 // =0xfffffffe
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csinc w0, w8, wzr, lt
+; CHECK-GI-NEXT: csinc w0, w8, wzr, mi
; CHECK-GI-NEXT: ret
%c = icmp sgt i32 %a, -1
%s = select i1 %c, i32 1, i32 -2
@@ -249,7 +249,7 @@ define i32 @selecti32i32_sge(i32 %a) {
; CHECK-GI-NEXT: mov w8, #-13 // =0xfffffff3
; CHECK-GI-NEXT: mov w9, #12 // =0xc
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w9, w8, ge
+; CHECK-GI-NEXT: csel w0, w9, w8, pl
; CHECK-GI-NEXT: ret
%c = icmp sge i32 %a, 0
%s = select i1 %c, i32 12, i32 -13
@@ -268,7 +268,7 @@ define i32 @selecti32i32_slt(i32 %a) {
; CHECK-GI-NEXT: mov w8, #12 // =0xc
; CHECK-GI-NEXT: mov w9, #-13 // =0xfffffff3
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w9, w8, lt
+; CHECK-GI-NEXT: csel w0, w9, w8, mi
; CHECK-GI-NEXT: ret
%c = icmp slt i32 %a, 0
%s = select i1 %c, i32 -13, i32 12
@@ -287,7 +287,7 @@ define i32 @selecti32i32_sle(i32 %a) {
; CHECK-GI-NEXT: mov w8, #12 // =0xc
; CHECK-GI-NEXT: mov w9, #-13 // =0xfffffff3
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w9, w8, lt
+; CHECK-GI-NEXT: csel w0, w9, w8, mi
; CHECK-GI-NEXT: ret
%c = icmp sle i32 %a, -1
%s = select i1 %c, i32 -13, i32 12
@@ -306,7 +306,7 @@ define i32 @selecti32i32_sgt(i32 %a) {
; CHECK-GI-NEXT: mov w8, #12 // =0xc
; CHECK-GI-NEXT: mov w9, #-13 // =0xfffffff3
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w9, w8, lt
+; CHECK-GI-NEXT: csel w0, w9, w8, mi
; CHECK-GI-NEXT: ret
%c = icmp sle i32 %a, -1
%s = select i1 %c, i32 -13, i32 12
@@ -318,7 +318,7 @@ define i32 @oneusecmp(i32 %a, i32 %b, i32 %d) {
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: asr w8, w0, #31
; CHECK-SD-NEXT: cmp w0, #0
-; CHECK-SD-NEXT: csel w9, w2, w1, lt
+; CHECK-SD-NEXT: csel w9, w2, w1, mi
; CHECK-SD-NEXT: eor w8, w8, #0x7f
; CHECK-SD-NEXT: add w0, w8, w9
; CHECK-SD-NEXT: ret
@@ -328,8 +328,8 @@ define i32 @oneusecmp(i32 %a, i32 %b, i32 %d) {
; CHECK-GI-NEXT: mov w8, #127 // =0x7f
; CHECK-GI-NEXT: mov w9, #-128 // =0xffffff80
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w8, w9, w8, lt
-; CHECK-GI-NEXT: csel w9, w2, w1, lt
+; CHECK-GI-NEXT: csel w8, w9, w8, mi
+; CHECK-GI-NEXT: csel w9, w2, w1, mi
; CHECK-GI-NEXT: add w0, w8, w9
; CHECK-GI-NEXT: ret
%c = icmp sle i32 %a, -1
diff --git a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
index 0d4a636446164..293b74ecd9d3a 100644
--- a/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
+++ b/llvm/test/CodeGen/AArch64/selectcc-to-shiftand.ll
@@ -15,7 +15,7 @@ define i32 @neg_sel_constants(i32 %a) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #5 // =0x5
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w8, wzr, lt
+; CHECK-GI-NEXT: csel w0, w8, wzr, mi
; CHECK-GI-NEXT: ret
%tmp.1 = icmp slt i32 %a, 0
%retval = select i1 %tmp.1, i32 5, i32 0
@@ -34,7 +34,7 @@ define i32 @neg_sel_special_constant(i32 %a) {
; CHECK-GI-LABEL: neg_sel_special_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w8, lt
+; CHECK-GI-NEXT: cset w8, mi
; CHECK-GI-NEXT: lsl w0, w8, #9
; CHECK-GI-NEXT: ret
%tmp.1 = icmp slt i32 %a, 0
@@ -53,7 +53,7 @@ define i32 @neg_sel_variable_and_zero(i32 %a, i32 %b) {
; CHECK-GI-LABEL: neg_sel_variable_and_zero:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w1, wzr, lt
+; CHECK-GI-NEXT: csel w0, w1, wzr, mi
; CHECK-GI-NEXT: ret
%tmp.1 = icmp slt i32 %a, 0
%retval = select i1 %tmp.1, i32 %b, i32 0
@@ -93,7 +93,7 @@ define i32 @pos_sel_constants(i32 %a) {
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: mov w8, #5 // =0x5
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w8, wzr, ge
+; CHECK-GI-NEXT: csel w0, w8, wzr, pl
; CHECK-GI-NEXT: ret
%tmp.1 = icmp sgt i32 %a, -1
%retval = select i1 %tmp.1, i32 5, i32 0
@@ -112,7 +112,7 @@ define i32 @pos_sel_special_constant(i32 %a) {
; CHECK-GI-LABEL: pos_sel_special_constant:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: cset w8, ge
+; CHECK-GI-NEXT: cset w8, pl
; CHECK-GI-NEXT: lsl w0, w8, #9
; CHECK-GI-NEXT: ret
%tmp.1 = icmp sgt i32 %a, -1
@@ -131,7 +131,7 @@ define i32 @pos_sel_variable_and_zero(i32 %a, i32 %b) {
; CHECK-GI-LABEL: pos_sel_variable_and_zero:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: cmp w0, #0
-; CHECK-GI-NEXT: csel w0, w1, wzr, ge
+; CHECK-GI-NEXT: csel w0, w1, wzr, pl
; CHECK-GI-NEXT: ret
%tmp.1 = icmp sgt i32 %a, -1
%retval = select i1 %tmp.1, i32 %b, i32 0
diff --git a/llvm/test/CodeGen/AArch64/signbit-shift.ll b/llvm/test/CodeGen/AArch64/signbit-shift.ll
index 0e6da326a31f4..ce8a96386d04c 100644
--- a/llvm/test/CodeGen/AArch64/signbit-shift.ll
+++ b/llvm/test/CodeGen/AArch64/signbit-shift.ll
@@ -128,7 +128,7 @@ define i32 @sel_ifneg_tval_bigger(i32 %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #41 // =0x29
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cinc w0, w8, lt
+; CHECK-NEXT: cinc w0, w8, mi
; CHECK-NEXT: ret
%c = icmp slt i32 %x, 0
%r = select i1 %c, i32 42, i32 41
@@ -162,7 +162,7 @@ define i32 @sel_ifneg_fval_bigger(i32 %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #41 // =0x29
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cinc w0, w8, ge
+; CHECK-NEXT: cinc w0, w8, pl
; CHECK-NEXT: ret
%c = icmp slt i32 %x, 0
%r = select i1 %c, i32 41, i32 42
diff --git a/llvm/test/CodeGen/AArch64/smul_fix_sat.ll b/llvm/test/CodeGen/AArch64/smul_fix_sat.ll
index c2d8d34b9305a..7cb680b8449cf 100644
--- a/llvm/test/CodeGen/AArch64/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/AArch64/smul_fix_sat.ll
@@ -63,7 +63,7 @@ define i32 @func4(i32 %x, i32 %y) nounwind {
; CHECK-NEXT: eor w10, w0, w1
; CHECK-NEXT: mov w8, #-2147483648 // =0x80000000
; CHECK-NEXT: cmp w10, #0
-; CHECK-NEXT: cinv w8, w8, ge
+; CHECK-NEXT: cinv w8, w8, pl
; CHECK-NEXT: cmp x9, w9, sxtw
; CHECK-NEXT: csel w0, w8, w9, ne
; CHECK-NEXT: ret
@@ -79,7 +79,7 @@ define i64 @func5(i64 %x, i64 %y) {
; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
; CHECK-NEXT: cmp x11, #0
; CHECK-NEXT: smulh x10, x0, x1
-; CHECK-NEXT: cinv x8, x8, ge
+; CHECK-NEXT: cinv x8, x8, pl
; CHECK-NEXT: cmp x10, x9, asr #63
; CHECK-NEXT: csel x0, x8, x9, ne
; CHECK-NEXT: ret
@@ -96,7 +96,7 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
; CHECK-NEXT: smull x11, w10, w9
; CHECK-NEXT: eor w9, w10, w9
; CHECK-NEXT: cmp w9, #0
-; CHECK-NEXT: cinv w8, w8, ge
+; CHECK-NEXT: cinv w8, w8, pl
; CHECK-NEXT: cmp x11, w11, sxtw
; CHECK-NEXT: csel w8, w8, w11, ne
; CHECK-NEXT: asr w0, w8, #28
@@ -158,11 +158,11 @@ define <2 x i32> @vec(<2 x i32> %x, <2 x i32> %y) nounwind {
; CHECK-NEXT: cmp w9, #0
; CHECK-NEXT: smull x9, w12, w10
; CHECK-NEXT: eor w10, w12, w10
-; CHECK-NEXT: cinv w12, w8, ge
+; CHECK-NEXT: cinv w12, w8, pl
; CHECK-NEXT: cmp x11, w11, sxtw
; CHECK-NEXT: csel w11, w12, w11, ne
; CHECK-NEXT: cmp w10, #0
-; CHECK-NEXT: cinv w8, w8, ge
+; CHECK-NEXT: cinv w8, w8, pl
; CHECK-NEXT: cmp x9, w9, sxtw
; CHECK-NEXT: csel w8, w8, w9, ne
; CHECK-NEXT: fmov s0, w8
@@ -188,12 +188,12 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-NEXT: cmp w11, #0
; CHECK-NEXT: smull x11, w13, w12
; CHECK-NEXT: eor w12, w13, w12
-; CHECK-NEXT: cinv w13, w8, ge
+; CHECK-NEXT: cinv w13, w8, pl
; CHECK-NEXT: cmp x9, w9, sxtw
; CHECK-NEXT: csel w9, w13, w9, ne
; CHECK-NEXT: cmp w12, #0
; CHECK-NEXT: mov w13, v1.s[3]
-; CHECK-NEXT: cinv w12, w8, ge
+; CHECK-NEXT: cinv w12, w8, pl
; CHECK-NEXT: cmp x11, w11, sxtw
; CHECK-NEXT: csel w11, w12, w11, ne
; CHECK-NEXT: mov w12, v0.s[3]
@@ -203,13 +203,13 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
; CHECK-NEXT: eor w9, w14, w10
; CHECK-NEXT: smull x10, w12, w13
; CHECK-NEXT: cmp w9, #0
-; CHECK-NEXT: cinv w9, w8, ge
+; CHECK-NEXT: cinv w9, w8, pl
; CHECK-NEXT: cmp x11, w11, sxtw
; CHECK-NEXT: csel w9, w9, w11, ne
; CHECK-NEXT: mov v0.s[2], w9
; CHECK-NEXT: eor w9, w12, w13
; CHECK-NEXT: cmp w9, #0
-; CHECK-NEXT: cinv w8, w8, ge
+; CHECK-NEXT: cinv w8, w8, pl
; CHECK-NEXT: cmp x10, w10, sxtw
; CHECK-NEXT: csel w8, w8, w10, ne
; CHECK-NEXT: mov v0.s[3], w8
diff --git a/llvm/test/CodeGen/AArch64/srem-pow2.ll b/llvm/test/CodeGen/AArch64/srem-pow2.ll
index 4c114d185997e..a0124b9ab4a5e 100644
--- a/llvm/test/CodeGen/AArch64/srem-pow2.ll
+++ b/llvm/test/CodeGen/AArch64/srem-pow2.ll
@@ -45,7 +45,7 @@ define i32 @fold_srem_2_i64(i32 %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0x1
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: cneg w0, w8, lt
+; CHECK-NEXT: cneg w0, w8, mi
; CHECK-NEXT: ret
%1 = srem i32 %x, 2
ret i32 %1
@@ -56,7 +56,7 @@ define i64 @fold_srem_2_i32(i64 %x) {
; CHECK: // %bb.0:
; CHECK-NEXT: and x8, x0, #0x1
; CHECK-NEXT: cmp x0, #0
-; CHECK-NEXT: cneg x0, x8, lt
+; CHECK-NEXT: cneg x0, x8, mi
; CHECK-NEXT: ret
%1 = srem i64 %x, 2
ret i64 %1
diff --git a/llvm/test/CodeGen/AArch64/sshl_sat.ll b/llvm/test/CodeGen/AArch64/sshl_sat.ll
index fbcd2db1298f0..be2b3e763733b 100644
--- a/llvm/test/CodeGen/AArch64/sshl_sat.ll
+++ b/llvm/test/CodeGen/AArch64/sshl_sat.ll
@@ -146,7 +146,7 @@ define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
; CHECK-NEXT: mov w9, #-65536 // =0xffff0000
; CHECK-NEXT: mov w10, #-2147483648 // =0x80000000
; CHECK-NEXT: ands w8, w9, w8, lsl #14
-; CHECK-NEXT: cinv w10, w10, ge
+; CHECK-NEXT: cinv w10, w10, pl
; CHECK-NEXT: lsl w9, w8, #3
; CHECK-NEXT: cmp w8, w9, asr #3
; CHECK-NEXT: csel w8, w10, w9, ne
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 3a33405200132..a85ee22200398 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1612,7 +1612,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK0-NEXT: .LBB27_4:
; CHECK0-NEXT: cmp w0, #0
; CHECK0-NEXT: .cfi_restore vg
-; CHECK0-NEXT: cset w21, lt
+; CHECK0-NEXT: cset w21, mi
; CHECK0-NEXT: bl __arm_sme_state
; CHECK0-NEXT: and x22, x0, #0x1
; CHECK0-NEXT: .cfi_offset vg, -40
@@ -1627,7 +1627,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK0-NEXT: smstart sm
; CHECK0-NEXT: .LBB27_8:
; CHECK0-NEXT: cmp w0, #0
-; CHECK0-NEXT: cset w8, ge
+; CHECK0-NEXT: cset w8, pl
; CHECK0-NEXT: tst w8, w21
; CHECK0-NEXT: csel w0, w20, w19, ne
; CHECK0-NEXT: .cfi_restore vg
@@ -1701,7 +1701,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK64-NEXT: .LBB27_4:
; CHECK64-NEXT: cmp w0, #0
; CHECK64-NEXT: .cfi_restore vg
-; CHECK64-NEXT: cset w21, lt
+; CHECK64-NEXT: cset w21, mi
; CHECK64-NEXT: bl __arm_sme_state
; CHECK64-NEXT: and x22, x0, #0x1
; CHECK64-NEXT: .cfi_offset vg, -48
@@ -1716,7 +1716,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK64-NEXT: smstart sm
; CHECK64-NEXT: .LBB27_8:
; CHECK64-NEXT: cmp w0, #0
-; CHECK64-NEXT: cset w8, ge
+; CHECK64-NEXT: cset w8, pl
; CHECK64-NEXT: tst w8, w21
; CHECK64-NEXT: csel w0, w20, w19, ne
; CHECK64-NEXT: .cfi_restore vg
@@ -1799,7 +1799,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK1024-NEXT: .LBB27_4:
; CHECK1024-NEXT: cmp w0, #0
; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: cset w21, lt
+; CHECK1024-NEXT: cset w21, mi
; CHECK1024-NEXT: bl __arm_sme_state
; CHECK1024-NEXT: and x22, x0, #0x1
; CHECK1024-NEXT: .cfi_offset vg, -48
@@ -1815,7 +1815,7 @@ define i32 @f128_libcall(fp128 %v0, fp128 %v1, fp128 %v2, fp128 %v3, i32 %a, i32
; CHECK1024-NEXT: smstart sm
; CHECK1024-NEXT: .LBB27_8:
; CHECK1024-NEXT: cmp w0, #0
-; CHECK1024-NEXT: cset w8, ge
+; CHECK1024-NEXT: cset w8, pl
; CHECK1024-NEXT: tst w8, w21
; CHECK1024-NEXT: csel w0, w20, w19, ne
; CHECK1024-NEXT: .cfi_restore vg
diff --git a/llvm/test/CodeGen/AArch64/tbz-tbnz.ll b/llvm/test/CodeGen/AArch64/tbz-tbnz.ll
index 3fe7346b3db28..4a04934971711 100644
--- a/llvm/test/CodeGen/AArch64/tbz-tbnz.ll
+++ b/llvm/test/CodeGen/AArch64/tbz-tbnz.ll
@@ -200,18 +200,18 @@ define void @test8(i64 %val1, i64 %val2, i64 %val3) {
; CHECK-SD-LABEL: test8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: tst x0, x1
-; CHECK-SD-NEXT: b.ge .LBB7_3
+; CHECK-SD-NEXT: b.pl .LBB7_3
; CHECK-SD-NEXT: // %bb.1:
; CHECK-SD-NEXT: and x8, x1, x2
; CHECK-SD-NEXT: tbnz x8, #63, .LBB7_3
; CHECK-SD-NEXT: // %bb.2: // %if.then2
; CHECK-SD-NEXT: tst x0, x1, lsl #63
-; CHECK-SD-NEXT: b.lt .LBB7_4
+; CHECK-SD-NEXT: b.mi .LBB7_4
; CHECK-SD-NEXT: .LBB7_3: // %if.end
; CHECK-SD-NEXT: ret
; CHECK-SD-NEXT: .LBB7_4: // %if.then3
; CHECK-SD-NEXT: tst x0, x1, lsl #62
-; CHECK-SD-NEXT: b.lt .LBB7_3
+; CHECK-SD-NEXT: b.mi .LBB7_3
; CHECK-SD-NEXT: // %bb.5: // %if.then4
; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
@@ -223,18 +223,18 @@ define void @test8(i64 %val1, i64 %val2, i64 %val3) {
; CHECK-GI-LABEL: test8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: tst x0, x1
-; CHECK-GI-NEXT: b.ge .LBB7_3
+; CHECK-GI-NEXT: b.pl .LBB7_3
; CHECK-GI-NEXT: // %bb.1:
; CHECK-GI-NEXT: tst x1, x2
-; CHECK-GI-NEXT: b.lt .LBB7_3
+; CHECK-GI-NEXT: b.mi .LBB7_3
; CHECK-GI-NEXT: // %bb.2: // %if.then2
; CHECK-GI-NEXT: tst x0, x1, lsl #63
-; CHECK-GI-NEXT: b.lt .LBB7_4
+; CHECK-GI-NEXT: b.mi .LBB7_4
; CHECK-GI-NEXT: .LBB7_3: // %if.end
; CHECK-GI-NEXT: ret
; CHECK-GI-NEXT: .LBB7_4: // %if.then3
; CHECK-GI-NEXT: tst x0, x1, lsl #62
-; CHECK-GI-NEXT: b.lt .LBB7_3
+; CHECK-GI-NEXT: b.mi .LBB7_3
; CHECK-GI-NEXT: // %bb.5: // %if.then4
; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
index 1bdf7bbb7f813..62d41fca10db3 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-bool.ll
@@ -28,7 +28,7 @@ define i32 @reduce_and_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@@ -122,7 +122,7 @@ define i32 @reduce_and_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@@ -200,7 +200,7 @@ define i32 @reduce_and_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i32> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@@ -261,7 +261,7 @@ define i32 @reduce_and_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i64> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> %x)
@@ -307,7 +307,7 @@ define i32 @reduce_or_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@@ -401,7 +401,7 @@ define i32 @reduce_or_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@@ -479,7 +479,7 @@ define i32 @reduce_or_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i32> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@@ -540,7 +540,7 @@ define i32 @reduce_or_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i64> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> %x)
@@ -586,7 +586,7 @@ define i32 @reduce_xor_v1i8(<1 x i8> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.b[0]
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i8> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
@@ -681,7 +681,7 @@ define i32 @reduce_xor_v1i16(<1 x i16> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: smov w8, v0.h[0]
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i16> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
@@ -759,7 +759,7 @@ define i32 @reduce_xor_v1i32(<1 x i32> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: cmp w8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i32> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
@@ -820,7 +820,7 @@ define i32 @reduce_xor_v1i64(<1 x i64> %a0, i32 %a1, i32 %a2) nounwind {
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: cmp x8, #0
-; CHECK-NEXT: csel w0, w0, w1, lt
+; CHECK-NEXT: csel w0, w0, w1, mi
; CHECK-NEXT: ret
%x = icmp slt <1 x i64> %a0, zeroinitializer
%y = call i1 @llvm.vector.reduce.xor.v1i1(<1 x i1> %x)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index 300081dc3ec40..79a8fc35e833d 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -589,7 +589,7 @@ define fp128 @test_v2f128(<2 x fp128> %a) nounwind {
; CHECK-NEXT: bl __lttf2
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: cmp w0, #0
-; CHECK-NEXT: b.ge .LBB18_2
+; CHECK-NEXT: b.pl .LBB18_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .LBB18_2:
>From 60ab44a85773ce5f941e6a1bf5da3a40bf7ce9dd Mon Sep 17 00:00:00 2001
From: AZero13 <gfunni234 at gmail.com>
Date: Fri, 8 Aug 2025 08:25:12 -0400
Subject: [PATCH 2/2] Fix concerns
---
.../Target/AArch64/AArch64ISelLowering.cpp | 4524 +----------------
1 file changed, 4 insertions(+), 4520 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2c02630174ad7..1348b38876d98 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4066,7 +4066,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
Cmp = emitComparison(
SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
DL, DAG);
- AArch64CC = changeIntCCToAArch64CC(CC, RHS);
+ AArch64CC = changeIntCCToAArch64CC(CC);
}
}
@@ -11493,7 +11493,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
!RHSVal->isZero() && !RHSVal->isAllOnes()) {
- AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC, RHS);
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
// "a != C ? x : a" to avoid materializing C.
if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
@@ -11504,7 +11504,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(
assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
- AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC, RHS);
+ AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
Opcode = AArch64ISD::CSINV;
TVal = LHS;
@@ -26340,4520 +26340,4 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
}
- // In the case of non-temporal gather loads and quadword gather loads there's
- // only one addressing mode : "vector + scalar", e.g.
- // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
- // Since we do have intrinsics that allow the arguments to be in a different
- // order, we may need to swap them to match the spec.
- if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
- Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
- Offset.getValueType().isVector())
- std::swap(Base, Offset);
-
- // GLD{FF}1_IMM requires that the offset is an immediate that is:
- // * a multiple of #SizeInBytes,
- // * in the range [0, 31 x #SizeInBytes],
- // where #SizeInBytes is the size in bytes of the loaded items. For
- // immediates outside that range and non-immediate scalar offsets use
- // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
- if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
- Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
- if (!isValidImmForSVEVecImmAddrMode(Offset,
- RetVT.getScalarSizeInBits() / 8)) {
- if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
- Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
- ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
- : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
- else
- Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
- ? AArch64ISD::GLD1_MERGE_ZERO
- : AArch64ISD::GLDFF1_MERGE_ZERO;
-
- std::swap(Base, Offset);
- }
- }
-
- auto &TLI = DAG.getTargetLoweringInfo();
- if (!TLI.isTypeLegal(Base.getValueType()))
- return SDValue();
-
- // Some gather load variants allow unpacked offsets, but only as nxv2i32
- // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
- // nxv2i64. Legalize accordingly.
- if (!OnlyPackedOffsets &&
- Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
- Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
-
- // Return value type that is representable in hardware
- EVT HwRetVt = getSVEContainerType(RetVT);
-
- // Keep the original output value type around - this is needed to be able to
- // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
- // values we want the integer equivalent, so just use HwRetVT.
- SDValue OutVT = DAG.getValueType(RetVT);
- if (RetVT.isFloatingPoint())
- OutVT = DAG.getValueType(HwRetVt);
-
- SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
- SDValue Ops[] = {N->getOperand(0), // Chain
- N->getOperand(2), // Pg
- Base, Offset, OutVT};
-
- SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
- SDValue LoadChain = SDValue(Load.getNode(), 1);
-
- if (RetVT.isInteger() && (RetVT != HwRetVt))
- Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
-
- // If the original return value was FP, bitcast accordingly. Doing it here
- // means that we can avoid adding TableGen patterns for FPs.
- if (RetVT.isFloatingPoint())
- Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
-
- return DAG.getMergeValues({Load, LoadChain}, DL);
-}
-
-static SDValue
-performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- SDLoc DL(N);
- SDValue Src = N->getOperand(0);
- unsigned Opc = Src->getOpcode();
-
- // Sign extend of an unsigned unpack -> signed unpack
- if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
-
- unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
- : AArch64ISD::SUNPKLO;
-
- // Push the sign extend to the operand of the unpack
- // This is necessary where, for example, the operand of the unpack
- // is another unpack:
- // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
- // ->
- // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
- // ->
- // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
- SDValue ExtOp = Src->getOperand(0);
- auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
- EVT EltTy = VT.getVectorElementType();
- (void)EltTy;
-
- assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
- "Sign extending from an invalid type");
-
- EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
-
- SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
- ExtOp, DAG.getValueType(ExtVT));
-
- return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
- }
-
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- if (!EnableCombineMGatherIntrinsics)
- return SDValue();
-
- // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
- // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
- unsigned NewOpc;
- unsigned MemVTOpNum = 4;
- switch (Opc) {
- case AArch64ISD::LD1_MERGE_ZERO:
- NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
- MemVTOpNum = 3;
- break;
- case AArch64ISD::LDNF1_MERGE_ZERO:
- NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
- MemVTOpNum = 3;
- break;
- case AArch64ISD::LDFF1_MERGE_ZERO:
- NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
- MemVTOpNum = 3;
- break;
- case AArch64ISD::GLD1_MERGE_ZERO:
- NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
- break;
- case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
- NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
- break;
- case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
- NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
- break;
- case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
- NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
- break;
- case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
- NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
- break;
- case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
- NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
- break;
- case AArch64ISD::GLD1_IMM_MERGE_ZERO:
- NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
- break;
- case AArch64ISD::GLDFF1_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
- break;
- case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
- break;
- case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
- break;
- case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
- break;
- case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
- break;
- case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
- break;
- case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
- break;
- case AArch64ISD::GLDNT1_MERGE_ZERO:
- NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
- break;
- default:
- return SDValue();
- }
-
- EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
-
- if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
- return SDValue();
-
- EVT DstVT = N->getValueType(0);
- SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
-
- SmallVector<SDValue, 5> Ops;
- for (unsigned I = 0; I < Src->getNumOperands(); ++I)
- Ops.push_back(Src->getOperand(I));
-
- SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
- DCI.CombineTo(N, ExtLoad);
- DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
-
- // Return N so it doesn't get rechecked
- return SDValue(N, 0);
-}
-
-/// Legalize the gather prefetch (scalar + vector addressing mode) when the
-/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
-/// != nxv2i32) do not need legalization.
-static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
- const unsigned OffsetPos = 4;
- SDValue Offset = N->getOperand(OffsetPos);
-
- // Not an unpacked vector, bail out.
- if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
- return SDValue();
-
- // Extend the unpacked offset vector to 64-bit lanes.
- SDLoc DL(N);
- Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
- SmallVector<SDValue, 5> Ops(N->ops());
- // Replace the offset operand with the 64-bit one.
- Ops[OffsetPos] = Offset;
-
- return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
-}
-
-/// Combines a node carrying the intrinsic
-/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
-/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
-/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
-/// sve gather prefetch instruction with vector plus immediate addressing mode.
-static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
- unsigned ScalarSizeInBytes) {
- const unsigned ImmPos = 4, OffsetPos = 3;
- // No need to combine the node if the immediate is valid...
- if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
- return SDValue();
-
- // ...otherwise swap the offset base with the offset...
- SmallVector<SDValue, 5> Ops(N->ops());
- std::swap(Ops[ImmPos], Ops[OffsetPos]);
- // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
- // `aarch64_sve_prfb_gather_uxtw_index`.
- SDLoc DL(N);
- Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
- MVT::i64);
-
- return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
-}
-
-// Return true if the vector operation can guarantee that only the first lane
-// of its result contains data, with all bits in other lanes set to zero.
-static bool isLanes1toNKnownZero(SDValue Op) {
- switch (Op.getOpcode()) {
- default:
- return false;
- case AArch64ISD::ANDV_PRED:
- case AArch64ISD::EORV_PRED:
- case AArch64ISD::FADDA_PRED:
- case AArch64ISD::FADDV_PRED:
- case AArch64ISD::FMAXNMV_PRED:
- case AArch64ISD::FMAXV_PRED:
- case AArch64ISD::FMINNMV_PRED:
- case AArch64ISD::FMINV_PRED:
- case AArch64ISD::ORV_PRED:
- case AArch64ISD::SADDV_PRED:
- case AArch64ISD::SMAXV_PRED:
- case AArch64ISD::SMINV_PRED:
- case AArch64ISD::UADDV_PRED:
- case AArch64ISD::UMAXV_PRED:
- case AArch64ISD::UMINV_PRED:
- return true;
- }
-}
-
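-// Look for insert_vector_elt(zero_vector, extract_vector_elt(X, 0), 0) where
-// lanes 1-N of X are already known to be zero (e.g. X is a reduction result);
-// in that case the explicit zeroing is redundant and X can be used directly.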
-static SDValue removeRedundantInsertVectorElt(SDNode *N) {
- assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
- SDValue InsertVec = N->getOperand(0);
- SDValue InsertElt = N->getOperand(1);
- SDValue InsertIdx = N->getOperand(2);
-
- // We only care about inserts into the first element...
- if (!isNullConstant(InsertIdx))
- return SDValue();
- // ...of a zero'd vector...
- if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
- return SDValue();
- // ...where the inserted data was previously extracted...
- if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- SDValue ExtractVec = InsertElt.getOperand(0);
- SDValue ExtractIdx = InsertElt.getOperand(1);
-
- // ...from the first element of a vector.
- if (!isNullConstant(ExtractIdx))
- return SDValue();
-
- // If we get here we are effectively trying to zero lanes 1-N of a vector.
-
- // Ensure there's no type conversion going on.
- if (N->getValueType(0) != ExtractVec.getValueType())
- return SDValue();
-
- if (!isLanes1toNKnownZero(ExtractVec))
- return SDValue();
-
- // The explicit zeroing is redundant.
- return ExtractVec;
-}
-
-static SDValue
-performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
- if (SDValue Res = removeRedundantInsertVectorElt(N))
- return Res;
-
- return performPostLD1Combine(N, DCI, true);
-}
-
-static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const AArch64Subtarget *Subtarget) {
- SDValue N0 = N->getOperand(0);
- EVT VT = N->getValueType(0);
-
- // If this is fp_round(fpextend), don't fold it; allow ourselves to be folded.
- if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
- return SDValue();
-
- auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
- EVT EltVT = VT.getVectorElementType();
- return EltVT == MVT::f32 || EltVT == MVT::f64;
- };
-
- // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
- // We purposefully don't care about legality of the nodes here as we know
- // they can be split down into something legal.
- if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
- N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
- VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
- VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
- LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
- LN0->getChain(), LN0->getBasePtr(),
- N0.getValueType(), LN0->getMemOperand());
- DCI.CombineTo(N, ExtLoad);
- DCI.CombineTo(
- N0.getNode(),
- DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
- DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
- ExtLoad.getValue(1));
- return SDValue(N, 0); // Return N so it doesn't get rechecked!
- }
-
- return SDValue();
-}
-
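-// Without SVE2's BSL, expand AArch64ISD::BSP for scalable vectors into plain
-// logic ops: bsp(mask, in1, in2) ==> (mask & in1) | (~mask & in2).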
-static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
-
- // Don't expand for NEON, SVE2 or SME
- if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
- return SDValue();
-
- SDLoc DL(N);
-
- SDValue Mask = N->getOperand(0);
- SDValue In1 = N->getOperand(1);
- SDValue In2 = N->getOperand(2);
-
- SDValue InvMask = DAG.getNOT(DL, Mask, VT);
- SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
- SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
- return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
-}
-
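-// duplane128(insert_subvector(undef, bitcast(x), 0), 0) with a 128-bit x is
-// rewritten so the bitcast happens after the DUPLANE128, letting the insert
-// and the dup operate on the packed container type of x's element type.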
-static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
-
- SDValue Insert = N->getOperand(0);
- if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
- return SDValue();
-
- if (!Insert.getOperand(0).isUndef())
- return SDValue();
-
- uint64_t IdxInsert = Insert.getConstantOperandVal(2);
- uint64_t IdxDupLane = N->getConstantOperandVal(1);
- if (IdxInsert != 0 || IdxDupLane != 0)
- return SDValue();
-
- SDValue Bitcast = Insert.getOperand(1);
- if (Bitcast.getOpcode() != ISD::BITCAST)
- return SDValue();
-
- SDValue Subvec = Bitcast.getOperand(0);
- EVT SubvecVT = Subvec.getValueType();
- if (!SubvecVT.is128BitVector())
- return SDValue();
- EVT NewSubvecVT =
- getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
-
- SDLoc DL(N);
- SDValue NewInsert =
- DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
- DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
- SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
- NewInsert, N->getOperand(1));
- return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
-}
-
-// Try to combine mull with uzp1.
-static SDValue tryCombineMULLWithUZP1(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
-
- SDValue ExtractHigh;
- SDValue ExtractLow;
- SDValue TruncHigh;
- SDValue TruncLow;
- SDLoc DL(N);
-
- // Check the operands are trunc and extract_high.
- if (isEssentiallyExtractHighSubvector(LHS) &&
- RHS.getOpcode() == ISD::TRUNCATE) {
- TruncHigh = RHS;
- if (LHS.getOpcode() == ISD::BITCAST)
- ExtractHigh = LHS.getOperand(0);
- else
- ExtractHigh = LHS;
- } else if (isEssentiallyExtractHighSubvector(RHS) &&
- LHS.getOpcode() == ISD::TRUNCATE) {
- TruncHigh = LHS;
- if (RHS.getOpcode() == ISD::BITCAST)
- ExtractHigh = RHS.getOperand(0);
- else
- ExtractHigh = RHS;
- } else
- return SDValue();
-
- // If the truncate's operand is a splat (a DUP or splat BUILD_VECTOR), do not
- // combine the op with uzp1; it causes regressions in
- // test/CodeGen/AArch64/aarch64-smull.ll.
- SDValue TruncHighOp = TruncHigh.getOperand(0);
- EVT TruncHighOpVT = TruncHighOp.getValueType();
- if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
- DAG.isSplatValue(TruncHighOp, false))
- return SDValue();
-
- // Check there is other extract_high with same source vector.
- // For example,
- //
- // t18: v4i16 = extract_subvector t2, Constant:i64<0>
- // t12: v4i16 = truncate t11
- // t31: v4i32 = AArch64ISD::SMULL t18, t12
- // t23: v4i16 = extract_subvector t2, Constant:i64<4>
- // t16: v4i16 = truncate t15
- // t30: v4i32 = AArch64ISD::SMULL t23, t16
- //
- // This dagcombine assumes the two extract_high nodes use the same source
- // vector in order to detect the pair of mulls. If they have different source
- // vectors, this code will not work.
- // TODO: Should also try to look through a bitcast.
- bool HasFoundMULLow = true;
- SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
- if (ExtractHighSrcVec->use_size() != 2)
- HasFoundMULLow = false;
-
- // Find ExtractLow.
- for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
- if (User == ExtractHigh.getNode())
- continue;
-
- if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
- !isNullConstant(User->getOperand(1))) {
- HasFoundMULLow = false;
- break;
- }
-
- ExtractLow.setNode(User);
- }
-
- if (!ExtractLow || !ExtractLow->hasOneUse())
- HasFoundMULLow = false;
-
- // Check ExtractLow's user.
- if (HasFoundMULLow) {
- SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
- if (ExtractLowUser->getOpcode() != N->getOpcode()) {
- HasFoundMULLow = false;
- } else {
- if (ExtractLowUser->getOperand(0) == ExtractLow) {
- if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
- TruncLow = ExtractLowUser->getOperand(1);
- else
- HasFoundMULLow = false;
- } else {
- if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
- TruncLow = ExtractLowUser->getOperand(0);
- else
- HasFoundMULLow = false;
- }
- }
- }
-
- // If the truncate's operand is a splat (a DUP or splat BUILD_VECTOR), do not
- // combine the op with uzp1; it causes regressions in
- // test/CodeGen/AArch64/aarch64-smull.ll.
- EVT TruncHighVT = TruncHigh.getValueType();
- EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
- SDValue TruncLowOp =
- HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
- EVT TruncLowOpVT = TruncLowOp.getValueType();
- if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
- DAG.isSplatValue(TruncLowOp, false)))
- return SDValue();
-
- // Create uzp1, extract_high and extract_low.
- if (TruncHighOpVT != UZP1VT)
- TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
- if (TruncLowOpVT != UZP1VT)
- TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
-
- SDValue UZP1 =
- DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
- SDValue HighIdxCst =
- DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
- SDValue NewTruncHigh =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
- DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
-
- if (HasFoundMULLow) {
- EVT TruncLowVT = TruncLow.getValueType();
- SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
- UZP1, ExtractLow.getOperand(1));
- DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
- }
-
- return SDValue(N, 0);
-}
-
-static SDValue performMULLCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- if (SDValue Val =
- tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
- return Val;
-
- if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
- return Val;
-
- return SDValue();
-}
-
-static SDValue
-performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- // Let's do below transform.
- //
- // t34: v4i32 = AArch64ISD::UADDLV t2
- // t35: i32 = extract_vector_elt t34, Constant:i64<0>
- // t7: i64 = zero_extend t35
- // t20: v1i64 = scalar_to_vector t7
- // ==>
- // t34: v4i32 = AArch64ISD::UADDLV t2
- // t39: v2i32 = extract_subvector t34, Constant:i64<0>
- // t40: v1i64 = AArch64ISD::NVCAST t39
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- EVT VT = N->getValueType(0);
- if (VT != MVT::v1i64)
- return SDValue();
-
- SDValue ZEXT = N->getOperand(0);
- if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
- return SDValue();
-
- SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
- if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- EXTRACT_VEC_ELT.getValueType() != MVT::i32)
- return SDValue();
-
- if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
- return SDValue();
-
- SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
- if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
- UADDLV.getValueType() != MVT::v4i32 ||
- UADDLV.getOperand(0).getValueType() != MVT::v8i8)
- return SDValue();
-
- // Let's generate new sequence with AArch64ISD::NVCAST.
- SDLoc DL(N);
- SDValue EXTRACT_SUBVEC =
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
- DAG.getConstant(0, DL, MVT::i64));
- SDValue NVCAST =
- DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
-
- return NVCAST;
-}
-
-/// If the operand is a bitwise AND with a constant RHS, the shift has a
-/// constant RHS, and the shift is the AND's only user, we can pull the AND
-/// out of the shift, i.e.
-///
-/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
-///
-/// We prefer this canonical form to match existing isel patterns.
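-/// For example, (shl (and x, 0xff), 4) becomes (and (shl x, 4), 0xff0).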
-static SDValue performSHLCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
- return SDValue();
-
- SDValue C1 = Op0->getOperand(1);
- SDValue C2 = N->getOperand(1);
- if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
- return SDValue();
-
- // Might be folded into shifted op, do not lower.
- if (N->hasOneUse()) {
- unsigned UseOpc = N->user_begin()->getOpcode();
- if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
- UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
- return SDValue();
- }
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
- // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
- // causing infinite loop. Result may also be worse.
- SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
- if (!isa<ConstantSDNode>(NewRHS))
- return SDValue();
-
- SDValue X = Op0->getOperand(0);
- SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
- return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
-}
-
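-// Lower the aarch64.rndr/rndrrs intrinsics to an MRS read of the RNDR/RNDRRS
-// system register; the NZCV flags produced by the read are converted into the
-// intrinsic's i1 status result with a CSINC on the NE condition.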
-static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
- unsigned IntrinsicID = N->getConstantOperandVal(1);
- auto Register =
- (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
- : AArch64SysReg::RNDRRS);
- SDLoc DL(N);
- SDValue A = DAG.getNode(
- AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
- N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
- SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
- DAG.getConstant(0, DL, MVT::i32),
- DAG.getConstant(0, DL, MVT::i32),
- getCondCode(DAG, AArch64CC::NE), A.getValue(1));
- return DAG.getMergeValues(
- {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
-}
-
-SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- SelectionDAG &DAG = DCI.DAG;
- switch (N->getOpcode()) {
- default:
- LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
- break;
- case ISD::VECREDUCE_AND:
- case ISD::VECREDUCE_OR:
- case ISD::VECREDUCE_XOR:
- return performVecReduceBitwiseCombine(N, DCI, DAG);
- case ISD::ADD:
- case ISD::SUB:
- return performAddSubCombine(N, DCI);
- case ISD::BUILD_VECTOR:
- return performBuildVectorCombine(N, DCI, DAG);
- case ISD::SMIN:
- return performSMINCombine(N, DAG);
- case ISD::TRUNCATE:
- return performTruncateCombine(N, DAG, DCI);
- case AArch64ISD::ANDS:
- return performFlagSettingCombine(N, DCI, ISD::AND);
- case AArch64ISD::ADC:
- if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
- return R;
- return foldADCToCINC(N, DAG);
- case AArch64ISD::SBC:
- return foldOverflowCheck(N, DAG, /* IsAdd */ false);
- case AArch64ISD::ADCS:
- if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
- return R;
- return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
- case AArch64ISD::SBCS:
- if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
- return R;
- return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
- case AArch64ISD::BICi: {
- APInt DemandedBits =
- APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
- APInt DemandedElts =
- APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
-
- if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(
- SDValue(N, 0), DemandedBits, DemandedElts, DCI))
- return SDValue();
-
- break;
- }
- case ISD::XOR:
- return performXorCombine(N, DAG, DCI, Subtarget);
- case ISD::MUL:
- return performMulCombine(N, DAG, DCI, Subtarget);
- case ISD::SINT_TO_FP:
- case ISD::UINT_TO_FP:
- return performIntToFpCombine(N, DAG, DCI, Subtarget);
- case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT:
- case ISD::FP_TO_SINT_SAT:
- case ISD::FP_TO_UINT_SAT:
- return performFpToIntCombine(N, DAG, DCI, Subtarget);
- case ISD::OR:
- return performORCombine(N, DCI, Subtarget, *this);
- case ISD::AND:
- return performANDCombine(N, DCI);
- case ISD::FADD:
- return performFADDCombine(N, DCI);
- case ISD::INTRINSIC_WO_CHAIN:
- return performIntrinsicCombine(N, DCI, Subtarget);
- case ISD::ANY_EXTEND:
- case ISD::ZERO_EXTEND:
- case ISD::SIGN_EXTEND:
- return performExtendCombine(N, DCI, DAG);
- case ISD::SIGN_EXTEND_INREG:
- return performSignExtendInRegCombine(N, DCI, DAG);
- case ISD::CONCAT_VECTORS:
- return performConcatVectorsCombine(N, DCI, DAG);
- case ISD::EXTRACT_SUBVECTOR:
- return performExtractSubvectorCombine(N, DCI, DAG);
- case ISD::INSERT_SUBVECTOR:
- return performInsertSubvectorCombine(N, DCI, DAG);
- case ISD::SELECT:
- return performSelectCombine(N, DCI);
- case ISD::VSELECT:
- return performVSelectCombine(N, DCI.DAG);
- case ISD::SETCC:
- return performSETCCCombine(N, DCI, DAG);
- case ISD::LOAD:
- return performLOADCombine(N, DCI, DAG, Subtarget);
- case ISD::STORE:
- return performSTORECombine(N, DCI, DAG, Subtarget);
- case ISD::MSTORE:
- return performMSTORECombine(N, DCI, DAG, Subtarget);
- case ISD::MGATHER:
- case ISD::MSCATTER:
- case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
- return performMaskedGatherScatterCombine(N, DCI, DAG);
- case ISD::FP_EXTEND:
- return performFPExtendCombine(N, DAG, DCI, Subtarget);
- case AArch64ISD::BRCOND:
- return performBRCONDCombine(N, DCI, DAG);
- case AArch64ISD::TBNZ:
- case AArch64ISD::TBZ:
- return performTBZCombine(N, DCI, DAG);
- case AArch64ISD::CSEL:
- return performCSELCombine(N, DCI, DAG);
- case AArch64ISD::DUP:
- case AArch64ISD::DUPLANE8:
- case AArch64ISD::DUPLANE16:
- case AArch64ISD::DUPLANE32:
- case AArch64ISD::DUPLANE64:
- return performDUPCombine(N, DCI);
- case AArch64ISD::DUPLANE128:
- return performDupLane128Combine(N, DAG);
- case AArch64ISD::NVCAST:
- return performNVCASTCombine(N, DAG);
- case AArch64ISD::SPLICE:
- return performSpliceCombine(N, DAG);
- case AArch64ISD::UUNPKLO:
- case AArch64ISD::UUNPKHI:
- return performUnpackCombine(N, DAG, Subtarget);
- case AArch64ISD::UZP1:
- case AArch64ISD::UZP2:
- return performUzpCombine(N, DAG, Subtarget);
- case AArch64ISD::SETCC_MERGE_ZERO:
- return performSetccMergeZeroCombine(N, DCI);
- case AArch64ISD::REINTERPRET_CAST:
- return performReinterpretCastCombine(N);
- case AArch64ISD::GLD1_MERGE_ZERO:
- case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
- case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
- case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
- case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
- case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
- case AArch64ISD::GLD1_IMM_MERGE_ZERO:
- case AArch64ISD::GLD1S_MERGE_ZERO:
- case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
- case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
- case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
- case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
- case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
- case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
- return performGLD1Combine(N, DAG);
- case AArch64ISD::VASHR:
- case AArch64ISD::VLSHR:
- return performVectorShiftCombine(N, *this, DCI);
- case AArch64ISD::SUNPKLO:
- return performSunpkloCombine(N, DAG);
- case AArch64ISD::BSP:
- return performBSPExpandForSVE(N, DAG, Subtarget);
- case ISD::INSERT_VECTOR_ELT:
- return performInsertVectorEltCombine(N, DCI);
- case ISD::EXTRACT_VECTOR_ELT:
- return performExtractVectorEltCombine(N, DCI, Subtarget);
- case ISD::VECREDUCE_ADD:
- return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
- case ISD::GET_ACTIVE_LANE_MASK:
- return performActiveLaneMaskCombine(N, DCI, Subtarget);
- case AArch64ISD::UADDV:
- return performUADDVCombine(N, DAG);
- case AArch64ISD::SMULL:
- case AArch64ISD::UMULL:
- case AArch64ISD::PMULL:
- return performMULLCombine(N, DCI, DAG);
- case ISD::INTRINSIC_VOID:
- case ISD::INTRINSIC_W_CHAIN:
- switch (N->getConstantOperandVal(1)) {
- case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
- return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
- case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
- return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
- case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
- return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
- case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
- return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
- case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
- case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
- case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
- case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
- case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
- case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
- case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
- case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
- return legalizeSVEGatherPrefetchOffsVec(N, DAG);
- case Intrinsic::aarch64_neon_ld2:
- case Intrinsic::aarch64_neon_ld3:
- case Intrinsic::aarch64_neon_ld4:
- case Intrinsic::aarch64_neon_ld1x2:
- case Intrinsic::aarch64_neon_ld1x3:
- case Intrinsic::aarch64_neon_ld1x4:
- case Intrinsic::aarch64_neon_ld2lane:
- case Intrinsic::aarch64_neon_ld3lane:
- case Intrinsic::aarch64_neon_ld4lane:
- case Intrinsic::aarch64_neon_ld2r:
- case Intrinsic::aarch64_neon_ld3r:
- case Intrinsic::aarch64_neon_ld4r:
- case Intrinsic::aarch64_neon_st2:
- case Intrinsic::aarch64_neon_st3:
- case Intrinsic::aarch64_neon_st4:
- case Intrinsic::aarch64_neon_st1x2:
- case Intrinsic::aarch64_neon_st1x3:
- case Intrinsic::aarch64_neon_st1x4:
- case Intrinsic::aarch64_neon_st2lane:
- case Intrinsic::aarch64_neon_st3lane:
- case Intrinsic::aarch64_neon_st4lane:
- return performNEONPostLDSTCombine(N, DCI, DAG);
- case Intrinsic::aarch64_sve_ldnt1:
- return performLDNT1Combine(N, DAG);
- case Intrinsic::aarch64_sve_ld1rq:
- return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
- case Intrinsic::aarch64_sve_ld1ro:
- return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
- case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldnt1_gather:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldnt1_gather_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ld1:
- return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldnf1:
- return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldff1:
- return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_st1:
- return performST1Combine(N, DAG);
- case Intrinsic::aarch64_sve_stnt1:
- return performSTNT1Combine(N, DAG);
- case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
- case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
- case Intrinsic::aarch64_sve_stnt1_scatter:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
- case Intrinsic::aarch64_sve_stnt1_scatter_index:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
- case Intrinsic::aarch64_sve_ld1_gather:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
- case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ld1q_gather_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ld1_gather_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLD1_SCALED_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ld1_gather_sxtw:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ld1_gather_uxtw:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldff1_gather:
- return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldff1_gather_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
- case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
- return performGatherLoadCombine(N, DAG,
- AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
- case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
- case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
- case Intrinsic::aarch64_sve_st1q_scatter_index:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
- case Intrinsic::aarch64_sve_st1_scatter:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
- case Intrinsic::aarch64_sve_st1_scatter_index:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
- case Intrinsic::aarch64_sve_st1_scatter_sxtw:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_st1_scatter_uxtw:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
- return performScatterStoreCombine(N, DAG,
- AArch64ISD::SST1_SXTW_SCALED_PRED,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
- return performScatterStoreCombine(N, DAG,
- AArch64ISD::SST1_UXTW_SCALED_PRED,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
- return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
- case Intrinsic::aarch64_rndr:
- case Intrinsic::aarch64_rndrrs:
- return performRNDRCombine(N, DAG);
- case Intrinsic::aarch64_sme_ldr_zt:
- return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
- DAG.getVTList(MVT::Other), N->getOperand(0),
- N->getOperand(2), N->getOperand(3));
- case Intrinsic::aarch64_sme_str_zt:
- return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
- DAG.getVTList(MVT::Other), N->getOperand(0),
- N->getOperand(2), N->getOperand(3));
- default:
- break;
- }
- break;
- case ISD::GlobalAddress:
- return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
- case ISD::CTLZ:
- return performCTLZCombine(N, DAG, Subtarget);
- case ISD::SCALAR_TO_VECTOR:
- return performScalarToVectorCombine(N, DCI, DAG);
- case ISD::SHL:
- return performSHLCombine(N, DCI, DAG);
- }
- return SDValue();
-}
-
-// Check if the return value is used as only a return value, as otherwise
-// we can't perform a tail-call. In particular, we need to check for
-// target ISD nodes that are returns and any other "odd" constructs
-// that the generic analysis code won't necessarily catch.
-bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
- SDValue &Chain) const {
- if (N->getNumValues() != 1)
- return false;
- if (!N->hasNUsesOfValue(1, 0))
- return false;
-
- SDValue TCChain = Chain;
- SDNode *Copy = *N->user_begin();
- if (Copy->getOpcode() == ISD::CopyToReg) {
- // If the copy has a glue operand, we conservatively assume it isn't safe to
- // perform a tail call.
- if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
- MVT::Glue)
- return false;
- TCChain = Copy->getOperand(0);
- } else if (Copy->getOpcode() != ISD::FP_EXTEND)
- return false;
-
- bool HasRet = false;
- for (SDNode *Node : Copy->users()) {
- if (Node->getOpcode() != AArch64ISD::RET_GLUE)
- return false;
- HasRet = true;
- }
-
- if (!HasRet)
- return false;
-
- Chain = TCChain;
- return true;
-}
-
-// Return whether an instruction can potentially be optimized to a tail
-// call. This will cause the optimizers to attempt to move, or duplicate,
-// return instructions to help enable tail call optimizations for this
-// instruction.
-bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
- return CI->isTailCall();
-}
-
-bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
- Register Offset, bool IsPre,
- MachineRegisterInfo &MRI) const {
- auto CstOffset = getIConstantVRegVal(Offset, MRI);
- if (!CstOffset || CstOffset->isZero())
- return false;
-
- // All of the indexed addressing mode instructions take a signed 9 bit
- // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
- // encodes the sign/indexing direction.
- return isInt<9>(CstOffset->getSExtValue());
-}
-
-bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
- SDValue &Base,
- SDValue &Offset,
- SelectionDAG &DAG) const {
- if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
- return false;
-
- // Non-null if there is exactly one user of the loaded value (ignoring chain).
- SDNode *ValOnlyUser = nullptr;
- for (SDUse &U : N->uses()) {
- if (U.getResNo() == 1)
- continue; // Ignore chain.
- if (ValOnlyUser == nullptr)
- ValOnlyUser = U.getUser();
- else {
- ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
- break;
- }
- }
-
- auto IsUndefOrZero = [](SDValue V) {
- return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
- };
-
- // If the only user of the value is a scalable vector splat, it is
- // preferable to do a replicating load (ld1r*).
- if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
- (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
- (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
- IsUndefOrZero(ValOnlyUser->getOperand(2)))))
- return false;
-
- Base = Op->getOperand(0);
- // All of the indexed addressing mode instructions take a signed
- // 9 bit immediate offset.
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
- int64_t RHSC = RHS->getSExtValue();
- if (Op->getOpcode() == ISD::SUB)
- RHSC = -(uint64_t)RHSC;
- if (!isInt<9>(RHSC))
- return false;
- // When big-endian VLD1/VST1 are used for vector load and store, they only
- // allow an offset that's equal to the store size.
- EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
- if (!Subtarget->isLittleEndian() && MemType.isVector() &&
- (uint64_t)RHSC != MemType.getStoreSize())
- return false;
- // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
- // when dealing with subtraction.
- Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
- return true;
- }
- return false;
-}
-
-bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
- SDValue &Offset,
- ISD::MemIndexedMode &AM,
- SelectionDAG &DAG) const {
- EVT VT;
- SDValue Ptr;
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- VT = LD->getMemoryVT();
- Ptr = LD->getBasePtr();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- VT = ST->getMemoryVT();
- Ptr = ST->getBasePtr();
- } else
- return false;
-
- if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
- return false;
- AM = ISD::PRE_INC;
- return true;
-}
-
-bool AArch64TargetLowering::getPostIndexedAddressParts(
- SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
- ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
- EVT VT;
- SDValue Ptr;
- if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
- VT = LD->getMemoryVT();
- Ptr = LD->getBasePtr();
- } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
- VT = ST->getMemoryVT();
- Ptr = ST->getBasePtr();
- } else
- return false;
-
- if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
- return false;
- // Post-indexing updates the base, so it's not a valid transform
- // if that's not the same as the load's pointer.
- if (Ptr != Base)
- return false;
- AM = ISD::POST_INC;
- return true;
-}
-
-static void replaceBoolVectorBitcast(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG) {
- SDLoc DL(N);
- SDValue Op = N->getOperand(0);
- EVT VT = N->getValueType(0);
- [[maybe_unused]] EVT SrcVT = Op.getValueType();
- assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
- "Must be bool vector.");
-
- // Special handling for Clang's __builtin_convertvector. For vectors with <8
- // elements, it adds a vector concatenation with undef(s). If we encounter
- // this here, we can skip the concat.
- if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
- bool AllUndef = true;
- for (unsigned I = 1; I < Op.getNumOperands(); ++I)
- AllUndef &= Op.getOperand(I).isUndef();
-
- if (AllUndef)
- Op = Op.getOperand(0);
- }
-
- SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
- if (VectorBits)
- Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
-}
-
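-// Bitcast a scalar to an illegal small vector type (e.g. i32 -> v2i16) by
-// moving the scalar into lane zero of a wider legal vector, bitcasting that
-// vector, and extracting the low subvector of the requested type.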
-static void CustomNonLegalBITCASTResults(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG, EVT ExtendVT,
- EVT CastVT) {
- SDLoc DL(N);
- SDValue Op = N->getOperand(0);
- EVT VT = N->getValueType(0);
-
- // Use SCALAR_TO_VECTOR for lane zero
- SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
- SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
- SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
- Results.push_back(
- DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
-}
-
-void AArch64TargetLowering::ReplaceBITCASTResults(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
- SDLoc DL(N);
- SDValue Op = N->getOperand(0);
- EVT VT = N->getValueType(0);
- EVT SrcVT = Op.getValueType();
-
- if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
- CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
- return;
- }
-
- if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
- CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
- return;
- }
-
- if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
- CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
- return;
- }
-
- if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
- assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
- "Expected fp->int bitcast!");
-
- // Bitcasting between unpacked vector types of different element counts is
- // not a NOP because the live elements are laid out differently.
- // 01234567
- // e.g. nxv2i32 = XX??XX??
- // nxv4f16 = X?X?X?X?
- if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
- return;
-
- SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
- return;
- }
-
- if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
- !VT.isVector())
- return replaceBoolVectorBitcast(N, Results, DAG);
-
- if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
- return;
-
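- // Bitcast (b)f16 -> i16 by inserting the value into the h sub-register of a
- // 32-bit FP register, bitcasting that to i32 and truncating to i16.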
- Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
- DAG.getUNDEF(MVT::i32), Op);
- Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
-}
-
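-// Replace a 256-bit add/fadd of x and shuffle(x, <1,0,3,2,...>) with an ADDP
-// of the two halves of x, followed by a shuffle that duplicates each pairwise
-// sum back into both lanes of its pair.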
-static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- EVT VT = N->getValueType(0);
- if (!VT.is256BitVector() ||
- (VT.getScalarType().isFloatingPoint() &&
- !N->getFlags().hasAllowReassociation()) ||
- (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
- VT.getScalarType() == MVT::bf16)
- return;
-
- SDValue X = N->getOperand(0);
- auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
- if (!Shuf) {
- Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
- X = N->getOperand(1);
- if (!Shuf)
- return;
- }
-
- if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
- return;
-
- // Check the mask is 1,0,3,2,5,4,...
- ArrayRef<int> Mask = Shuf->getMask();
- for (int I = 0, E = Mask.size(); I < E; I++)
- if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
- return;
-
- SDLoc DL(N);
- auto LoHi = DAG.SplitVector(X, DL);
- assert(LoHi.first.getValueType() == LoHi.second.getValueType());
- SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
- LoHi.first, LoHi.second);
-
- // Shuffle the elements back into order.
- SmallVector<int> NMask;
- for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
- NMask.push_back(I);
- NMask.push_back(I);
- }
- Results.push_back(
- DAG.getVectorShuffle(VT, DL,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
- DAG.getUNDEF(LoHi.first.getValueType())),
- DAG.getUNDEF(VT), NMask));
-}
-
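-// Split the reduction's vector operand in half, combine the two halves
-// element-wise with InterOp, then reduce the narrower result across the
-// vector with AcrossOp.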
-static void ReplaceReductionResults(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG, unsigned InterOp,
- unsigned AcrossOp) {
- EVT LoVT, HiVT;
- SDValue Lo, Hi;
- SDLoc DL(N);
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
- std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
- SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
- SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
- Results.push_back(SplitVal);
-}
-
-void AArch64TargetLowering::ReplaceExtractSubVectorResults(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
- SDValue In = N->getOperand(0);
- EVT InVT = In.getValueType();
-
- // Common code will handle these just fine.
- if (!InVT.isScalableVector() || !InVT.isInteger())
- return;
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // The following checks bail if this is not a halving operation.
-
- ElementCount ResEC = VT.getVectorElementCount();
-
- if (InVT.getVectorElementCount() != (ResEC * 2))
- return;
-
- auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!CIndex)
- return;
-
- unsigned Index = CIndex->getZExtValue();
- if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
- return;
-
- unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
- EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
-
- SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
-}
-
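-// An nxv32i1 get.active.lane.mask is lowered to the sve.whilelo.x2 intrinsic,
-// which produces the mask as a pair of nxv16i1 halves that are concatenated
-// back into the full result.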
-void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
- assert((Subtarget->hasSVE2p1() ||
- (Subtarget->hasSME2() && Subtarget->isStreaming())) &&
- "Custom lower of get.active.lane.mask missing required feature.");
-
- assert(N->getValueType(0) == MVT::nxv32i1 &&
- "Unexpected result type for get.active.lane.mask");
-
- SDLoc DL(N);
- SDValue Idx = N->getOperand(0);
- SDValue TC = N->getOperand(1);
-
- assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
- "Unexpected operand type for get.active.lane.mask");
-
- if (Idx.getValueType() != MVT::i64) {
- Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
- TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
- }
-
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
- EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
- auto WideMask =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
-
- Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
- {WideMask.getValue(0), WideMask.getValue(1)}));
-}
-
-// Create an even/odd pair of X registers holding integer value V.
-static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
- SDLoc DL(V.getNode());
- auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
- if (DAG.getDataLayout().isBigEndian())
- std::swap(VLo, VHi);
- SDValue RegClass =
- DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
- SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
- SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
- const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
- return SDValue(
- DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
-}
-
-static void ReplaceCMP_SWAP_128Results(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- assert(N->getValueType(0) == MVT::i128 &&
- "AtomicCmpSwap on types less than 128 should be legal");
-
- MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
- if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
- // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
- // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
- SDValue Ops[] = {
- createGPRPairNode(DAG, N->getOperand(2)), // Compare value
- createGPRPairNode(DAG, N->getOperand(3)), // Store value
- N->getOperand(1), // Ptr
- N->getOperand(0), // Chain in
- };
-
- unsigned Opcode;
- switch (MemOp->getMergedOrdering()) {
- case AtomicOrdering::Monotonic:
- Opcode = AArch64::CASPX;
- break;
- case AtomicOrdering::Acquire:
- Opcode = AArch64::CASPAX;
- break;
- case AtomicOrdering::Release:
- Opcode = AArch64::CASPLX;
- break;
- case AtomicOrdering::AcquireRelease:
- case AtomicOrdering::SequentiallyConsistent:
- Opcode = AArch64::CASPALX;
- break;
- default:
- llvm_unreachable("Unexpected ordering!");
- }
-
- MachineSDNode *CmpSwap = DAG.getMachineNode(
- Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
- DAG.setNodeMemRefs(CmpSwap, {MemOp});
-
- unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
- if (DAG.getDataLayout().isBigEndian())
- std::swap(SubReg1, SubReg2);
- SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0));
- SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0));
- Results.push_back(
- DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
- Results.push_back(SDValue(CmpSwap, 1)); // Chain out
- return;
- }
-
- unsigned Opcode;
- switch (MemOp->getMergedOrdering()) {
- case AtomicOrdering::Monotonic:
- Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
- break;
- case AtomicOrdering::Acquire:
- Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
- break;
- case AtomicOrdering::Release:
- Opcode = AArch64::CMP_SWAP_128_RELEASE;
- break;
- case AtomicOrdering::AcquireRelease:
- case AtomicOrdering::SequentiallyConsistent:
- Opcode = AArch64::CMP_SWAP_128;
- break;
- default:
- llvm_unreachable("Unexpected ordering!");
- }
-
- SDLoc DL(N);
- auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
- auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
- SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
- New.first, New.second, N->getOperand(0)};
- SDNode *CmpSwap = DAG.getMachineNode(
- Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
- Ops);
- DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
-
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
- SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
- Results.push_back(SDValue(CmpSwap, 3));
-}
-
-static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
- AtomicOrdering Ordering) {
- // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
- // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
- // the type is not legal. Therefore we shouldn't expect to see a 128-bit
- // ATOMIC_LOAD_CLR at any point.
- assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
- "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
- assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
- assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
-
- if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
- // The operand will need to be XORed in a separate step.
- switch (Ordering) {
- case AtomicOrdering::Monotonic:
- return AArch64::LDCLRP;
- break;
- case AtomicOrdering::Acquire:
- return AArch64::LDCLRPA;
- break;
- case AtomicOrdering::Release:
- return AArch64::LDCLRPL;
- break;
- case AtomicOrdering::AcquireRelease:
- case AtomicOrdering::SequentiallyConsistent:
- return AArch64::LDCLRPAL;
- break;
- default:
- llvm_unreachable("Unexpected ordering!");
- }
- }
-
- if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
- switch (Ordering) {
- case AtomicOrdering::Monotonic:
- return AArch64::LDSETP;
- break;
- case AtomicOrdering::Acquire:
- return AArch64::LDSETPA;
- break;
- case AtomicOrdering::Release:
- return AArch64::LDSETPL;
- break;
- case AtomicOrdering::AcquireRelease:
- case AtomicOrdering::SequentiallyConsistent:
- return AArch64::LDSETPAL;
- break;
- default:
- llvm_unreachable("Unexpected ordering!");
- }
- }
-
- if (ISDOpcode == ISD::ATOMIC_SWAP) {
- switch (Ordering) {
- case AtomicOrdering::Monotonic:
- return AArch64::SWPP;
- break;
- case AtomicOrdering::Acquire:
- return AArch64::SWPPA;
- break;
- case AtomicOrdering::Release:
- return AArch64::SWPPL;
- break;
- case AtomicOrdering::AcquireRelease:
- case AtomicOrdering::SequentiallyConsistent:
- return AArch64::SWPPAL;
- break;
- default:
- llvm_unreachable("Unexpected ordering!");
- }
- }
-
- llvm_unreachable("Unexpected ISDOpcode!");
-}
-
-static void ReplaceATOMIC_LOAD_128Results(SDNode *N,
- SmallVectorImpl<SDValue> &Results,
- SelectionDAG &DAG,
- const AArch64Subtarget *Subtarget) {
- // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
- // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
- // rather than the CASP instructions, because CASP has register classes for
- // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
- // to present them as single operands. LSE128 instructions use the GPR64
- // register class (because the pair does not have to be sequential), like
- // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
-
- assert(N->getValueType(0) == MVT::i128 &&
- "AtomicLoadXXX on types less than 128 should be legal");
-
- if (!Subtarget->hasLSE128())
- return;
-
- MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
- const SDValue &Chain = N->getOperand(0);
- const SDValue &Ptr = N->getOperand(1);
- const SDValue &Val128 = N->getOperand(2);
- std::pair<SDValue, SDValue> Val2x64 =
- DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
-
- const unsigned ISDOpcode = N->getOpcode();
- const unsigned MachineOpcode =
- getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
-
- if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
- SDLoc DL(Val128);
- Val2x64.first =
- DAG.getNode(ISD::XOR, DL, MVT::i64,
- DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
- Val2x64.second =
- DAG.getNode(ISD::XOR, DL, MVT::i64,
- DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
- }
-
- SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
- if (DAG.getDataLayout().isBigEndian())
- std::swap(Ops[0], Ops[1]);
-
- MachineSDNode *AtomicInst =
- DAG.getMachineNode(MachineOpcode, SDLoc(N),
- DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
-
- DAG.setNodeMemRefs(AtomicInst, {MemOp});
-
- SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
- if (DAG.getDataLayout().isBigEndian())
- std::swap(Lo, Hi);
-
- Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
- Results.push_back(SDValue(AtomicInst, 2)); // Chain out
-}
-
-void AArch64TargetLowering::ReplaceNodeResults(
- SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
- switch (N->getOpcode()) {
- default:
- llvm_unreachable("Don't know how to custom expand this");
- case ISD::BITCAST:
- ReplaceBITCASTResults(N, Results, DAG);
- return;
- case ISD::VECREDUCE_ADD:
- case ISD::VECREDUCE_SMAX:
- case ISD::VECREDUCE_SMIN:
- case ISD::VECREDUCE_UMAX:
- case ISD::VECREDUCE_UMIN:
- Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
- return;
- case ISD::VECTOR_COMPRESS:
- if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
- Results.push_back(Res);
- return;
- case ISD::ADD:
- case ISD::FADD:
- ReplaceAddWithADDP(N, Results, DAG, Subtarget);
- return;
-
- case ISD::CTPOP:
- case ISD::PARITY:
- if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
- Results.push_back(Result);
- return;
- case AArch64ISD::SADDV:
- ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
- return;
- case AArch64ISD::UADDV:
- ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
- return;
- case AArch64ISD::SMINV:
- ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
- return;
- case AArch64ISD::UMINV:
- ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
- return;
- case AArch64ISD::SMAXV:
- ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
- return;
- case AArch64ISD::UMAXV:
- ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
- return;
- case ISD::MULHS:
- if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
- Results.push_back(
- LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
- return;
- case ISD::MULHU:
- if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
- Results.push_back(
- LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
- return;
- case ISD::FP_TO_UINT:
- case ISD::FP_TO_SINT:
- case ISD::STRICT_FP_TO_SINT:
- case ISD::STRICT_FP_TO_UINT:
- assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
- // Let normal code take care of it by not adding anything to Results.
- return;
- case ISD::ATOMIC_CMP_SWAP:
- ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
- return;
- case ISD::ATOMIC_LOAD_CLR:
- assert(N->getValueType(0) != MVT::i128 &&
- "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
- break;
- case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_OR:
- case ISD::ATOMIC_SWAP: {
- assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
- "Expected 128-bit atomicrmw.");
- // These need custom type legalisation so we go directly to instruction.
- ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
- return;
- }
- case ISD::ADDRSPACECAST: {
- SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
- Results.push_back(V);
- return;
- }
- case ISD::ATOMIC_LOAD:
- case ISD::LOAD: {
- MemSDNode *LoadNode = cast<MemSDNode>(N);
- EVT MemVT = LoadNode->getMemoryVT();
- // Handle lowering 256-bit non-temporal loads into LDNP for little-endian
- // targets.
- if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
- MemVT.getSizeInBits() == 256u &&
- (MemVT.getScalarSizeInBits() == 8u ||
- MemVT.getScalarSizeInBits() == 16u ||
- MemVT.getScalarSizeInBits() == 32u ||
- MemVT.getScalarSizeInBits() == 64u)) {
-
- EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
- SDValue Result = DAG.getMemIntrinsicNode(
- AArch64ISD::LDNP, SDLoc(N),
- DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
- {LoadNode->getChain(), LoadNode->getBasePtr()},
- LoadNode->getMemoryVT(), LoadNode->getMemOperand());
-
- SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
- DAG.getBitcast(HalfVT, Result.getValue(0)),
- DAG.getBitcast(HalfVT, Result.getValue(1)));
- Results.append({Pair, Result.getValue(2) /* Chain */});
- return;
- }
-
- if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
- LoadNode->getMemoryVT() != MVT::i128) {
- // Loads that are neither volatile nor atomic are optimized later in
- // AArch64's load/store optimizer.
- return;
- }
-
- if (SDValue(N, 0).getValueType() == MVT::i128) {
- auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
- bool isLoadAcquire =
- AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
- unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
-
- if (isLoadAcquire)
- assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
-
- SDValue Result = DAG.getMemIntrinsicNode(
- Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
- {LoadNode->getChain(), LoadNode->getBasePtr()},
- LoadNode->getMemoryVT(), LoadNode->getMemOperand());
-
- unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
-
- SDValue Pair =
- DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
- Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
- Results.append({Pair, Result.getValue(2) /* Chain */});
- }
- return;
- }
- case ISD::EXTRACT_SUBVECTOR:
- ReplaceExtractSubVectorResults(N, Results, DAG);
- return;
- case ISD::INSERT_SUBVECTOR:
- case ISD::CONCAT_VECTORS:
- // Custom lowering has been requested for INSERT_SUBVECTOR and
- // CONCAT_VECTORS -- but delegate to common code for result type
- // legalisation
- return;
- case ISD::GET_ACTIVE_LANE_MASK:
- ReplaceGetActiveLaneMaskResults(N, Results, DAG);
- return;
- case ISD::INTRINSIC_WO_CHAIN: {
- EVT VT = N->getValueType(0);
-
- Intrinsic::ID IntID =
- static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
- switch (IntID) {
- default:
- return;
- case Intrinsic::aarch64_sve_clasta_n: {
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "custom lowering for unexpected type");
- SDLoc DL(N);
- auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
- auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
- N->getOperand(1), Op2, N->getOperand(3));
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
- return;
- }
- case Intrinsic::aarch64_sve_clastb_n: {
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "custom lowering for unexpected type");
- SDLoc DL(N);
- auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
- auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
- N->getOperand(1), Op2, N->getOperand(3));
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
- return;
- }
- case Intrinsic::aarch64_sve_lasta: {
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "custom lowering for unexpected type");
- SDLoc DL(N);
- auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
- N->getOperand(1), N->getOperand(2));
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
- return;
- }
- case Intrinsic::aarch64_sve_lastb: {
- assert((VT == MVT::i8 || VT == MVT::i16) &&
- "custom lowering for unexpected type");
- SDLoc DL(N);
- auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
- N->getOperand(1), N->getOperand(2));
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
- return;
- }
- case Intrinsic::aarch64_sme_in_streaming_mode: {
- SDLoc DL(N);
- SDValue Chain = DAG.getEntryNode();
- SDValue RuntimePStateSM =
- getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
- Results.push_back(
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
- return;
- }
- case Intrinsic::experimental_vector_match: {
- if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
- return;
-
- // NOTE: Only trivial type promotion is supported.
- EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
- if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
- return;
-
- SDLoc DL(N);
- auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
- return;
- }
- }
- }
- case ISD::READ_REGISTER: {
- SDLoc DL(N);
- assert(N->getValueType(0) == MVT::i128 &&
- "READ_REGISTER custom lowering is only for 128-bit sysregs");
- SDValue Chain = N->getOperand(0);
- SDValue SysRegName = N->getOperand(1);
-
- SDValue Result = DAG.getNode(
- AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
- Chain, SysRegName);
-
- // Sysregs are not endian. Result.getValue(0) always contains the lower half
- // of the 128-bit System Register value.
- SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
- Result.getValue(0), Result.getValue(1));
- Results.push_back(Pair);
- Results.push_back(Result.getValue(2)); // Chain
- return;
- }
- }
-}
-
-bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const {
- if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
- return TargetLowering::useLoadStackGuardNode(M);
- return true;
-}
-
-unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
- // Combine multiple FDIVs with the same divisor into multiple FMULs by the
- // reciprocal if there are three or more FDIVs.
- return 3;
-}
-
-TargetLoweringBase::LegalizeTypeAction
-AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
- // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
- // v4i16, v2i32 instead of to promote.
- if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
- VT == MVT::v1f32)
- return TypeWidenVector;
-
- return TargetLoweringBase::getPreferredVectorAction(VT);
-}
-
-// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
-// provided the address is 16-byte aligned.
-bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
- if (!Subtarget->hasLSE2())
- return false;
-
- if (auto LI = dyn_cast<LoadInst>(I))
- return LI->getType()->getPrimitiveSizeInBits() == 128 &&
- LI->getAlign() >= Align(16);
-
- if (auto SI = dyn_cast<StoreInst>(I))
- return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
- SI->getAlign() >= Align(16);
-
- return false;
-}
-
-bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
- if (!Subtarget->hasLSE128())
- return false;
-
- // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
- // will clobber its two data registers.
- if (const auto *SI = dyn_cast<StoreInst>(I))
- return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
- SI->getAlign() >= Align(16) &&
- (SI->getOrdering() == AtomicOrdering::Release ||
- SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
-
- if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
- return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
- RMW->getAlign() >= Align(16) &&
- (RMW->getOperation() == AtomicRMWInst::Xchg ||
- RMW->getOperation() == AtomicRMWInst::And ||
- RMW->getOperation() == AtomicRMWInst::Or);
-
- return false;
-}
-
-bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
- if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
- return false;
-
- if (auto LI = dyn_cast<LoadInst>(I))
- return LI->getType()->getPrimitiveSizeInBits() == 128 &&
- LI->getAlign() >= Align(16) &&
- LI->getOrdering() == AtomicOrdering::Acquire;
-
- if (auto SI = dyn_cast<StoreInst>(I))
- return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
- SI->getAlign() >= Align(16) &&
- SI->getOrdering() == AtomicOrdering::Release;
-
- return false;
-}
-
-bool AArch64TargetLowering::shouldInsertFencesForAtomic(
- const Instruction *I) const {
- if (isOpSuitableForRCPC3(I))
- return false;
- if (isOpSuitableForLSE128(I))
- return false;
- if (isOpSuitableForLDPSTP(I))
- return true;
- return false;
-}
-
-bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
- const Instruction *I) const {
- // Store-Release instructions only provide seq_cst guarantees when paired with
- // Load-Acquire instructions. MSVC CRT does not use these instructions to
- // implement seq_cst loads and stores, so we need additional explicit fences
- // after memory writes.
- if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return false;
-
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::AtomicCmpXchg:
- return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
- AtomicOrdering::SequentiallyConsistent;
- case Instruction::AtomicRMW:
- return cast<AtomicRMWInst>(I)->getOrdering() ==
- AtomicOrdering::SequentiallyConsistent;
- case Instruction::Store:
- return cast<StoreInst>(I)->getOrdering() ==
- AtomicOrdering::SequentiallyConsistent;
- }
-}
-
-// Loads and stores less than 128-bits are already atomic; ones above that
-// are doomed anyway, so defer to the default libcall and blame the OS when
-// things go wrong.
-TargetLoweringBase::AtomicExpansionKind
-AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
- unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
- if (Size != 128)
- return AtomicExpansionKind::None;
- if (isOpSuitableForRCPC3(SI))
- return AtomicExpansionKind::None;
- if (isOpSuitableForLSE128(SI))
- return AtomicExpansionKind::Expand;
- if (isOpSuitableForLDPSTP(SI))
- return AtomicExpansionKind::None;
- return AtomicExpansionKind::Expand;
-}
-
-// Loads and stores less than 128-bits are already atomic; ones above that
-// are doomed anyway, so defer to the default libcall and blame the OS when
-// things go wrong.
-TargetLowering::AtomicExpansionKind
-AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
- unsigned Size = LI->getType()->getPrimitiveSizeInBits();
-
- if (Size != 128)
- return AtomicExpansionKind::None;
- if (isOpSuitableForRCPC3(LI))
- return AtomicExpansionKind::None;
- // No LSE128 loads
- if (isOpSuitableForLDPSTP(LI))
- return AtomicExpansionKind::None;
-
- // At -O0, fast-regalloc cannot cope with the live vregs necessary to
- // implement atomicrmw without spilling. If the target address is also on the
- // stack and close enough to the spill slot, this can lead to a situation
- // where the monitor always gets cleared and the atomic operation can never
- // succeed. So at -O0 lower this operation to a CAS loop.
- if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
- return AtomicExpansionKind::CmpXChg;
-
- // Using CAS for an atomic load has a better chance of succeeding under high
- // contention situations. So use it if available.
- return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
- : AtomicExpansionKind::LLSC;
-}
-
-// Return true if the atomic operation expansion will lower to use a library
-// call, and is thus ineligible to use an LLSC expansion.
-static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
- const AtomicRMWInst *RMW) {
- if (!RMW->isFloatingPointOperation())
- return false;
- switch (RMW->getType()->getScalarType()->getTypeID()) {
- case Type::FloatTyID:
- case Type::DoubleTyID:
- case Type::HalfTyID:
- case Type::BFloatTyID:
- // Will use soft float
- return !Subtarget.hasFPARMv8();
- default:
- // fp128 will emit library calls.
- return true;
- }
-
- llvm_unreachable("covered type switch");
-}
-
-// The "default" for integer RMW operations is to expand to an LL/SC loop.
-// However, with the LSE instructions (or outline-atomics mode, which provides
-// library routines in place of the LSE-instructions), we can directly emit many
-// operations instead.
-TargetLowering::AtomicExpansionKind
-AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
- Type *Ty = AI->getType();
- unsigned Size = Ty->getPrimitiveSizeInBits();
- assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
-
- bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
- (AI->getOperation() == AtomicRMWInst::Xchg ||
- AI->getOperation() == AtomicRMWInst::Or ||
- AI->getOperation() == AtomicRMWInst::And);
- if (CanUseLSE128)
- return AtomicExpansionKind::None;
-
- // If LSFE is available, use atomic FP instructions in preference to expansion
- if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
- AI->getOperation() == AtomicRMWInst::FMax ||
- AI->getOperation() == AtomicRMWInst::FMin ||
- AI->getOperation() == AtomicRMWInst::FMaximum ||
- AI->getOperation() == AtomicRMWInst::FMinimum))
- return AtomicExpansionKind::None;
-
- // Nand is not supported in LSE.
- // Leave 128 bits to LLSC or CmpXChg.
- if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
- !AI->isFloatingPointOperation()) {
- if (Subtarget->hasLSE())
- return AtomicExpansionKind::None;
- if (Subtarget->outlineAtomics()) {
- // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
- // Don't outline them unless
- // (1) high level <atomic> support approved:
- // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
- // (2) low level libgcc and compiler-rt support implemented by:
- // min/max outline atomics helpers
- if (AI->getOperation() != AtomicRMWInst::Min &&
- AI->getOperation() != AtomicRMWInst::Max &&
- AI->getOperation() != AtomicRMWInst::UMin &&
- AI->getOperation() != AtomicRMWInst::UMax) {
- return AtomicExpansionKind::None;
- }
- }
- }
-
- // At -O0, fast-regalloc cannot cope with the live vregs necessary to
- // implement atomicrmw without spilling. If the target address is also on the
- // stack and close enough to the spill slot, this can lead to a situation
- // where the monitor always gets cleared and the atomic operation can never
- // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
- // we have a single CAS instruction that can replace the loop.
- if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
- Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
- return AtomicExpansionKind::CmpXChg;
-
- return AtomicExpansionKind::LLSC;
-}
-
-TargetLowering::AtomicExpansionKind
-AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
- AtomicCmpXchgInst *AI) const {
- // If subtarget has LSE, leave cmpxchg intact for codegen.
- if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
- return AtomicExpansionKind::None;
- // At -O0, fast-regalloc cannot cope with the live vregs necessary to
- // implement cmpxchg without spilling. If the address being exchanged is also
- // on the stack and close enough to the spill slot, this can lead to a
- // situation where the monitor always gets cleared and the atomic operation
- // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
- if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
- return AtomicExpansionKind::None;
-
- // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
- // it.
- unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
- if (Size > 64)
- return AtomicExpansionKind::None;
-
- return AtomicExpansionKind::LLSC;
-}
-
-Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
- Type *ValueTy, Value *Addr,
- AtomicOrdering Ord) const {
- Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsAcquire = isAcquireOrStronger(Ord);
-
- // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
- // intrinsic must return {i64, i64} and we have to recombine them into a
- // single i128 here.
- if (ValueTy->getPrimitiveSizeInBits() == 128) {
- Intrinsic::ID Int =
- IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
-
- Value *LoHi =
- Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
-
- Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
- Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
-
- auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
- Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
- Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
-
- Value *Or = Builder.CreateOr(
- Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
- return Builder.CreateBitCast(Or, ValueTy);
- }
-
- Type *Tys[] = { Addr->getType() };
- Intrinsic::ID Int =
- IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
-
- const DataLayout &DL = M->getDataLayout();
- IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
- CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
- CI->addParamAttr(0, Attribute::get(Builder.getContext(),
- Attribute::ElementType, IntEltTy));
- Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
-
- return Builder.CreateBitCast(Trunc, ValueTy);
-}
-
-void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
- IRBuilderBase &Builder) const {
- Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
-}
-
-Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
- Value *Val, Value *Addr,
- AtomicOrdering Ord) const {
- Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- bool IsRelease = isReleaseOrStronger(Ord);
-
- // Since the intrinsics must have legal type, the i128 intrinsics take two
- // parameters: "i64, i64". We must marshal Val into the appropriate form
- // before the call.
- if (Val->getType()->getPrimitiveSizeInBits() == 128) {
- Intrinsic::ID Int =
- IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
- Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int);
- Type *Int64Ty = Type::getInt64Ty(M->getContext());
- Type *Int128Ty = Type::getInt128Ty(M->getContext());
-
- Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
-
- Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
- Value *Hi =
- Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
- return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
- }
-
- Intrinsic::ID Int =
- IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
- Type *Tys[] = { Addr->getType() };
- Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
-
- const DataLayout &DL = M->getDataLayout();
- IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
- Val = Builder.CreateBitCast(Val, IntValTy);
-
- CallInst *CI = Builder.CreateCall(
- Stxr, {Builder.CreateZExtOrBitCast(
- Val, Stxr->getFunctionType()->getParamType(0)),
- Addr});
- CI->addParamAttr(1, Attribute::get(Builder.getContext(),
- Attribute::ElementType, Val->getType()));
- return CI;
-}
-
-bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
- Type *Ty, CallingConv::ID CallConv, bool isVarArg,
- const DataLayout &DL) const {
- if (!Ty->isArrayTy()) {
- const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
- return TySize.isScalable() && TySize.getKnownMinValue() > 128;
- }
-
- // All non-aggregate members of the type must have the same type.
- SmallVector<EVT> ValueVTs;
- ComputeValueVTs(*this, DL, Ty, ValueVTs);
- return all_equal(ValueVTs);
-}
-
-bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
- EVT) const {
- return false;
-}
-
-static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
- M, Intrinsic::thread_pointer, IRB.getPtrTy());
- return IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
- Offset),
- IRB.getPtrTy(0));
-}
-
-Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
- // Android provides a fixed TLS slot for the stack cookie. See the definition
- // of TLS_SLOT_STACK_GUARD in
- // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
- if (Subtarget->isTargetAndroid())
- return UseTlsOffset(IRB, 0x28);
-
- // Fuchsia is similar.
- // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
- if (Subtarget->isTargetFuchsia())
- return UseTlsOffset(IRB, -0x10);
-
- return TargetLowering::getIRStackGuard(IRB);
-}
-
-void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
- // MSVC CRT provides functionalities for stack protection.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
- // MSVC CRT has a global variable holding security cookie.
- M.getOrInsertGlobal("__security_cookie",
- PointerType::getUnqual(M.getContext()));
-
- // MSVC CRT has a function to validate security cookie.
- FunctionCallee SecurityCheckCookie =
- M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
- Type::getVoidTy(M.getContext()),
- PointerType::getUnqual(M.getContext()));
- if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
- F->setCallingConv(CallingConv::Win64);
- F->addParamAttr(0, Attribute::AttrKind::InReg);
- }
- return;
- }
- TargetLowering::insertSSPDeclarations(M);
-}
-
-Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
- // MSVC CRT has a global variable holding security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return M.getGlobalVariable("__security_cookie");
- return TargetLowering::getSDagStackGuard(M);
-}
-
-Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
- // MSVC CRT has a function to validate security cookie.
- if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
- return M.getFunction(Subtarget->getSecurityCheckCookieName());
- return TargetLowering::getSSPStackGuardCheck(M);
-}
-
-Value *
-AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
- // Android provides a fixed TLS slot for the SafeStack pointer. See the
- // definition of TLS_SLOT_SAFESTACK in
- // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
- if (Subtarget->isTargetAndroid())
- return UseTlsOffset(IRB, 0x48);
-
- // Fuchsia is similar.
- // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
- if (Subtarget->isTargetFuchsia())
- return UseTlsOffset(IRB, -0x8);
-
- return TargetLowering::getSafeStackPointerLocation(IRB);
-}
-
-/// If a physical register, this returns the register that receives the
-/// exception address on entry to an EH pad.
-Register AArch64TargetLowering::getExceptionPointerRegister(
- const Constant *PersonalityFn) const {
- // FIXME: This is a guess. Has this been defined yet?
- return AArch64::X0;
-}
-
-/// If a physical register, this returns the register that receives the
-/// exception typeid on entry to a landing pad.
-Register AArch64TargetLowering::getExceptionSelectorRegister(
- const Constant *PersonalityFn) const {
- // FIXME: This is a guess. Has this been defined yet?
- return AArch64::X1;
-}
-
-bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
- const Instruction &AndI) const {
- // Only sink 'and' mask to cmp use block if it is masking a single bit, since
- // this is likely to fold the and/cmp/br into a single tbz instruction. It
- // may be beneficial to sink in other cases, but we would have to check that
- // the cmp would not get folded into the br to form a cbz for these to be
- // beneficial.
- ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
- if (!Mask)
- return false;
- return Mask->getValue().isPowerOf2();
-}
-
-bool AArch64TargetLowering::
- shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
- SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
- unsigned OldShiftOpcode, unsigned NewShiftOpcode,
- SelectionDAG &DAG) const {
- // Does baseline recommend not to perform the fold by default?
- if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
- X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
- return false;
- // Else, if this is a vector shift, prefer 'shl'.
- return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
-}
-
-TargetLowering::ShiftLegalizationStrategy
-AArch64TargetLowering::preferredShiftLegalizationStrategy(
- SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
- if (DAG.getMachineFunction().getFunction().hasMinSize() &&
- !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
- return ShiftLegalizationStrategy::LowerToLibcall;
- return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
- ExpansionFactor);
-}
-
-void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
- // Update IsSplitCSR in AArch64FunctionInfo.
- AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
- AFI->setIsSplitCSR(true);
-}
-
-void AArch64TargetLowering::insertCopiesSplitCSR(
- MachineBasicBlock *Entry,
- const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
- const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
- const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
- if (!IStart)
- return;
-
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
- MachineBasicBlock::iterator MBBI = Entry->begin();
- for (const MCPhysReg *I = IStart; *I; ++I) {
- const TargetRegisterClass *RC = nullptr;
- if (AArch64::GPR64RegClass.contains(*I))
- RC = &AArch64::GPR64RegClass;
- else if (AArch64::FPR64RegClass.contains(*I))
- RC = &AArch64::FPR64RegClass;
- else
- llvm_unreachable("Unexpected register class in CSRsViaCopy!");
-
- Register NewVR = MRI->createVirtualRegister(RC);
- // Create copy from CSR to a virtual register.
- // FIXME: this currently does not emit CFI pseudo-instructions, it works
- // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
- // nounwind. If we want to generalize this later, we may need to emit
- // CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
- Entry->addLiveIn(*I);
- BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
- .addReg(*I);
-
- // Insert the copy-back instructions right before the terminator.
- for (auto *Exit : Exits)
- BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
- TII->get(TargetOpcode::COPY), *I)
- .addReg(NewVR);
- }
-}
-
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
- // Integer division on AArch64 is expensive. However, when aggressively
- // optimizing for code size, we prefer to use a div instruction, as it is
- // usually smaller than the alternative sequence.
- // The exception to this is vector division. Since AArch64 doesn't have vector
- // integer division, leaving the division as-is is a loss even in terms of
- // size, because it will have to be scalarized, while the alternative code
- // sequence can be performed in vector form.
- bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
- return OptSize && !VT.isVector();
-}
-
-bool AArch64TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
- const MachineFunction &MF) const {
- // Avoid merging stores into fixed-length vectors when Neon is unavailable.
- // In future, we could allow this when SVE is available, but currently,
- // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
- // the general lowering may introduce stack spills/reloads).
- if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
- return false;
-
- // Do not merge to float value size (128 bits) if no implicit float attribute
- // is set.
- bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
- return !NoFloat || MemVT.getSizeInBits() <= 64;
-}
-
-bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
- // We want inc-of-add for scalars and sub-of-not for vectors.
- return VT.isScalarInteger();
-}
-
-bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
- EVT VT) const {
- // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
- // legalize.
- if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
- return false;
- if (FPVT == MVT::v8bf16)
- return false;
- return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
-}
-
-bool AArch64TargetLowering::shouldExpandCmpUsingSelects(EVT VT) const {
- // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
- // avoid vselect becoming bsl / unrolling.
- return !VT.isFixedLengthVector();
-}
-
-MachineInstr *
-AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
- MachineBasicBlock::instr_iterator &MBBI,
- const TargetInstrInfo *TII) const {
- assert(MBBI->isCall() && MBBI->getCFIType() &&
- "Invalid call instruction for a KCFI check");
-
- switch (MBBI->getOpcode()) {
- case AArch64::BLR:
- case AArch64::BLRNoIP:
- case AArch64::TCRETURNri:
- case AArch64::TCRETURNrix16x17:
- case AArch64::TCRETURNrix17:
- case AArch64::TCRETURNrinotx16:
- break;
- default:
- llvm_unreachable("Unexpected CFI call opcode");
- }
-
- MachineOperand &Target = MBBI->getOperand(0);
- assert(Target.isReg() && "Invalid target operand for an indirect call");
- Target.setIsRenamable(false);
-
- return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
- .addReg(Target.getReg())
- .addImm(MBBI->getCFIType())
- .getInstr();
-}
-
-bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
- return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
-}
-
-unsigned
-AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
- if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
- return getPointerTy(DL).getSizeInBits();
-
- return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
-}
-
-void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
- MachineFrameInfo &MFI = MF.getFrameInfo();
- // If we have any vulnerable SVE stack objects then the stack protector
- // needs to be placed at the top of the SVE stack area, as the SVE locals
- // are placed above the other locals, so we allocate it as if it were a
- // scalable vector.
- // FIXME: It may be worthwhile having a specific interface for this rather
- // than doing it here in finalizeLowering.
- if (MFI.hasStackProtectorIndex()) {
- for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
- if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
- MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
- MFI.setStackID(MFI.getStackProtectorIndex(),
- TargetStackID::ScalableVector);
- MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
- break;
- }
- }
- }
- MFI.computeMaxCallFrameSize(MF);
- TargetLoweringBase::finalizeLowering(MF);
-}
-
-// Unlike X86, we let frame lowering assign offsets to all catch objects.
-bool AArch64TargetLowering::needsFixedCatchObjects() const { return true; }
-
-bool AArch64TargetLowering::shouldLocalize(
- const MachineInstr &MI, const TargetTransformInfo *TTI) const {
- auto &MF = *MI.getMF();
- auto &MRI = MF.getRegInfo();
- auto maxUses = [](unsigned RematCost) {
- // A cost of 1 means remats are basically free.
- if (RematCost == 1)
- return std::numeric_limits<unsigned>::max();
- if (RematCost == 2)
- return 2U;
-
- // Remat is too expensive, only sink if there's one user.
- if (RematCost > 2)
- return 1U;
- llvm_unreachable("Unexpected remat cost");
- };
-
- unsigned Opc = MI.getOpcode();
- switch (Opc) {
- case TargetOpcode::G_GLOBAL_VALUE: {
- // On Darwin, TLS global vars get selected into function calls, which
- // we don't want localized, as they can get moved into the middle of
- // another call sequence.
- const GlobalValue &GV = *MI.getOperand(1).getGlobal();
- if (GV.isThreadLocal() && Subtarget->isTargetMachO())
- return false;
- return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
- }
- case TargetOpcode::G_FCONSTANT:
- case TargetOpcode::G_CONSTANT: {
- const ConstantInt *CI;
- unsigned AdditionalCost = 0;
-
- if (Opc == TargetOpcode::G_CONSTANT)
- CI = MI.getOperand(1).getCImm();
- else {
- LLT Ty = MRI.getType(MI.getOperand(0).getReg());
- // We try to estimate cost of 32/64b fpimms, as they'll likely be
- // materialized as integers.
- if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
- break;
- auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
- bool OptForSize = MF.getFunction().hasOptSize();
- if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
- OptForSize))
- return true; // Constant should be cheap.
- CI =
- ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
- // FP materialization also costs an extra move, from gpr to fpr.
- AdditionalCost = 1;
- }
- APInt Imm = CI->getValue();
- InstructionCost Cost = TTI->getIntImmCost(
- Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
- assert(Cost.isValid() && "Expected a valid imm cost");
-
- unsigned RematCost = Cost.getValue();
- RematCost += AdditionalCost;
- Register Reg = MI.getOperand(0).getReg();
- unsigned MaxUses = maxUses(RematCost);
- // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
- if (MaxUses == std::numeric_limits<unsigned>::max())
- --MaxUses;
- return MRI.hasAtMostUserInstrs(Reg, MaxUses);
- }
- // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
- // localizable.
- case AArch64::ADRP:
- case AArch64::G_ADD_LOW:
- // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
- case TargetOpcode::G_PTR_ADD:
- return true;
- default:
- break;
- }
- return TargetLoweringBase::shouldLocalize(MI, TTI);
-}
-
-bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
- // Fallback for scalable vectors.
- // Note that if EnableSVEGISel is true, we allow scalable vector types for
- // all instructions, regardless of whether they are actually supported.
- if (!EnableSVEGISel) {
- if (Inst.getType()->isScalableTy()) {
- return true;
- }
-
- for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
- if (Inst.getOperand(i)->getType()->isScalableTy())
- return true;
-
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
- if (AI->getAllocatedType()->isScalableTy())
- return true;
- }
- }
-
- // Checks to allow the use of SME instructions
- if (auto *Base = dyn_cast<CallBase>(&Inst)) {
- auto CallAttrs = SMECallAttrs(*Base);
- if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
- CallAttrs.requiresPreservingZT0() ||
- CallAttrs.requiresPreservingAllZAState())
- return true;
- }
- return false;
-}
-
-// Return the largest legal scalable vector type that matches VT's element type.
-static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
- assert(VT.isFixedLengthVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- "Expected legal fixed length vector!");
- switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unexpected element type for SVE container");
- case MVT::i8:
- return EVT(MVT::nxv16i8);
- case MVT::i16:
- return EVT(MVT::nxv8i16);
- case MVT::i32:
- return EVT(MVT::nxv4i32);
- case MVT::i64:
- return EVT(MVT::nxv2i64);
- case MVT::bf16:
- return EVT(MVT::nxv8bf16);
- case MVT::f16:
- return EVT(MVT::nxv8f16);
- case MVT::f32:
- return EVT(MVT::nxv4f32);
- case MVT::f64:
- return EVT(MVT::nxv2f64);
- }
-}
-
-// Return a predicate with active lanes corresponding to the extent of VT.
-static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
- EVT VT) {
- assert(VT.isFixedLengthVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- "Expected legal fixed length vector!");
-
- std::optional<unsigned> PgPattern =
- getSVEPredPatternFromNumElements(VT.getVectorNumElements());
- assert(PgPattern && "Unexpected element count for SVE predicate");
-
- // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
- // AArch64SVEPredPattern::all, which can enable the use of unpredicated
- // variants of instructions when available.
- const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
- unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
- unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
- if (MaxSVESize && MinSVESize == MaxSVESize &&
- MaxSVESize == VT.getSizeInBits())
- PgPattern = AArch64SVEPredPattern::all;
-
- MVT MaskVT;
- switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unexpected element type for SVE predicate");
- case MVT::i8:
- MaskVT = MVT::nxv16i1;
- break;
- case MVT::i16:
- case MVT::f16:
- case MVT::bf16:
- MaskVT = MVT::nxv8i1;
- break;
- case MVT::i32:
- case MVT::f32:
- MaskVT = MVT::nxv4i1;
- break;
- case MVT::i64:
- case MVT::f64:
- MaskVT = MVT::nxv2i1;
- break;
- }
-
- return getPTrue(DAG, DL, MaskVT, *PgPattern);
-}
-
-static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
- EVT VT) {
- assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
- "Expected legal scalable vector!");
- auto PredTy = VT.changeVectorElementType(MVT::i1);
- return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
-}
-
-static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
- if (VT.isFixedLengthVector())
- return getPredicateForFixedLengthVector(DAG, DL, VT);
-
- return getPredicateForScalableVector(DAG, DL, VT);
-}
-
-// Grow V to consume an entire SVE register.
-static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
- assert(VT.isScalableVector() &&
- "Expected to convert into a scalable vector!");
- assert(V.getValueType().isFixedLengthVector() &&
- "Expected a fixed length vector operand!");
- SDLoc DL(V);
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
-}
-
-// Shrink V so it's just big enough to maintain a VT's worth of data.
-static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
- assert(VT.isFixedLengthVector() &&
- "Expected to convert into a fixed length vector!");
- assert(V.getValueType().isScalableVector() &&
- "Expected a scalable vector operand!");
- SDLoc DL(V);
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
-}
-
-// Convert all fixed length vector loads larger than NEON to masked_loads.
-SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto Load = cast<LoadSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT LoadVT = ContainerVT;
- EVT MemVT = Load->getMemoryVT();
-
- auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
-
- if (VT.isFloatingPoint()) {
- LoadVT = ContainerVT.changeTypeToInteger();
- MemVT = MemVT.changeTypeToInteger();
- }
-
- SDValue NewLoad = DAG.getMaskedLoad(
- LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
- DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
- Load->getAddressingMode(), Load->getExtensionType());
-
- SDValue Result = NewLoad;
- if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
- EVT ExtendVT = ContainerVT.changeVectorElementType(
- Load->getMemoryVT().getVectorElementType());
-
- Result = getSVESafeBitCast(ExtendVT, Result, DAG);
- Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, Result, DAG.getUNDEF(ContainerVT));
- } else if (VT.isFloatingPoint()) {
- Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
- }
-
- Result = convertFromScalableVector(DAG, VT, Result);
- SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
- return DAG.getMergeValues(MergedValues, DL);
-}
-
-static SDValue convertFixedMaskToScalableVector(SDValue Mask,
- SelectionDAG &DAG) {
- SDLoc DL(Mask);
- EVT InVT = Mask.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
-
- if (ISD::isBuildVectorAllOnes(Mask.getNode()))
- return Pg;
-
- bool InvertCond = false;
- if (isBitwiseNot(Mask)) {
- InvertCond = true;
- Mask = Mask.getOperand(0);
- }
-
- SDValue Op1, Op2;
- ISD::CondCode CC;
-
- // When Mask is the result of a SETCC, it's better to regenerate the compare.
- if (Mask.getOpcode() == ISD::SETCC) {
- Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
- Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
- CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
- } else {
- Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
- Op2 = DAG.getConstant(0, DL, ContainerVT);
- CC = ISD::SETNE;
- }
-
- if (InvertCond)
- CC = getSetCCInverse(CC, Op1.getValueType());
-
- return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
- {Pg, Op1, Op2, DAG.getCondCode(CC)});
-}
-
-// Convert all fixed length vector loads larger than NEON to masked_loads.
-SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto Load = cast<MaskedLoadSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- SDValue Mask = Load->getMask();
- // If this is an extending load and the mask type is not the same as
- // the load's type, then we have to extend the mask type.
- if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
- assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
- "Incorrect mask type");
- Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
- }
- Mask = convertFixedMaskToScalableVector(Mask, DAG);
-
- SDValue PassThru;
- bool IsPassThruZeroOrUndef = false;
-
- if (Load->getPassThru()->isUndef()) {
- PassThru = DAG.getUNDEF(ContainerVT);
- IsPassThruZeroOrUndef = true;
- } else {
- if (ContainerVT.isInteger())
- PassThru = DAG.getConstant(0, DL, ContainerVT);
- else
- PassThru = DAG.getConstantFP(0, DL, ContainerVT);
- if (isZerosVector(Load->getPassThru().getNode()))
- IsPassThruZeroOrUndef = true;
- }
-
- SDValue NewLoad = DAG.getMaskedLoad(
- ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
- Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
- Load->getAddressingMode(), Load->getExtensionType());
-
- SDValue Result = NewLoad;
- if (!IsPassThruZeroOrUndef) {
- SDValue OldPassThru =
- convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
- Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
- }
-
- Result = convertFromScalableVector(DAG, VT, Result);
- SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
- return DAG.getMergeValues(MergedValues, DL);
-}
-
-// Convert all fixed length vector stores larger than NEON to masked_stores.
-SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto Store = cast<StoreSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Store->getValue().getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT MemVT = Store->getMemoryVT();
-
- auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
- auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
-
- if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
- EVT TruncVT = ContainerVT.changeVectorElementType(
- Store->getMemoryVT().getVectorElementType());
- MemVT = MemVT.changeTypeToInteger();
- NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
- NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
- DAG.getUNDEF(TruncVT));
- NewValue =
- getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
- } else if (VT.isFloatingPoint()) {
- MemVT = MemVT.changeTypeToInteger();
- NewValue =
- getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
- }
-
- return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
- Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
- Store->getMemOperand(), Store->getAddressingMode(),
- Store->isTruncatingStore());
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- auto *Store = cast<MaskedStoreSDNode>(Op);
-
- SDLoc DL(Op);
- EVT VT = Store->getValue().getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
- SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
-
- return DAG.getMaskedStore(
- Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
- Mask, Store->getMemoryVT(), Store->getMemOperand(),
- Store->getAddressingMode(), Store->isTruncatingStore());
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT VT = Op.getValueType();
- EVT EltVT = VT.getVectorElementType();
-
- bool Signed = Op.getOpcode() == ISD::SDIV;
- unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
-
- bool Negated;
- uint64_t SplatVal;
- if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
- SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
-
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
- SDValue Res =
- DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
- if (Negated)
- Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
- DAG.getConstant(0, DL, ContainerVT), Res);
-
- return convertFromScalableVector(DAG, VT, Res);
- }
-
- // Scalable vector i32/i64 DIV is supported.
- if (EltVT == MVT::i32 || EltVT == MVT::i64)
- return LowerToPredicatedOp(Op, DAG, PredOpcode);
-
- // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
- EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
- EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
- unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-
- // If the wider type is legal: extend, op, and truncate.
- EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
- if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
- SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
- SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
- SDValue Div = DAG.getNode(Op.getOpcode(), DL, WideVT, Op0, Op1);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
- }
-
- auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
- &ExtendOpcode](SDValue Op) {
- SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
- SDValue IdxHalf =
- DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
- return std::pair<SDValue, SDValue>(
- {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
- DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
- };
-
- // If wider type is not legal: split, extend, op, trunc and concat.
- auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
- auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
- SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
- SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
- SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
- SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
- Val = convertToScalableVector(DAG, ContainerVT, Val);
-
- bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
- unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
-
- // Repeatedly unpack Val until the result is of the desired element type.
- switch (ContainerVT.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unimplemented container type");
- case MVT::nxv16i8:
- Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
- if (VT.getVectorElementType() == MVT::i16)
- break;
- [[fallthrough]];
- case MVT::nxv8i16:
- Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
- if (VT.getVectorElementType() == MVT::i32)
- break;
- [[fallthrough]];
- case MVT::nxv4i32:
- Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
- assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
- break;
- }
-
- return convertFromScalableVector(DAG, VT, Val);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
- Val = convertToScalableVector(DAG, ContainerVT, Val);
-
- // Repeatedly truncate Val until the result is of the desired element type.
- switch (ContainerVT.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("unimplemented container type");
- case MVT::nxv2i64:
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
- Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
- if (VT.getVectorElementType() == MVT::i32)
- break;
- [[fallthrough]];
- case MVT::nxv4i32:
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
- Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
- if (VT.getVectorElementType() == MVT::i16)
- break;
- [[fallthrough]];
- case MVT::nxv8i16:
- Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
- Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
- assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
- break;
- }
-
- return convertFromScalableVector(DAG, VT, Val);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- EVT InVT = Op.getOperand(0).getValueType();
- assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
-
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- EVT InVT = Op.getOperand(0).getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
-
- auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
- Op.getOperand(1), Op.getOperand(2));
-
- return convertFromScalableVector(DAG, VT, ScalableRes);
-}
-
-// Convert vector operation 'Op' to an equivalent predicated operation whereby
-// the original operation's type is used to construct a suitable predicate.
-// NOTE: The results for inactive lanes are undefined.
-SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
- SelectionDAG &DAG,
- unsigned NewOp) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
- auto Pg = getPredicateForVector(DAG, DL, VT);
-
- if (VT.isFixedLengthVector()) {
- assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- // Create list of operands by converting existing ones to scalable types.
- SmallVector<SDValue, 4> Operands = {Pg};
- for (const SDValue &V : Op->op_values()) {
- if (isa<CondCodeSDNode>(V)) {
- Operands.push_back(V);
- continue;
- }
-
- if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
- EVT VTArg = VTNode->getVT().getVectorElementType();
- EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
- Operands.push_back(DAG.getValueType(NewVTArg));
- continue;
- }
-
- assert(isTypeLegal(V.getValueType()) &&
- "Expected only legal fixed-width types");
- Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
- }
-
- if (isMergePassthruOpcode(NewOp))
- Operands.push_back(DAG.getUNDEF(ContainerVT));
-
- auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
- return convertFromScalableVector(DAG, VT, ScalableRes);
- }
-
- assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
-
- SmallVector<SDValue, 4> Operands = {Pg};
- for (const SDValue &V : Op->op_values()) {
- assert((!V.getValueType().isVector() ||
- V.getValueType().isScalableVector()) &&
- "Only scalable vectors are supported!");
- Operands.push_back(V);
- }
-
- if (isMergePassthruOpcode(NewOp))
- Operands.push_back(DAG.getUNDEF(VT));
-
- return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
-}
-
-// If a fixed length vector operation has no side effects when applied to
-// undefined elements, we can safely use scalable vectors to perform the same
-// operation without needing to worry about predication.
-SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
- "Only expected to lower fixed length vector operation!");
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- // Create list of operands by converting existing ones to scalable types.
- SmallVector<SDValue, 4> Ops;
- for (const SDValue &V : Op->op_values()) {
- assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
-
- // Pass through non-vector operands.
- if (!V.getValueType().isVector()) {
- Ops.push_back(V);
- continue;
- }
-
- // "cast" fixed length vector to a scalable vector.
- assert(V.getValueType().isFixedLengthVector() &&
- isTypeLegal(V.getValueType()) &&
- "Only fixed length vectors are supported!");
- Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
- }
-
- auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
- return convertFromScalableVector(DAG, VT, ScalableRes);
-}
-
-SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
- SelectionDAG &DAG) const {
- SDLoc DL(ScalarOp);
- SDValue AccOp = ScalarOp.getOperand(0);
- SDValue VecOp = ScalarOp.getOperand(1);
- EVT SrcVT = VecOp.getValueType();
- EVT ResVT = SrcVT.getVectorElementType();
-
- EVT ContainerVT = SrcVT;
- if (SrcVT.isFixedLengthVector()) {
- ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
- VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
- }
-
- SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
-
- // Convert operands to Scalable.
- AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
- DAG.getUNDEF(ContainerVT), AccOp, Zero);
-
- // Perform reduction.
- SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
- Pg, AccOp, VecOp);
-
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
-}
-
-SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
- SelectionDAG &DAG) const {
- SDLoc DL(ReduceOp);
- SDValue Op = ReduceOp.getOperand(0);
- EVT OpVT = Op.getValueType();
- EVT VT = ReduceOp.getValueType();
-
- if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
- return SDValue();
-
- SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
-
- switch (ReduceOp.getOpcode()) {
- default:
- return SDValue();
- case ISD::VECREDUCE_OR:
- if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
- // The predicate can be 'Op' because
- // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
- return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
- else
- return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
- case ISD::VECREDUCE_AND: {
- Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
- return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
- }
- case ISD::VECREDUCE_XOR: {
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
- if (OpVT == MVT::nxv1i1) {
- // Emulate a CNTP on .Q using .D and a different governing predicate.
- Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
- Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
- }
- SDValue Cntp =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
- return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
- }
- }
-
- return SDValue();
-}
-
-SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
- SDValue ScalarOp,
- SelectionDAG &DAG) const {
- SDLoc DL(ScalarOp);
- SDValue VecOp = ScalarOp.getOperand(0);
- EVT SrcVT = VecOp.getValueType();
-
- if (useSVEForFixedLengthVectorVT(
- SrcVT,
- /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
- VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
- }
-
- // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
- if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
- VecOp.getOpcode() == ISD::ZERO_EXTEND) {
- SDValue BoolVec = VecOp.getOperand(0);
- if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
- // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
- SDValue CntpOp = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
- DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
- BoolVec, BoolVec);
- return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
- }
- }
-
- // UADDV always returns an i64 result.
- EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
- SrcVT.getVectorElementType();
- EVT RdxVT = SrcVT;
- if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
- RdxVT = getPackedSVEVectorVT(ResVT);
-
- SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
- SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
- Rdx, DAG.getConstant(0, DL, MVT::i64));
-
- // The VEC_REDUCE nodes expect an element size result.
- if (ResVT != ScalarOp.getValueType())
- Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
-
- return Res;
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- SDLoc DL(Op);
-
- EVT InVT = Op.getOperand(1).getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
- SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
- SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
-
- // Convert the mask to a predicate (NOTE: We don't need to worry about
- // inactive lanes since VSELECT is safe when given undefined elements).
- EVT MaskVT = Op.getOperand(0).getValueType();
- EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
- auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
- Mask = DAG.getNode(ISD::TRUNCATE, DL,
- MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
-
- auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
- Mask, Op1, Op2);
-
- return convertFromScalableVector(DAG, VT, ScalableRes);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT InVT = Op.getOperand(0).getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
-
- assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
- "Only expected to lower fixed length vector operation!");
- assert(Op.getValueType() == InVT.changeTypeToInteger() &&
- "Expected integer result of the same bit length as the inputs!");
-
- auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
- auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
- auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
-
- EVT CmpVT = Pg.getValueType();
- auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
- {Pg, Op1, Op2, Op.getOperand(2)});
-
- EVT PromoteVT = ContainerVT.changeTypeToInteger();
- auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
- return convertFromScalableVector(DAG, Op.getValueType(), Promote);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- auto SrcOp = Op.getOperand(0);
- EVT VT = Op.getValueType();
- EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ContainerSrcVT =
- getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
-
- SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
- Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
- return convertFromScalableVector(DAG, VT, Op);
-}
-
-SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- SDLoc DL(Op);
- unsigned NumOperands = Op->getNumOperands();
-
- assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
- "Unexpected number of operands in CONCAT_VECTORS");
-
- auto SrcOp1 = Op.getOperand(0);
- auto SrcOp2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
- EVT SrcVT = SrcOp1.getValueType();
-
- // Match a splat of 128b segments that fit in a single register.
- if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- SDValue Splat =
- DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
- convertToScalableVector(DAG, ContainerVT, SrcOp1),
- DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
- return convertFromScalableVector(DAG, VT, Splat);
- }
-
- if (NumOperands > 2) {
- SmallVector<SDValue, 4> Ops;
- EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
- for (unsigned I = 0; I < NumOperands; I += 2)
- Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
- Op->getOperand(I), Op->getOperand(I + 1)));
-
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
- }
-
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
-
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
- SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
- SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
-
- Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
-
- return convertFromScalableVector(DAG, VT, Op);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- SDValue Pg = getPredicateForVector(DAG, DL, VT);
- EVT SrcVT = Val.getValueType();
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ExtendVT = ContainerVT.changeVectorElementType(
- SrcVT.getVectorElementType());
-
- Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
- Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
-
- Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
- Val = getSVESafeBitCast(ExtendVT, Val, DAG);
- Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, Val, DAG.getUNDEF(ContainerVT));
-
- return convertFromScalableVector(DAG, VT, Val);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT SrcVT = Val.getValueType();
- EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
- EVT RoundVT = ContainerSrcVT.changeVectorElementType(
- VT.getVectorElementType());
- SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
-
- Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
- Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
- Op.getOperand(1), DAG.getUNDEF(RoundVT));
- Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
- Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
-
- Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
- return DAG.getNode(ISD::BITCAST, DL, VT, Val);
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
- unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
- : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT SrcVT = Val.getValueType();
- EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
-
- if (VT.bitsGE(SrcVT)) {
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
-
- Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
- VT.changeTypeToInteger(), Val);
-
- // Safe to use a larger-than-specified operand because promoting the
- // value changes nothing from an arithmetic point of view.
- Val =
- convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
- Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
- DAG.getUNDEF(ContainerDstVT));
- return convertFromScalableVector(DAG, VT, Val);
- } else {
- EVT CvtVT = ContainerSrcVT.changeVectorElementType(
- ContainerDstVT.getVectorElementType());
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
-
- Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
- Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
- Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
- Val = convertFromScalableVector(DAG, SrcVT, Val);
-
- Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
- return DAG.getNode(ISD::BITCAST, DL, VT, Val);
- }
-}
-
-SDValue
-AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OpVT = Op.getValueType();
- assert(OpVT.isScalableVector() &&
- "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
-
- // Are multi-register uzp instructions available?
- if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
- OpVT.getVectorElementType() != MVT::i1) {
- Intrinsic::ID IntID;
- switch (Op->getNumOperands()) {
- default:
- return SDValue();
- case 2:
- IntID = Intrinsic::aarch64_sve_uzp_x2;
- break;
- case 4:
- if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
- OpVT.getScalarSizeInBits() == 64)
- return SDValue();
- IntID = Intrinsic::aarch64_sve_uzp_x4;
- break;
- }
-
- SmallVector<SDValue, 5> Ops;
- Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
- Ops.append(Op->op_values().begin(), Op->op_values().end());
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
- }
-
- if (Op->getNumOperands() != 2)
- return SDValue();
-
- SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getMergeValues({Even, Odd}, DL);
-}
-
-SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT OpVT = Op.getValueType();
- assert(OpVT.isScalableVector() &&
- "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
-
- // Are multi-register zip instructions available?
- if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
- OpVT.getVectorElementType() != MVT::i1) {
- Intrinsic::ID IntID;
- switch (Op->getNumOperands()) {
- default:
- return SDValue();
- case 2:
- IntID = Intrinsic::aarch64_sve_zip_x2;
- break;
- case 4:
- if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
- OpVT.getScalarSizeInBits() == 64)
- return SDValue();
- IntID = Intrinsic::aarch64_sve_zip_x4;
- break;
- }
-
- SmallVector<SDValue, 5> Ops;
- Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
- Ops.append(Op->op_values().begin(), Op->op_values().end());
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
- }
-
- if (Op->getNumOperands() != 2)
- return SDValue();
-
- SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
- Op.getOperand(1));
- return DAG.getMergeValues({Lo, Hi}, DL);
-}
-
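-/// Lower a masked histogram update by expanding it into a masked gather of the
-/// current bucket values, an SVE HISTCNT, a multiply by the increment, an add,
-/// and a masked scatter of the updated values.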
-SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
- SelectionDAG &DAG) const {
- // FIXME: Maybe share some code with LowerMGather/Scatter?
- MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
- SDLoc DL(HG);
- SDValue Chain = HG->getChain();
- SDValue Inc = HG->getInc();
- SDValue Mask = HG->getMask();
- SDValue Ptr = HG->getBasePtr();
- SDValue Index = HG->getIndex();
- SDValue Scale = HG->getScale();
- SDValue IntID = HG->getIntID();
-
- // The Intrinsic ID determines the type of update operation.
- [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
- // Right now, we only support 'add' as an update.
- assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
- "Unexpected histogram update operation");
-
- EVT IndexVT = Index.getValueType();
- LLVMContext &Ctx = *DAG.getContext();
- ElementCount EC = IndexVT.getVectorElementCount();
- EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
- EVT IncExtVT =
- EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
- EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
- bool ExtTrunc = IncSplatVT != MemVT;
-
- SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
- SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
- SDValue IncSplat = DAG.getSplatVector(
- IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
- SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
-
- MachineMemOperand *MMO = HG->getMemOperand();
- // Create an MMO for the gather, without load|store flags.
- MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
- MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
- MMO->getAlign(), MMO->getAAInfo());
- ISD::MemIndexType IndexType = HG->getIndexType();
- SDValue Gather = DAG.getMaskedGather(
- DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
- ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
-
- SDValue GChain = Gather.getValue(1);
-
- // Perform the histcnt, multiply by inc, add to bucket data.
- SDValue ID =
- DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
- SDValue HistCnt =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
- SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
- SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
-
- // Create an MMO for the scatter, without load|store flags.
- MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
- MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
- MMO->getAlign(), MMO->getAAInfo());
-
- SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
- SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
- ScatterOps, SMMO, IndexType, ExtTrunc);
- return Scatter;
-}
-
-/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
-/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
-/// however still make use of the dot product instruction by instead
-/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
-/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
-/// the following pattern is emitted:
-/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))),
-///     ext(EXTRACT_SUBVECTOR(N, NTy/2)))
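-/// For example, with an (nx)v2i64 accumulator and (nx)v16i8 inputs, an
-/// (nx)v4i32 dot product is formed first and its two halves are then widened
-/// to i64 and added into the accumulator.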
-SDValue
-AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
-
- SDValue Acc = Op.getOperand(0);
- SDValue LHS = Op.getOperand(1);
- SDValue RHS = Op.getOperand(2);
- EVT ResultVT = Op.getValueType();
- EVT OrigResultVT = ResultVT;
- EVT OpVT = LHS.getValueType();
-
- bool ConvertToScalable =
- ResultVT.isFixedLengthVector() &&
- useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
-
- if (ConvertToScalable) {
- ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
- OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
- Acc = convertToScalableVector(DAG, ResultVT, Acc);
- LHS = convertToScalableVector(DAG, OpVT, LHS);
- RHS = convertToScalableVector(DAG, OpVT, RHS);
- Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
- }
-
- // Two-way and four-way partial reductions are supported by patterns.
- // We only need to handle the 8-way partial reduction.
- if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
- return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
- : Op;
-
- EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
- SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
- DAG.getConstant(0, DL, DotVT), LHS, RHS);
-
- SDValue Res;
- bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
- if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
- unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
- unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
- SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
- Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
- } else {
- // Fold (nx)v4i32 into (nx)v2i64
- auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
- if (IsUnsigned) {
- DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
- DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
- } else {
- DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
- DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
- }
- auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
- Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
- }
-
- return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
- : Res;
-}
-
-SDValue
-AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- assert(Subtarget->isSVEorStreamingSVEAvailable() &&
- "Lowering fixed length get_active_lane_mask requires SVE!");
-
- // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
- // but we can use SVE when available.
-
- SDLoc DL(Op);
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
-
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
- Op.getOperand(0), Op.getOperand(1));
- SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
- DAG.getVectorIdxConstant(0, DL));
-}
-
-SDValue
-AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
- unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
- : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
-
- SDLoc DL(Op);
- SDValue Val = Op.getOperand(0);
- EVT SrcVT = Val.getValueType();
- EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
- EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
-
- if (VT.bitsGT(SrcVT)) {
- EVT CvtVT = ContainerDstVT.changeVectorElementType(
- ContainerSrcVT.getVectorElementType());
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
-
- Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
- Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
-
- Val = convertToScalableVector(DAG, ContainerDstVT, Val);
- Val = getSVESafeBitCast(CvtVT, Val, DAG);
- Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
- DAG.getUNDEF(ContainerDstVT));
- return convertFromScalableVector(DAG, VT, Val);
- } else {
- EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
- SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
-
- // Safe to use a larger than specified result since an fp_to_int where the
- // result doesn't fit into the destination is undefined.
- Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
- Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
- Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
-
- return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
- }
-}
-
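-/// Lower a fixed-length VECTOR_SHUFFLE to an SVE TBL (or SVE2 TBL2) intrinsic
-/// by materialising the shuffle mask as a vector of indices, padding any
-/// trailing lanes with out-of-range indices so they are zeroed.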
-static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
- ArrayRef<int> ShuffleMask, EVT VT,
- EVT ContainerVT, SelectionDAG &DAG) {
- auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
- SDLoc DL(Op);
- unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
- unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
- bool IsSingleOp =
- ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
-
- if (!Subtarget.isNeonAvailable() && !MinSVESize)
- MinSVESize = 128;
-
- // Bail out on two-operand shuffles when SVE2 is unavailable, or when not
- // all index values can be represented.
- if (!IsSingleOp && !Subtarget.hasSVE2())
- return SDValue();
-
- EVT VTOp1 = Op.getOperand(0).getValueType();
- unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
- unsigned IndexLen = MinSVESize / BitsPerElt;
- unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
- uint64_t MaxOffset = maxUIntN(BitsPerElt);
- EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
- EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
- bool MinMaxEqual = (MinSVESize == MaxSVESize);
- assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
- "Incorrectly legalised shuffle operation");
-
- SmallVector<SDValue, 8> TBLMask;
- // If MinSVESize is not equal to MaxSVESize then we need to know which
- // TBL mask element needs adjustment.
- SmallVector<SDValue, 8> AddRuntimeVLMask;
-
- // Bail out for 8-bit element types, because with a 2048-bit SVE register
- // size, 8 bits is only sufficient to index into the first source vector.
- if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
- return SDValue();
-
- for (int Index : ShuffleMask) {
- // Handle a poison index value.
- if (Index < 0)
- Index = 0;
- // If the mask refers to elements in the second operand, then we have to
- // offset the index by the number of elements in a vector. If this number
- // is not known at compile-time, we need to maintain a mask with 'VL' values
- // to add at runtime.
- if ((unsigned)Index >= ElementsPerVectorReg) {
- if (MinMaxEqual) {
- Index += IndexLen - ElementsPerVectorReg;
- } else {
- Index = Index - ElementsPerVectorReg;
- AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
- }
- } else if (!MinMaxEqual)
- AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
- // For 8-bit elements and 1024-bit SVE registers, where MaxOffset equals
- // 255, this might point to the last element of the second operand
- // of the shufflevector, so we reject this transform.
- if ((unsigned)Index >= MaxOffset)
- return SDValue();
- TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
- }
-
- // Choosing an out-of-range index leads to the lane being zeroed, whereas a
- // zero index value would instead duplicate the first lane for these
- // out-of-range elements. For i8 elements an out-of-range index could still
- // be a valid index for a 2048-bit vector register size.
- for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
- TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
- if (!MinMaxEqual)
- AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
- }
-
- EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
- SDValue VecMask =
- DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
- SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
-
- SDValue Shuffle;
- if (IsSingleOp)
- Shuffle =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
- DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
- Op1, SVEMask);
- else if (Subtarget.hasSVE2()) {
- if (!MinMaxEqual) {
- unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
- SDValue VScale = (BitsPerElt == 64)
- ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
- : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
- SDValue VecMask =
- DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
- SDValue MulByMask = DAG.getNode(
- ISD::MUL, DL, MaskType,
- DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
- DAG.getBuildVector(MaskType, DL,
- ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
- SDValue UpdatedVecMask =
- DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
- SVEMask = convertToScalableVector(
- DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
- }
- Shuffle =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
- DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
- Op1, Op2, SVEMask);
- }
- Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
- return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
-}
-
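-/// Lower a fixed-length VECTOR_SHUFFLE using SVE, preferring cheap forms such
-/// as SPLAT, INSR, REV, ZIP, UZP and TRN before falling back to a TBL-based
-/// expansion.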
-SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
- SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
-
- auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
- auto ShuffleMask = SVN->getMask();
-
- SDLoc DL(Op);
- SDValue Op1 = Op.getOperand(0);
- SDValue Op2 = Op.getOperand(1);
-
- EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
- Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
- Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
-
- auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
- if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
- return MVT::i32;
- return ScalarTy;
- };
-
- if (SVN->isSplat()) {
- unsigned Lane = std::max(0, SVN->getSplatIndex());
- EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
- SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
- DAG.getConstant(Lane, DL, MVT::i64));
- Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
- return convertFromScalableVector(DAG, VT, Op);
- }
-
- bool ReverseEXT = false;
- unsigned Imm;
- if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
- Imm == VT.getVectorNumElements() - 1) {
- if (ReverseEXT)
- std::swap(Op1, Op2);
- EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
- SDValue Scalar = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
- DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
- Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
- return convertFromScalableVector(DAG, VT, Op);
- }
-
- unsigned EltSize = VT.getScalarSizeInBits();
- for (unsigned BlockSize : {64U, 32U, 16U}) {
- if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
- unsigned RevOp;
- if (EltSize == 8)
- RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
- else if (EltSize == 16)
- RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
- else
- RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
- EVT BlockedVT =
- getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
- SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
- SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
- SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
- DAG.getUNDEF(BlockedVT));
- SDValue Container =
- DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
- return convertFromScalableVector(DAG, VT, Container);
- }
- }
-
- if (Subtarget->hasSVE2p1() && EltSize == 64 &&
- isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
- SDValue Pg = getPredicateForVector(DAG, DL, VT);
- SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
- Pg, Op1, DAG.getUNDEF(ContainerVT));
- return convertFromScalableVector(DAG, VT, Revd);
- }
-
- unsigned WhichResult;
- if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
- WhichResult == 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
-
- if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
- }
-
- if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
-
- if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
- }
-
- // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
- // represents the same logical operation as performed by a ZIP instruction. In
- // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
- // equivalent to an AArch64 instruction. There's the extra component of
- // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
- // only operated on 64/128bit vector types that have a direct mapping to a
- // target register and so an exact mapping is implied.
- // However, when using SVE for fixed length vectors, most legal vector types
- // are actually sub-vectors of a larger SVE register. When mapping
- // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
- // how the mask's indices translate. Specifically, when the mapping requires
- // an exact meaning for a specific vector index (e.g. Index X is the last
- // vector element in the register) then such mappings are often only safe when
- // the exact SVE register size is known. The main exception to this is when
- // indices are logically relative to the first element of either
- // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
- // when converting from fixed-length to scalable vector types (i.e. the start
- // of a fixed length vector is always the start of a scalable vector).
- unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
- unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
- if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
- if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
- Op2.isUndef()) {
- Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
- return convertFromScalableVector(DAG, VT, Op);
- }
-
- if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
- WhichResult != 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
-
- if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
- }
-
- if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
-
- if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
- unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
- return convertFromScalableVector(
- DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
- }
-
- if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
- Subtarget->isSVEorStreamingSVEAvailable()) {
- assert(VT.getFixedSizeInBits() % AArch64::SVEBitsPerBlock == 0 &&
- "Unsupported SVE vector size");
-
- unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
- unsigned SegmentElts = VT.getVectorNumElements() / Segments;
- if (std::optional<unsigned> Lane =
- isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
- SDValue IID =
- DAG.getConstant(Intrinsic::aarch64_sve_dup_laneq, DL, MVT::i64);
- return convertFromScalableVector(
- DAG, VT,
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
- {IID, Op1,
- DAG.getConstant(*Lane, DL, MVT::i64,
- /*isTarget=*/true)}));
- }
- }
- }
-
- // Try to widen the shuffle before generating a possibly expensive SVE TBL.
- // This may allow the shuffle to be matched as something cheaper like ZIP1.
- if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
- return WideOp;
-
- // Avoid producing a TBL instruction if we don't know the minimal SVE register
- // size, unless NEON is not available and we can assume the minimal SVE
- // register size is 128 bits.
- if (MinSVESize || !Subtarget->isNeonAvailable())
- return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
- DAG);
-
- return SDValue();
-}
-
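-/// Bitcast between legal scalable vector types. Unpacked types are routed via
-/// their packed equivalents, and on big-endian targets byte swaps are inserted
-/// so the result matches a cast performed through memory.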
-SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
- SelectionDAG &DAG) const {
- SDLoc DL(Op);
- EVT InVT = Op.getValueType();
-
- assert(VT.isScalableVector() && isTypeLegal(VT) &&
- InVT.isScalableVector() && isTypeLegal(InVT) &&
- "Only expect to cast between legal scalable vector types!");
- assert(VT.getVectorElementType() != MVT::i1 &&
- InVT.getVectorElementType() != MVT::i1 &&
- "For predicate bitcasts, use getSVEPredicateBitCast");
-
- if (InVT == VT)
- return Op;
-
- EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
- EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
-
- // Safe bitcasting between unpacked vector types of different element counts
- // is currently unsupported because the following is missing the necessary
- // work to ensure the result's elements live where they're supposed to within
- // an SVE register.
- // 01234567
- // e.g. nxv2i32 = XX??XX??
- // nxv4f16 = X?X?X?X?
- assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
- VT == PackedVT || InVT == PackedInVT) &&
- "Unexpected bitcast!");
-
- // Pack input if required.
- if (InVT != PackedInVT)
- Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
-
- if (Subtarget->isLittleEndian() ||
- PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
- Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
- else {
- EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
- EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
-
- // Simulate the effect of casting through memory.
- Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
- if (PackedInVTAsInt.getScalarSizeInBits() != 8)
- Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
- Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
- if (PackedVTAsInt.getScalarSizeInBits() != 8)
- Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
- Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
- }
-
- // Unpack result if required.
- if (VT != PackedVT)
- Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
-
- return Op;
-}
-
-bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
- SDValue N) const {
- return ::isAllActivePredicate(DAG, N);
-}
-
-EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
- return ::getPromotedVTForPredicate(VT);
-}
-
-bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
- SDValue Op, const APInt &OriginalDemandedBits,
- const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
- unsigned Depth) const {
-
- unsigned Opc = Op.getOpcode();
- switch (Opc) {
- case AArch64ISD::VSHL: {
- // Match (VSHL (VLSHR Val X) X)
- SDValue ShiftL = Op;
- SDValue ShiftR = Op->getOperand(0);
- if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
- return false;
-
- if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
- return false;
-
- unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
- unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
-
- // Other cases can be handled as well, but this is not
- // implemented.
- if (ShiftRBits != ShiftLBits)
- return false;
-
- unsigned ScalarSize = Op.getScalarValueSizeInBits();
- assert(ScalarSize > ShiftLBits && "Invalid shift imm");
-
- APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
- APInt UnusedBits = ~OriginalDemandedBits;
-
- if ((ZeroBits & UnusedBits) != ZeroBits)
- return false;
-
- // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
- // used - simplify to just Val.
- return TLO.CombineTo(Op, ShiftR->getOperand(0));
- }
- case AArch64ISD::BICi: {
- // Fold BICi if all destination bits are already known to be zeroed.
- SDValue Op0 = Op.getOperand(0);
- KnownBits KnownOp0 =
- TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
- // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
- APInt BitsToClear =
- (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
- .trunc(KnownOp0.getBitWidth());
- APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
- if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
- return TLO.CombineTo(Op, Op0);
-
- Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
- return false;
- }
- case ISD::INTRINSIC_WO_CHAIN: {
- if (auto ElementSize = IsSVECntIntrinsic(Op)) {
- unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
- if (!MaxSVEVectorSizeInBits)
- MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
- unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
- // The SVE count intrinsics don't support the multiplier immediate so we
- // don't have to account for that here. The value returned may be slightly
- // over the true required bits, as this is based on the "ALL" pattern. The
- // other patterns are also exposed by these intrinsics, but they all
- // return a value that's strictly less than "ALL".
- unsigned RequiredBits = llvm::bit_width(MaxElements);
- unsigned BitWidth = Known.Zero.getBitWidth();
- if (RequiredBits < BitWidth)
- Known.Zero.setHighBits(BitWidth - RequiredBits);
- return false;
- }
- }
- }
-
- return TargetLowering::SimplifyDemandedBitsForTargetNode(
- Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
-}
-
-bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
- return Op.getOpcode() == AArch64ISD::DUP ||
- Op.getOpcode() == AArch64ISD::MOVI ||
- Op.getOpcode() == AArch64ISD::MOVIshift ||
- (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
- TargetLowering::isTargetCanonicalConstantNode(Op);
-}
-
-bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
- return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
- Subtarget->hasComplxNum();
-}
-
-bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation Operation, Type *Ty) const {
- auto *VTy = dyn_cast<VectorType>(Ty);
- if (!VTy)
- return false;
-
- // If the vector is scalable, SVE is enabled, implying support for complex
- // numbers. Otherwise, we need to ensure complex number support is available.
- if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
- return false;
-
- auto *ScalarTy = VTy->getScalarType();
- unsigned NumElements = VTy->getElementCount().getKnownMinValue();
-
- // We can only process vectors that have a bit size of 128 or higher (with an
- // additional 64 bits for Neon). Additionally, these vectors must have a
- // power-of-2 size, as we later split them into the smallest supported size
- // and merge them back together after applying the complex operation.
- unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
- if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
- !llvm::isPowerOf2_32(VTyWidth))
- return false;
-
- if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
- unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
-
- if (Operation == ComplexDeinterleavingOperation::CDot)
- return ScalarWidth == 32 || ScalarWidth == 64;
- return 8 <= ScalarWidth && ScalarWidth <= 64;
- }
-
- // CDot is not supported outside of scalable/SVE contexts.
- if (Operation == ComplexDeinterleavingOperation::CDot)
- return false;
-
- return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
- ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
-}
-
-Value *AArch64TargetLowering::createComplexDeinterleavingIR(
- IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
- ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
- Value *Accumulator) const {
- VectorType *Ty = cast<VectorType>(InputA->getType());
- if (Accumulator == nullptr)
- Accumulator = Constant::getNullValue(Ty);
- bool IsScalable = Ty->isScalableTy();
- bool IsInt = Ty->getElementType()->isIntegerTy();
-
- unsigned TyWidth =
- Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
-
- assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
- "Vector type must be either 64 or a power of 2 that is at least 128");
-
- if (TyWidth > 128) {
- int Stride = Ty->getElementCount().getKnownMinValue() / 2;
- int AccStride = cast<VectorType>(Accumulator->getType())
- ->getElementCount()
- .getKnownMinValue() /
- 2;
- auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
- auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
- auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
- auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
- auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
- Value *LowerSplitAcc = nullptr;
- Value *UpperSplitAcc = nullptr;
- Type *FullTy = Accumulator->getType();
- auto *HalfAccTy = VectorType::getHalfElementsVectorType(
- cast<VectorType>(Accumulator->getType()));
- LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
- UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
- auto *LowerSplitInt = createComplexDeinterleavingIR(
- B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
- auto *UpperSplitInt = createComplexDeinterleavingIR(
- B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
-
- auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
- LowerSplitInt, uint64_t(0));
- return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
- }
-
- if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
- if (IsScalable) {
- if (IsInt)
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_cmla_x, Ty,
- {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
-
- auto *Mask = B.getAllOnesMask(Ty->getElementCount());
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_fcmla, Ty,
- {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
- }
-
- Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
- Intrinsic::aarch64_neon_vcmla_rot90,
- Intrinsic::aarch64_neon_vcmla_rot180,
- Intrinsic::aarch64_neon_vcmla_rot270};
-
- return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
- {Accumulator, InputA, InputB});
- }
-
- if (OperationType == ComplexDeinterleavingOperation::CAdd) {
- if (IsScalable) {
- if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
- Rotation == ComplexDeinterleavingRotation::Rotation_270) {
- if (IsInt)
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_cadd_x, Ty,
- {InputA, InputB, B.getInt32((int)Rotation * 90)});
-
- auto *Mask = B.getAllOnesMask(Ty->getElementCount());
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_fcadd, Ty,
- {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
- }
- return nullptr;
- }
-
- Intrinsic::ID IntId = Intrinsic::not_intrinsic;
- if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
- IntId = Intrinsic::aarch64_neon_vcadd_rot90;
- else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
- IntId = Intrinsic::aarch64_neon_vcadd_rot270;
-
- if (IntId == Intrinsic::not_intrinsic)
- return nullptr;
-
- return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
- }
-
- if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
- IsScalable) {
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
- {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
- }
-
- return nullptr;
-}
-
-bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
- unsigned Opc = N->getOpcode();
- if (ISD::isExtOpcode(Opc)) {
- if (any_of(N->users(),
- [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
- return false;
- }
- return true;
-}
-
-unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
- return Subtarget->getMinimumJumpTableEntries();
-}
-
-MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
- CallingConv::ID CC,
- EVT VT) const {
- bool NonUnitFixedLengthVector =
- VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
- if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
-
- EVT VT1;
- MVT RegisterVT;
- unsigned NumIntermediates;
- getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
- RegisterVT);
- return RegisterVT;
-}
-
-unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
- LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
- bool NonUnitFixedLengthVector =
- VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
- if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
- return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
-
- EVT VT1;
- MVT VT2;
- unsigned NumIntermediates;
- return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
- NumIntermediates, VT2);
-}
-
-unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
- LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
- unsigned &NumIntermediates, MVT &RegisterVT) const {
- int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
- Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
- if (!RegisterVT.isFixedLengthVector() ||
- RegisterVT.getFixedSizeInBits() <= 128)
- return NumRegs;
-
- assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
- assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
- assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
-
- // A size mismatch here implies either type promotion or widening and would
- // have resulted in scalarisation if larger vectors had not been available.
- if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
- EVT EltTy = VT.getVectorElementType();
- EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
- if (!isTypeLegal(NewVT))
- NewVT = EltTy;
-
- IntermediateVT = NewVT;
- NumIntermediates = VT.getVectorNumElements();
- RegisterVT = getRegisterType(Context, NewVT);
- return NumIntermediates;
- }
-
- // SVE VLS support does not introduce a new ABI so we should use NEON sized
- // types for vector arguments and returns.
-
- unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
- NumIntermediates *= NumSubRegs;
- NumRegs *= NumSubRegs;
-
- switch (RegisterVT.getVectorElementType().SimpleTy) {
- default:
- llvm_unreachable("unexpected element type for vector");
- case MVT::i8:
- IntermediateVT = RegisterVT = MVT::v16i8;
- break;
- case MVT::i16:
- IntermediateVT = RegisterVT = MVT::v8i16;
- break;
- case MVT::i32:
- IntermediateVT = RegisterVT = MVT::v4i32;
- break;
- case MVT::i64:
- IntermediateVT = RegisterVT = MVT::v2i64;
- break;
- case MVT::f16:
- IntermediateVT = RegisterVT = MVT::v8f16;
- break;
- case MVT::f32:
- IntermediateVT = RegisterVT = MVT::v4f32;
- break;
- case MVT::f64:
- IntermediateVT = RegisterVT = MVT::v2f64;
- break;
- case MVT::bf16:
- IntermediateVT = RegisterVT = MVT::v8bf16;
- break;
- }
-
- return NumRegs;
-}
-
-bool AArch64TargetLowering::hasInlineStackProbe(
- const MachineFunction &MF) const {
- return !Subtarget->isTargetWindows() &&
- MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
-}
-
-bool AArch64TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
- switch (Opc) {
- case ISD::TRUNCATE_SSAT_S:
- case ISD::TRUNCATE_SSAT_U:
- case ISD::TRUNCATE_USAT_U:
- if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
- return true;
- }
-
- return TargetLowering::isTypeDesirableForOp(Opc, VT);
-}
-
-bool AArch64TargetLowering::shouldPreservePtrArith(const Function &F,
- EVT VT) const {
- return Subtarget->hasCPA() && UseFEATCPACodegen;
-}
+ // In the case of non-temporal gather loads and qua
\ No newline at end of file