[llvm] 42b3419 - [AArch64] Split LSLFast into Addr and ALU parts
David Green via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 18 00:59:29 PDT 2023
Author: David Green
Date: 2023-08-18T08:59:24+01:00
New Revision: 42b3419339dba270107e9323a1fbfe3b39ed39bd
URL: https://github.com/llvm/llvm-project/commit/42b3419339dba270107e9323a1fbfe3b39ed39bd
DIFF: https://github.com/llvm/llvm-project/commit/42b3419339dba270107e9323a1fbfe3b39ed39bd.diff
LOG: [AArch64] Split LSLFast into Addr and ALU parts
As far as I can tell, FeatureLSLFast was originally added to specify that an lsl
of <= 3 was cheap when folded into an addressing operand, and so should override
the one-use checks usually intended to make sure we don't perform redundant
work. At a later point it also came to mean that add x0, x1, x2, lsl N
with N <= 4 was cheap, in that it took a single cycle rather than the multiple
cycles that more complex adds usually take.
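As an illustrative sketch (not part of the patch), the addressing-mode case is the
kind of fold below; with a single use of the index the shift is always folded, and
the feature only matters when the shift has additional uses:

  ; llc -mtriple=aarch64-linux-gnu, selected as ldr x0, [x0, x1, lsl #3]
  define i64 @load_shifted_index(ptr %p, i64 %i) {
    %a = getelementptr inbounds i64, ptr %p, i64 %i
    %v = load i64, ptr %a
    ret i64 %v
  }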
This patch splits those two concepts out into separate subtarget features. The
biggest change is to AArch64DAGToDAGISel::isWorthFoldingALU, which now lets ALU
operations produce an ADDWrs if the shift is <= 4.
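A minimal sketch of the case the new check targets (it mirrors the added
lslfast.ll test; exact register allocation may differ):

  ; llc -mtriple=aarch64-linux-gnu -mattr=+alu-lsl-fast
  define i32 @shift_two_uses(i32 %x, i32 %y, i32 %z) {
    %shl = shl i32 %x, 3      ; shift <= 4, so worth folding despite the two uses
    %a = add i32 %shl, %y     ; selected as add w8, w1, w0, lsl #3
    %b = add i32 %shl, %z     ; selected as add w9, w2, w0, lsl #3
    %m = mul i32 %a, %b
    ret i32 %m
  }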
Otherwise the patch is mostly NFC, as it tries to keep the subtarget features
the same for each CPU. I believe the Arm OoO CPUs should eventually be changed
to a new subtarget feature specifying that a shift of 2 or 3 with any extend
should be treated as cheap (just not shifts of 1 or 4).
Differential Revision: https://reviews.llvm.org/D157982
Added:
llvm/test/CodeGen/AArch64/lslfast.ll
Modified:
llvm/lib/Target/AArch64/AArch64.td
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
llvm/test/CodeGen/AArch64/mul_pow2.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 6e327b83e4e639..e27c4230e1fdc3 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -382,9 +382,13 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
"equivalent when the immediate does "
"not fit in the encoding.">;
-def FeatureLSLFast : SubtargetFeature<
- "lsl-fast", "HasLSLFast", "true",
- "CPU has a fastpath logical shift of up to 3 places">;
+def FeatureAddrLSLFast : SubtargetFeature<
+ "addr-lsl-fast", "HasAddrLSLFast", "true",
+ "Address operands with logical shift of up to 3 places are cheap">;
+
+def FeatureALULSLFast : SubtargetFeature<
+ "alu-lsl-fast", "HasALULSLFast", "true",
+ "Add/Sub operations with lsl shift <= 4 are cheap">;
def FeatureAggressiveFMA :
SubtargetFeature<"aggressive-fma",
@@ -841,7 +845,8 @@ def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -850,7 +855,8 @@ def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -859,7 +865,8 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -870,7 +877,8 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -880,7 +888,8 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -890,7 +899,8 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
FeatureFuseAES,
FeaturePostRAScheduler,
FeatureCmpBccFusion,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -905,7 +915,8 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -915,14 +926,16 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
FeatureCmpBccFusion,
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
"Cortex-X3 ARM processors", [
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeatureFuseAdrpAdd,
FeatureFuseAES,
FeaturePostRAScheduler,
@@ -1060,7 +1073,8 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
FeatureFuseCCSelect,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive]>;
@@ -1077,7 +1091,8 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureFuseCCSelect,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureZCZeroing]>;
@@ -1087,7 +1102,8 @@ def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureLSLFast]
+ FeatureAddrLSLFast,
+ FeatureALULSLFast]
>;
def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -1096,7 +1112,8 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeatureSlowSTRQro
]>;
@@ -1110,7 +1127,8 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
"Neoverse N1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -1119,7 +1137,8 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
"Neoverse N2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -1128,7 +1147,8 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
"Neoverse 512-TVB ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -1137,7 +1157,8 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
"Neoverse V1 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive,
@@ -1147,7 +1168,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
"Neoverse V2 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureEnableSelectOptimize,
FeaturePredictableSelectIsExpensive]>;
@@ -1158,7 +1180,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
- FeatureLSLFast]>;
+ FeatureAddrLSLFast,
+ FeatureALULSLFast]>;
def TuneThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
"Cavium ThunderX2 processors", [
@@ -1210,7 +1233,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
"Ampere Computing Ampere-1 processors", [
FeaturePostRAScheduler,
FeatureFuseAES,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
FeatureCmpBccFusion,
@@ -1221,7 +1245,8 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
"Ampere Computing Ampere-1A processors", [
FeaturePostRAScheduler,
FeatureFuseAES,
- FeatureLSLFast,
+ FeatureAddrLSLFast,
+ FeatureALULSLFast,
FeatureAggressiveFMA,
FeatureArithmeticBccFusion,
FeatureCmpBccFusion,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 479067d2fb6a40..60a155a86667e8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -451,7 +451,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
SDValue &Offset, SDValue &SignExtend,
SDValue &DoShift);
- bool isWorthFolding(SDValue V) const;
+ bool isWorthFoldingALU(SDValue V, bool LSL = false) const;
+ bool isWorthFoldingAddr(SDValue V) const;
bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
SDValue &Offset, SDValue &SignExtend);
@@ -660,18 +661,19 @@ static bool isWorthFoldingSHL(SDValue V) {
return true;
}
-/// Determine whether it is worth to fold V into an extended register.
-bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+/// Determine whether it is worth to fold V into an extended register addressing
+/// mode.
+bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
if (CurDAG->shouldOptForSize() || V.hasOneUse())
return true;
// If a subtarget has a fastpath LSL we can fold a logical shift into
// the addressing mode and save a cycle.
- if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+ if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL &&
isWorthFoldingSHL(V))
return true;
- if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+ if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) {
const SDValue LHS = V.getOperand(0);
const SDValue RHS = V.getOperand(1);
if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
@@ -762,35 +764,6 @@ bool AArch64DAGToDAGISel::SelectShiftedRegisterFromAnd(SDValue N, SDValue &Reg,
return true;
}
-/// SelectShiftedRegister - Select a "shifted register" operand. If the value
-/// is not shifted, set the Shift operand to default of "LSL 0". The logical
-/// instructions allow the shifted register to be rotated, but the arithmetic
-/// instructions do not. The AllowROR parameter specifies whether ROR is
-/// supported.
-bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
- SDValue &Reg, SDValue &Shift) {
- if (SelectShiftedRegisterFromAnd(N, Reg, Shift))
- return true;
-
- AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
- if (ShType == AArch64_AM::InvalidShiftExtend)
- return false;
- if (!AllowROR && ShType == AArch64_AM::ROR)
- return false;
-
- if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
- unsigned BitSize = N.getValueSizeInBits();
- unsigned Val = RHS->getZExtValue() & (BitSize - 1);
- unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
-
- Reg = N.getOperand(0);
- Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
- return isWorthFolding(N);
- }
-
- return false;
-}
-
/// getExtendTypeForNode - Translate an extend node to the corresponding
/// ExtendType value.
static AArch64_AM::ShiftExtendType
@@ -845,6 +818,56 @@ getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
return AArch64_AM::InvalidShiftExtend;
}
+/// Determine whether it is worth to fold V into an extended register of an
+/// Add/Sub. LSL means we are folding into an `add w0, w1, w2, lsl #N`
+/// instruction, and the shift should be treated as worth folding even if has
+/// multiple uses.
+bool AArch64DAGToDAGISel::isWorthFoldingALU(SDValue V, bool LSL) const {
+ // Trivial if we are optimizing for code size or if there is only
+ // one use of the value.
+ if (CurDAG->shouldOptForSize() || V.hasOneUse())
+ return true;
+
+ // If a subtarget has a fastpath LSL we can fold a logical shift into
+ // the add/sub and save a cycle.
+ if (LSL && Subtarget->hasALULSLFast() && V.getOpcode() == ISD::SHL &&
+ V.getConstantOperandVal(1) <= 4 &&
+ getExtendTypeForNode(V.getOperand(0)) == AArch64_AM::InvalidShiftExtend)
+ return true;
+
+ // It hurts otherwise, since the value will be reused.
+ return false;
+}
+
+/// SelectShiftedRegister - Select a "shifted register" operand. If the value
+/// is not shifted, set the Shift operand to default of "LSL 0". The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not. The AllowROR parameter specifies whether ROR is
+/// supported.
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+ SDValue &Reg, SDValue &Shift) {
+ if (SelectShiftedRegisterFromAnd(N, Reg, Shift))
+ return true;
+
+ AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
+ if (ShType == AArch64_AM::InvalidShiftExtend)
+ return false;
+ if (!AllowROR && ShType == AArch64_AM::ROR)
+ return false;
+
+ if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+ unsigned BitSize = N.getValueSizeInBits();
+ unsigned Val = RHS->getZExtValue() & (BitSize - 1);
+ unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
+
+ Reg = N.getOperand(0);
+ Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
+ return isWorthFoldingALU(N, true);
+ }
+
+ return false;
+}
+
/// Instructions that accept extend modifiers like UXTW expect the register
/// being extended to be a GPR32, but the incoming DAG might be acting on a
/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
@@ -925,7 +948,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
Reg = narrowIfNeeded(CurDAG, Reg);
Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
MVT::i32);
- return isWorthFolding(N);
+ return isWorthFoldingALU(N);
}
/// SelectArithUXTXRegister - Select a "UXTX register" operand. This
@@ -949,7 +972,7 @@ bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg,
Reg = N.getOperand(0);
Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
MVT::i32);
- return isWorthFolding(N);
+ return isWorthFoldingALU(N);
}
/// If there's a use of this ADDlow that's not itself a load/store then we'll
@@ -1164,7 +1187,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
return false;
- return isWorthFolding(N);
+ return isWorthFoldingAddr(N);
}
bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1192,7 +1215,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1222,7 +1245,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFolding(LHS))
+ if (isWorthFoldingAddr(LHS))
return true;
}
@@ -1234,7 +1257,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
MVT::i32);
- if (isWorthFolding(RHS))
+ if (isWorthFoldingAddr(RHS))
return true;
}
@@ -1305,7 +1328,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
}
// Remember if it is worth folding N when it produces extended register.
- bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+ bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
// Try to match a shifted extend on the RHS.
if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3d0567665b3a1c..1912bde26d3d69 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16454,7 +16454,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
} else if (SCVPlus1.isPowerOf2()) {
ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
- } else if (Subtarget->hasLSLFast() &&
+ } else if (Subtarget->hasALULSLFast() &&
isPowPlusPlusConst(ConstValue, CVM, CVN)) {
APInt CVMMinus1 = CVM - 1;
APInt CVNMinus1 = CVN - 1;
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index acd429e96fa41b..2425a9a60c6373 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6079,7 +6079,7 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
// It's better to avoid folding and recomputing shifts when we don't have a
// fastpath.
- if (!STI.hasLSLFast())
+ if (!STI.hasAddrLSLFast())
return false;
// We have a fastpath, so folding a shift in and potentially computing it
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
index 03002a33650c22..720d9ad13aa077 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -24,7 +24,7 @@
define void @ldbbrox(i64* %addr) { ret void }
define void @ldrqrox(i64* %addr) { ret void }
attributes #0 = { optsize }
- attributes #1 = { "target-features"="+lsl-fast" }
+ attributes #1 = { "target-features"="+addr-lsl-fast" }
...
---
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index eaa89081199ed6..ae3827cc65b5ea 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
%struct.a = type [256 x i16]
%struct.b = type [256 x i32]
diff --git a/llvm/test/CodeGen/AArch64/lslfast.ll b/llvm/test/CodeGen/AArch64/lslfast.ll
new file mode 100644
index 00000000000000..5ec70b5f229754
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/lslfast.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SLOW
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+alu-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK-FAST
+
+define i32 @testmul3(i32 noundef %x, i32 noundef %y, i32 noundef %z) {
+; CHECK-SLOW-LABEL: testmul3:
+; CHECK-SLOW: // %bb.0: // %entry
+; CHECK-SLOW-NEXT: lsl w8, w0, #3
+; CHECK-SLOW-NEXT: add w9, w8, w1
+; CHECK-SLOW-NEXT: add w8, w8, w2
+; CHECK-SLOW-NEXT: mul w0, w8, w9
+; CHECK-SLOW-NEXT: ret
+;
+; CHECK-FAST-LABEL: testmul3:
+; CHECK-FAST: // %bb.0: // %entry
+; CHECK-FAST-NEXT: add w8, w1, w0, lsl #3
+; CHECK-FAST-NEXT: add w9, w2, w0, lsl #3
+; CHECK-FAST-NEXT: mul w0, w9, w8
+; CHECK-FAST-NEXT: ret
+entry:
+ %shl = shl i32 %x, 3
+ %add = add nsw i32 %shl, %y
+ %add2 = add nsw i32 %shl, %z
+ %mul = mul nsw i32 %add2, %add
+ ret i32 %mul
+}
+
+define i32 @testvar(i32 noundef %x, i32 noundef %y, i32 noundef %z, i32 %zz) {
+; CHECK-LABEL: testvar:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl w8, w0, w3
+; CHECK-NEXT: add w9, w8, w1
+; CHECK-NEXT: add w8, w8, w2
+; CHECK-NEXT: mul w0, w8, w9
+; CHECK-NEXT: ret
+entry:
+ %shl = shl i32 %x, %zz
+ %add = add nsw i32 %shl, %y
+ %add2 = add nsw i32 %shl, %z
+ %mul = mul nsw i32 %add2, %add
+ ret i32 %mul
+}
+
+define i32 @testmul5(i32 noundef %x, i32 noundef %y, i32 noundef %z) {
+; CHECK-LABEL: testmul5:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl w8, w0, #5
+; CHECK-NEXT: add w9, w8, w1
+; CHECK-NEXT: add w8, w8, w2
+; CHECK-NEXT: mul w0, w8, w9
+; CHECK-NEXT: ret
+entry:
+ %shl = shl i32 %x, 5
+ %add = add nsw i32 %shl, %y
+ %add2 = add nsw i32 %shl, %z
+ %mul = mul nsw i32 %add2, %add
+ ret i32 %mul
+}
+
+define i64 @testsext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: testsext3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: sbfiz x8, x0, #3, #32
+; CHECK-NEXT: add x9, x8, x1
+; CHECK-NEXT: add x8, x8, x2
+; CHECK-NEXT: mul x0, x9, x8
+; CHECK-NEXT: ret
+entry:
+ %conv = sext i32 %x to i64
+ %shl = shl nsw i64 %conv, 3
+ %add = add nsw i64 %shl, %y
+ %add3 = add nsw i64 %shl, %z
+ %mul = mul nsw i64 %add, %add3
+ ret i64 %mul
+}
+
+define i64 @testzext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: testzext3:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT: ubfiz x8, x0, #3, #32
+; CHECK-NEXT: add x9, x8, x1
+; CHECK-NEXT: add x8, x8, x2
+; CHECK-NEXT: mul x0, x9, x8
+; CHECK-NEXT: ret
+entry:
+ %conv = zext i32 %x to i64
+ %shl = shl nsw i64 %conv, 3
+ %add = add nsw i64 %shl, %y
+ %add3 = add nsw i64 %shl, %z
+ %mul = mul nsw i64 %add, %add3
+ ret i64 %mul
+}
+
+define i64 @test3sext(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: test3sext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl w8, w0, #3
+; CHECK-NEXT: sxtw x8, w8
+; CHECK-NEXT: add x9, x8, x1
+; CHECK-NEXT: add x8, x8, x2
+; CHECK-NEXT: mul x0, x9, x8
+; CHECK-NEXT: ret
+entry:
+ %shl = shl i32 %x, 3
+ %conv = sext i32 %shl to i64
+ %add = add nsw i64 %conv, %y
+ %add3 = add nsw i64 %conv, %z
+ %mul = mul nsw i64 %add, %add3
+ ret i64 %mul
+}
+
+define i64 @test3zext(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: test3zext:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: lsl w8, w0, #3
+; CHECK-NEXT: add x9, x8, x1
+; CHECK-NEXT: add x8, x8, x2
+; CHECK-NEXT: mul x0, x9, x8
+; CHECK-NEXT: ret
+entry:
+ %shl = shl i32 %x, 3
+ %conv = zext i32 %shl to i64
+ %add = add nsw i64 %conv, %y
+ %add3 = add nsw i64 %conv, %z
+ %mul = mul nsw i64 %add, %add3
+ ret i64 %mul
+}
diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll
index cbdf6337847cb6..8614424edbdd74 100644
--- a/llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -493,7 +493,7 @@ define i32 @test16(i32 %x) {
ret i32 %mul
}
-define i32 @test25_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test25_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: test25_fast_shift:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, w0, lsl #2
@@ -510,7 +510,7 @@ define i32 @test25_fast_shift(i32 %x) "target-features"="+lsl-fast" {
ret i32 %mul
}
-define i32 @test45_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test45_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: test45_fast_shift:
; CHECK: // %bb.0:
; CHECK-NEXT: add w8, w0, w0, lsl #2
@@ -546,7 +546,7 @@ define i32 @test45(i32 %x) {
}
; Negative test: The shift amount 4 larger than 3
-define i32 @test85_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: test85_fast_shift:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #85
@@ -564,7 +564,7 @@ define i32 @test85_fast_shift(i32 %x) "target-features"="+lsl-fast" {
}
; Negative test: The shift amount 5 larger than 3
-define i32 @test297_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
; CHECK-LABEL: test297_fast_shift:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #297