[llvm] 42b3419 - [AArch64] Split LSLFast into Addr and ALU parts

David Green via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 18 00:59:29 PDT 2023


Author: David Green
Date: 2023-08-18T08:59:24+01:00
New Revision: 42b3419339dba270107e9323a1fbfe3b39ed39bd

URL: https://github.com/llvm/llvm-project/commit/42b3419339dba270107e9323a1fbfe3b39ed39bd
DIFF: https://github.com/llvm/llvm-project/commit/42b3419339dba270107e9323a1fbfe3b39ed39bd.diff

LOG: [AArch64] Split LSLFast into Addr and ALU parts

As far as I can tell, FeatureLSLFast was originally added to specify that an
lsl of <= 3 is cheap when folded into an addressing operand, and so should
override the one-use checks usually intended to make sure we don't perform
redundant work. At a later point it also came to mean that add x0, x1, x2,
lsl N with N <= 4 is cheap, in that it takes a single cycle rather than the
multiple cycles that more complex adds usually take.
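
To illustrate the two kinds of fold in question (a hand-written sketch, not
code taken from the patch):

    ldr x0, [x1, x2, lsl #3]   // addressing fold: cheap when lsl <= 3
    add x0, x1, x2, lsl #4     // ALU fold: cheap when lsl <= 4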

This patch splits those two concepts into separate subtarget features,
FeatureAddrLSLFast and FeatureALULSLFast. The biggest functional change is to
AArch64DAGToDAGISel::isWorthFoldingALU, which now lets ALU operations produce
an ADDWrs if the shift amount is <= 4.
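
For a concrete example of the isWorthFoldingALU change, take the multi-use
shift from the new lslfast.ll test (added below):

    %shl = shl i32 %x, 3
    %add = add nsw i32 %shl, %y
    %add2 = add nsw i32 %shl, %z

Without the feature the shift is materialized once (an lsl followed by two
plain adds); with +alu-lsl-fast it is folded into both adds:

    add w8, w1, w0, lsl #3
    add w9, w2, w0, lsl #3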

Otherwise the patch is mostly NFC, as it keeps the subtarget features the
same for each CPU. I believe the Arm out-of-order CPUs should eventually be
moved to a new subtarget feature specifying that a shift of 2 or 3 with any
extend is treated as cheap (just not shifts of 1 or 4).
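
The new features can be exercised independently, either on the command line
or per function, mirroring the RUN lines and function attributes used in the
updated tests (file and function names here are placeholders):

    llc < test.ll -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast
    define i32 @f(i32 %x) "target-features"="+alu-lsl-fast" { ... }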

Differential Revision: https://reviews.llvm.org/D157982

Added: 
    llvm/test/CodeGen/AArch64/lslfast.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64.td
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
    llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
    llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
    llvm/test/CodeGen/AArch64/mul_pow2.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 6e327b83e4e639..e27c4230e1fdc3 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -382,9 +382,13 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
                                         "equivalent when the immediate does "
                                         "not fit in the encoding.">;
 
-def FeatureLSLFast : SubtargetFeature<
-    "lsl-fast", "HasLSLFast", "true",
-    "CPU has a fastpath logical shift of up to 3 places">;
+def FeatureAddrLSLFast : SubtargetFeature<
+    "addr-lsl-fast", "HasAddrLSLFast", "true",
+    "Address operands with logical shift of up to 3 places are cheap">;
+
+def FeatureALULSLFast : SubtargetFeature<
+    "alu-lsl-fast", "HasALULSLFast", "true",
+    "Add/Sub operations with lsl shift <= 4 are cheap">;
 
 def FeatureAggressiveFMA :
   SubtargetFeature<"aggressive-fma",
@@ -841,7 +845,8 @@ def TuneA76     : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
                                    "Cortex-A76 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeatureLSLFast,
+                                   FeatureAddrLSLFast,
+                                   FeatureALULSLFast,
                                    FeatureEnableSelectOptimize,
                                    FeaturePredictableSelectIsExpensive]>;
 
@@ -850,7 +855,8 @@ def TuneA77     : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
                                    FeatureCmpBccFusion,
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeatureLSLFast,
+                                   FeatureAddrLSLFast,
+                                   FeatureALULSLFast,
                                    FeatureEnableSelectOptimize,
                                    FeaturePredictableSelectIsExpensive]>;
 
@@ -859,7 +865,8 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
                                FeatureCmpBccFusion,
                                FeatureFuseAES,
                                FeatureFuseAdrpAdd,
-                               FeatureLSLFast,
+                               FeatureAddrLSLFast,
+                               FeatureALULSLFast,
                                FeaturePostRAScheduler,
                                FeatureEnableSelectOptimize,
                                FeaturePredictableSelectIsExpensive]>;
@@ -870,7 +877,8 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
                                 FeatureCmpBccFusion,
                                 FeatureFuseAES,
                                 FeatureFuseAdrpAdd,
-                                FeatureLSLFast,
+                                FeatureAddrLSLFast,
+                                FeatureALULSLFast,
                                 FeaturePostRAScheduler,
                                 FeatureEnableSelectOptimize,
                                 FeaturePredictableSelectIsExpensive]>;
@@ -880,7 +888,8 @@ def TuneA710    : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
                                    FeatureCmpBccFusion,
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeatureLSLFast,
+                                   FeatureAddrLSLFast,
+                                   FeatureALULSLFast,
                                    FeaturePostRAScheduler,
                                    FeatureEnableSelectOptimize,
                                    FeaturePredictableSelectIsExpensive]>;
@@ -890,7 +899,8 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
                                  FeatureFuseAES,
                                  FeaturePostRAScheduler,
                                  FeatureCmpBccFusion,
-                                 FeatureLSLFast,
+                                 FeatureAddrLSLFast,
+                                 FeatureALULSLFast,
                                  FeatureFuseAdrpAdd,
                                  FeatureEnableSelectOptimize,
                                  FeaturePredictableSelectIsExpensive]>;
@@ -905,7 +915,8 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
                                   FeatureCmpBccFusion,
                                   FeatureFuseAES,
                                   FeatureFuseAdrpAdd,
-                                  FeatureLSLFast,
+                                  FeatureAddrLSLFast,
+                                  FeatureALULSLFast,
                                   FeaturePostRAScheduler,
                                   FeatureEnableSelectOptimize,
                                   FeaturePredictableSelectIsExpensive]>;
@@ -915,14 +926,16 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
                                   FeatureCmpBccFusion,
                                   FeatureFuseAES,
                                   FeatureFuseAdrpAdd,
-                                  FeatureLSLFast,
+                                  FeatureAddrLSLFast,
+                                  FeatureALULSLFast,
                                   FeaturePostRAScheduler,
                                   FeatureEnableSelectOptimize,
                                   FeaturePredictableSelectIsExpensive]>;
 
 def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
                               "Cortex-X3 ARM processors", [
-                               FeatureLSLFast,
+                               FeatureAddrLSLFast,
+                               FeatureALULSLFast,
                                FeatureFuseAdrpAdd,
                                FeatureFuseAES,
                                FeaturePostRAScheduler,
@@ -1060,7 +1073,8 @@ def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                      FeatureFuseCCSelect,
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseLiterals,
-                                     FeatureLSLFast,
+                                     FeatureAddrLSLFast,
+                                     FeatureALULSLFast,
                                      FeaturePostRAScheduler,
                                      FeaturePredictableSelectIsExpensive]>;
 
@@ -1077,7 +1091,8 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
                                      FeatureFuseCCSelect,
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseLiterals,
-                                     FeatureLSLFast,
+                                     FeatureAddrLSLFast,
+                                     FeatureALULSLFast,
                                      FeaturePostRAScheduler,
                                      FeatureZCZeroing]>;
 
@@ -1087,7 +1102,8 @@ def TuneKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
-                                   FeatureLSLFast]
+                                   FeatureAddrLSLFast,
+                                   FeatureALULSLFast]
                                    >;
 
 def TuneFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -1096,7 +1112,8 @@ def TuneFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
-                                   FeatureLSLFast,
+                                   FeatureAddrLSLFast,
+                                   FeatureALULSLFast,
                                    FeatureSlowSTRQro
                                    ]>;
 
@@ -1110,7 +1127,8 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
                                       "Neoverse N1 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
-                                      FeatureLSLFast,
+                                      FeatureAddrLSLFast,
+                                      FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive]>;
@@ -1119,7 +1137,8 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
                                       "Neoverse N2 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
-                                      FeatureLSLFast,
+                                      FeatureAddrLSLFast,
+                                      FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive]>;
@@ -1128,7 +1147,8 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
                                       "Neoverse 512-TVB ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
-                                      FeatureLSLFast,
+                                      FeatureAddrLSLFast,
+                                      FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive]>;
@@ -1137,7 +1157,8 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
                                       "Neoverse V1 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
-                                      FeatureLSLFast,
+                                      FeatureAddrLSLFast,
+                                      FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive,
@@ -1147,7 +1168,8 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2
                                       "Neoverse V2 ARM processors", [
                                       FeatureFuseAES,
                                       FeatureFuseAdrpAdd,
-                                      FeatureLSLFast,
+                                      FeatureAddrLSLFast,
+                                      FeatureALULSLFast,
                                       FeaturePostRAScheduler,
                                       FeatureEnableSelectOptimize,
                                       FeaturePredictableSelectIsExpensive]>;
@@ -1158,7 +1180,8 @@ def TuneSaphira  : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureZCZeroing,
-                                   FeatureLSLFast]>;
+                                   FeatureAddrLSLFast,
+                                   FeatureALULSLFast]>;
 
 def TuneThunderX2T99  : SubtargetFeature<"thunderx2t99", "ARMProcFamily", "ThunderX2T99",
                                          "Cavium ThunderX2 processors", [
@@ -1210,7 +1233,8 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
                                    "Ampere Computing Ampere-1 processors", [
                                    FeaturePostRAScheduler,
                                    FeatureFuseAES,
-                                   FeatureLSLFast,
+                                   FeatureAddrLSLFast,
+                                   FeatureALULSLFast,
                                    FeatureAggressiveFMA,
                                    FeatureArithmeticBccFusion,
                                    FeatureCmpBccFusion,
@@ -1221,7 +1245,8 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     "Ampere Computing Ampere-1A processors", [
                                     FeaturePostRAScheduler,
                                     FeatureFuseAES,
-                                    FeatureLSLFast,
+                                    FeatureAddrLSLFast,
+                                    FeatureALULSLFast,
                                     FeatureAggressiveFMA,
                                     FeatureArithmeticBccFusion,
                                     FeatureCmpBccFusion,

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 479067d2fb6a40..60a155a86667e8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -451,7 +451,8 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
                          SDValue &Offset, SDValue &SignExtend,
                          SDValue &DoShift);
-  bool isWorthFolding(SDValue V) const;
+  bool isWorthFoldingALU(SDValue V, bool LSL = false) const;
+  bool isWorthFoldingAddr(SDValue V) const;
   bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
                          SDValue &Offset, SDValue &SignExtend);
 
@@ -660,18 +661,19 @@ static bool isWorthFoldingSHL(SDValue V) {
   return true;
 }
 
-/// Determine whether it is worth to fold V into an extended register.
-bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+/// Determine whether it is worth to fold V into an extended register addressing
+/// mode.
+bool AArch64DAGToDAGISel::isWorthFoldingAddr(SDValue V) const {
   // Trivial if we are optimizing for code size or if there is only
   // one use of the value.
   if (CurDAG->shouldOptForSize() || V.hasOneUse())
     return true;
   // If a subtarget has a fastpath LSL we can fold a logical shift into
   // the addressing mode and save a cycle.
-  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+  if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::SHL &&
       isWorthFoldingSHL(V))
     return true;
-  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+  if (Subtarget->hasAddrLSLFast() && V.getOpcode() == ISD::ADD) {
     const SDValue LHS = V.getOperand(0);
     const SDValue RHS = V.getOperand(1);
     if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
@@ -762,35 +764,6 @@ bool AArch64DAGToDAGISel::SelectShiftedRegisterFromAnd(SDValue N, SDValue &Reg,
   return true;
 }
 
-/// SelectShiftedRegister - Select a "shifted register" operand.  If the value
-/// is not shifted, set the Shift operand to default of "LSL 0".  The logical
-/// instructions allow the shifted register to be rotated, but the arithmetic
-/// instructions do not.  The AllowROR parameter specifies whether ROR is
-/// supported.
-bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
-                                                SDValue &Reg, SDValue &Shift) {
-  if (SelectShiftedRegisterFromAnd(N, Reg, Shift))
-    return true;
-
-  AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
-  if (ShType == AArch64_AM::InvalidShiftExtend)
-    return false;
-  if (!AllowROR && ShType == AArch64_AM::ROR)
-    return false;
-
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    unsigned BitSize = N.getValueSizeInBits();
-    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
-    unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
-
-    Reg = N.getOperand(0);
-    Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
-    return isWorthFolding(N);
-  }
-
-  return false;
-}
-
 /// getExtendTypeForNode - Translate an extend node to the corresponding
 /// ExtendType value.
 static AArch64_AM::ShiftExtendType
@@ -845,6 +818,56 @@ getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
   return AArch64_AM::InvalidShiftExtend;
 }
 
+/// Determine whether it is worth to fold V into an extended register of an
+/// Add/Sub. LSL means we are folding into an `add w0, w1, w2, lsl #N`
+/// instruction, and the shift should be treated as worth folding even if has
+/// multiple uses.
+bool AArch64DAGToDAGISel::isWorthFoldingALU(SDValue V, bool LSL) const {
+  // Trivial if we are optimizing for code size or if there is only
+  // one use of the value.
+  if (CurDAG->shouldOptForSize() || V.hasOneUse())
+    return true;
+
+  // If a subtarget has a fastpath LSL we can fold a logical shift into
+  // the add/sub and save a cycle.
+  if (LSL && Subtarget->hasALULSLFast() && V.getOpcode() == ISD::SHL &&
+      V.getConstantOperandVal(1) <= 4 &&
+      getExtendTypeForNode(V.getOperand(0)) == AArch64_AM::InvalidShiftExtend)
+    return true;
+
+  // It hurts otherwise, since the value will be reused.
+  return false;
+}
+
+/// SelectShiftedRegister - Select a "shifted register" operand.  If the value
+/// is not shifted, set the Shift operand to default of "LSL 0".  The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not.  The AllowROR parameter specifies whether ROR is
+/// supported.
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+                                                SDValue &Reg, SDValue &Shift) {
+  if (SelectShiftedRegisterFromAnd(N, Reg, Shift))
+    return true;
+
+  AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
+  if (ShType == AArch64_AM::InvalidShiftExtend)
+    return false;
+  if (!AllowROR && ShType == AArch64_AM::ROR)
+    return false;
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    unsigned BitSize = N.getValueSizeInBits();
+    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
+    unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
+
+    Reg = N.getOperand(0);
+    Shift = CurDAG->getTargetConstant(ShVal, SDLoc(N), MVT::i32);
+    return isWorthFoldingALU(N, true);
+  }
+
+  return false;
+}
+
 /// Instructions that accept extend modifiers like UXTW expect the register
 /// being extended to be a GPR32, but the incoming DAG might be acting on a
 /// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
@@ -925,7 +948,7 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
   Reg = narrowIfNeeded(CurDAG, Reg);
   Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
                                     MVT::i32);
-  return isWorthFolding(N);
+  return isWorthFoldingALU(N);
 }
 
 /// SelectArithUXTXRegister - Select a "UXTX register" operand. This
@@ -949,7 +972,7 @@ bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg,
   Reg = N.getOperand(0);
   Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
                                     MVT::i32);
-  return isWorthFolding(N);
+  return isWorthFoldingALU(N);
 }
 
 /// If there's a use of this ADDlow that's not itself a load/store then we'll
@@ -1164,7 +1187,7 @@ bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
   if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
     return false;
 
-  return isWorthFolding(N);
+  return isWorthFoldingAddr(N);
 }
 
 bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
@@ -1192,7 +1215,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
@@ -1222,7 +1245,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
     Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
                                            MVT::i32);
-    if (isWorthFolding(LHS))
+    if (isWorthFoldingAddr(LHS))
       return true;
   }
 
@@ -1234,7 +1257,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
     Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
     SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, dl,
                                            MVT::i32);
-    if (isWorthFolding(RHS))
+    if (isWorthFoldingAddr(RHS))
       return true;
   }
 
@@ -1305,7 +1328,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
   }
 
   // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+  bool IsExtendedRegisterWorthFolding = isWorthFoldingAddr(N);
 
   // Try to match a shifted extend on the RHS.
   if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3d0567665b3a1c..1912bde26d3d69 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16454,7 +16454,7 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
     } else if (SCVPlus1.isPowerOf2()) {
       ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
       return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
-    } else if (Subtarget->hasLSLFast() &&
+    } else if (Subtarget->hasALULSLFast() &&
                isPowPlusPlusConst(ConstValue, CVM, CVN)) {
       APInt CVMMinus1 = CVM - 1;
       APInt CVNMinus1 = CVN - 1;

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index acd429e96fa41b..2425a9a60c6373 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -6079,7 +6079,7 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
 
   // It's better to avoid folding and recomputing shifts when we don't have a
   // fastpath.
-  if (!STI.hasLSLFast())
+  if (!STI.hasAddrLSLFast())
     return false;
 
   // We have a fastpath, so folding a shift in and potentially computing it

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
index 03002a33650c22..720d9ad13aa077 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/load-addressing-modes.mir
@@ -24,7 +24,7 @@
   define void @ldbbrox(i64* %addr) { ret void }
   define void @ldrqrox(i64* %addr) { ret void }
   attributes #0 = { optsize }
-  attributes #1 = { "target-features"="+lsl-fast" }
+  attributes #1 = { "target-features"="+addr-lsl-fast" }
 ...
 
 ---

diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index eaa89081199ed6..ae3827cc65b5ea 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+addr-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
 
 %struct.a = type [256 x i16]
 %struct.b = type [256 x i32]

diff --git a/llvm/test/CodeGen/AArch64/lslfast.ll b/llvm/test/CodeGen/AArch64/lslfast.ll
new file mode 100644
index 00000000000000..5ec70b5f229754
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/lslfast.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-SLOW
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+alu-lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK-FAST
+
+define i32 @testmul3(i32 noundef %x, i32 noundef %y, i32 noundef %z) {
+; CHECK-SLOW-LABEL: testmul3:
+; CHECK-SLOW:       // %bb.0: // %entry
+; CHECK-SLOW-NEXT:    lsl w8, w0, #3
+; CHECK-SLOW-NEXT:    add w9, w8, w1
+; CHECK-SLOW-NEXT:    add w8, w8, w2
+; CHECK-SLOW-NEXT:    mul w0, w8, w9
+; CHECK-SLOW-NEXT:    ret
+;
+; CHECK-FAST-LABEL: testmul3:
+; CHECK-FAST:       // %bb.0: // %entry
+; CHECK-FAST-NEXT:    add w8, w1, w0, lsl #3
+; CHECK-FAST-NEXT:    add w9, w2, w0, lsl #3
+; CHECK-FAST-NEXT:    mul w0, w9, w8
+; CHECK-FAST-NEXT:    ret
+entry:
+  %shl = shl i32 %x, 3
+  %add = add nsw i32 %shl, %y
+  %add2 = add nsw i32 %shl, %z
+  %mul = mul nsw i32 %add2, %add
+  ret i32 %mul
+}
+
+define i32 @testvar(i32 noundef %x, i32 noundef %y, i32 noundef %z, i32 %zz) {
+; CHECK-LABEL: testvar:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    lsl w8, w0, w3
+; CHECK-NEXT:    add w9, w8, w1
+; CHECK-NEXT:    add w8, w8, w2
+; CHECK-NEXT:    mul w0, w8, w9
+; CHECK-NEXT:    ret
+entry:
+  %shl = shl i32 %x, %zz
+  %add = add nsw i32 %shl, %y
+  %add2 = add nsw i32 %shl, %z
+  %mul = mul nsw i32 %add2, %add
+  ret i32 %mul
+}
+
+define i32 @testmul5(i32 noundef %x, i32 noundef %y, i32 noundef %z) {
+; CHECK-LABEL: testmul5:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    lsl w8, w0, #5
+; CHECK-NEXT:    add w9, w8, w1
+; CHECK-NEXT:    add w8, w8, w2
+; CHECK-NEXT:    mul w0, w8, w9
+; CHECK-NEXT:    ret
+entry:
+  %shl = shl i32 %x, 5
+  %add = add nsw i32 %shl, %y
+  %add2 = add nsw i32 %shl, %z
+  %mul = mul nsw i32 %add2, %add
+  ret i32 %mul
+}
+
+define i64 @testsext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: testsext3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfiz x8, x0, #3, #32
+; CHECK-NEXT:    add x9, x8, x1
+; CHECK-NEXT:    add x8, x8, x2
+; CHECK-NEXT:    mul x0, x9, x8
+; CHECK-NEXT:    ret
+entry:
+  %conv = sext i32 %x to i64
+  %shl = shl nsw i64 %conv, 3
+  %add = add nsw i64 %shl, %y
+  %add3 = add nsw i64 %shl, %z
+  %mul = mul nsw i64 %add, %add3
+  ret i64 %mul
+}
+
+define i64 @testzext3(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: testzext3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    ubfiz x8, x0, #3, #32
+; CHECK-NEXT:    add x9, x8, x1
+; CHECK-NEXT:    add x8, x8, x2
+; CHECK-NEXT:    mul x0, x9, x8
+; CHECK-NEXT:    ret
+entry:
+  %conv = zext i32 %x to i64
+  %shl = shl nsw i64 %conv, 3
+  %add = add nsw i64 %shl, %y
+  %add3 = add nsw i64 %shl, %z
+  %mul = mul nsw i64 %add, %add3
+  ret i64 %mul
+}
+
+define i64 @test3sext(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: test3sext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    lsl w8, w0, #3
+; CHECK-NEXT:    sxtw x8, w8
+; CHECK-NEXT:    add x9, x8, x1
+; CHECK-NEXT:    add x8, x8, x2
+; CHECK-NEXT:    mul x0, x9, x8
+; CHECK-NEXT:    ret
+entry:
+  %shl = shl i32 %x, 3
+  %conv = sext i32 %shl to i64
+  %add = add nsw i64 %conv, %y
+  %add3 = add nsw i64 %conv, %z
+  %mul = mul nsw i64 %add, %add3
+  ret i64 %mul
+}
+
+define i64 @test3zext(i32 noundef %x, i64 noundef %y, i64 noundef %z) {
+; CHECK-LABEL: test3zext:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    lsl w8, w0, #3
+; CHECK-NEXT:    add x9, x8, x1
+; CHECK-NEXT:    add x8, x8, x2
+; CHECK-NEXT:    mul x0, x9, x8
+; CHECK-NEXT:    ret
+entry:
+  %shl = shl i32 %x, 3
+  %conv = zext i32 %shl to i64
+  %add = add nsw i64 %conv, %y
+  %add3 = add nsw i64 %conv, %z
+  %mul = mul nsw i64 %add, %add3
+  ret i64 %mul
+}

diff --git a/llvm/test/CodeGen/AArch64/mul_pow2.ll b/llvm/test/CodeGen/AArch64/mul_pow2.ll
index cbdf6337847cb6..8614424edbdd74 100644
--- a/llvm/test/CodeGen/AArch64/mul_pow2.ll
+++ b/llvm/test/CodeGen/AArch64/mul_pow2.ll
@@ -493,7 +493,7 @@ define i32 @test16(i32 %x) {
   ret i32 %mul
 }
 
-define i32 @test25_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test25_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test25_fast_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add w8, w0, w0, lsl #2
@@ -510,7 +510,7 @@ define i32 @test25_fast_shift(i32 %x) "target-features"="+lsl-fast" {
   ret i32 %mul
 }
 
-define i32 @test45_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test45_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test45_fast_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add w8, w0, w0, lsl #2
@@ -546,7 +546,7 @@ define i32 @test45(i32 %x) {
 }
 
 ; Negative test: The shift amount 4 larger than 3
-define i32 @test85_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test85_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test85_fast_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #85
@@ -564,7 +564,7 @@ define i32 @test85_fast_shift(i32 %x) "target-features"="+lsl-fast" {
 }
 
 ; Negative test: The shift amount 5 larger than 3
-define i32 @test297_fast_shift(i32 %x) "target-features"="+lsl-fast" {
+define i32 @test297_fast_shift(i32 %x) "target-features"="+alu-lsl-fast" {
 ; CHECK-LABEL: test297_fast_shift:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #297

