[llvm] 159fb37 - [AArch64] Swap 'lsl(val1, small-shmt)' to right hand side for ADD(lsl(val1,small-shmt), lsl(val2,large-shmt))
Mingming Liu via llvm-commits
llvm-commits at lists.llvm.org
Sun Oct 9 17:38:01 PDT 2022
Author: Mingming Liu
Date: 2022-10-09T17:26:54-07:00
New Revision: 159fb378f779ac79f7d456ea233892ad526b56d8
URL: https://github.com/llvm/llvm-project/commit/159fb378f779ac79f7d456ea233892ad526b56d8
DIFF: https://github.com/llvm/llvm-project/commit/159fb378f779ac79f7d456ea233892ad526b56d8.diff
LOG: [AArch64] Swap 'lsl(val1,small-shmt)' to right hand side for ADD(lsl(val1,small-shmt), lsl(val2,large-shmt))
On many AArch64 processors (Cortex-A78, Neoverse N1/N2/V1, etc.), ADD with an LSL shift by a small amount (shift-amount <= 4) has lower latency and higher
throughput than ADD with a larger shift (shift-amount > 4). For the rest of the processors, this change is at worst a no-op.
Differential Revision: https://reviews.llvm.org/D135208
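For illustration, the i64 case exercised by the updated test reduces to the pattern below; the trailing add/ret lines are an assumption reconstructed from the function name and the CHECK lines, since the diff context further down elides them:

  ; After this patch (matches the updated CHECK lines in the test diff):
  ;   lsl x8, x0, #8
  ;   add x0, x8, x1, lsl #3
  define i64 @add_swap_rhs_lhs_i64(i64 %0, i64 %1) {
    %3 = shl i64 %0, 8    ; shift amount > 4
    %4 = shl i64 %1, 3    ; shift amount <= 4
    %5 = add i64 %3, %4   ; assumed: add of the two shifted values
    ret i64 %5            ; assumed
  }

The combine swaps the ADD operands so that the small-amount shift ends up on the right hand side, where instruction selection folds it into the ADD-with-shifted-register form (add x0, x8, x1, lsl #3), leaving the larger shift as a standalone LSL.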
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 43d9f8c2f9c94..10867cc8962da 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1753,6 +1753,25 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
return VT.changeVectorElementTypeToInteger();
}
+// isIntImmediate - This method tests to see if the node is a constant
+// operand. If so, Imm will receive the value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+ if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
+ Imm = C->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so, Imm will receive the value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+ uint64_t &Imm) {
+ return N->getOpcode() == Opc &&
+ isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
+
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
const APInt &Demanded,
TargetLowering::TargetLoweringOpt &TLO,
@@ -16705,6 +16724,40 @@ static SDValue performBuildVectorCombine(SDNode *N,
return SDValue();
}
+static SDValue performAddCombineForShiftedOperands(SDNode *N,
+ SelectionDAG &DAG) {
+ // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
+ // commutative.
+ if (N->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
+ // shifted register is only available for i32 and i64.
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ uint64_t LHSImm = 0, RHSImm = 0;
+ // If both operands are shifted by an immediate and the shift amount of one
+ // operand is not greater than 4, swap LHS and RHS to put the operand with
+ // the smaller shift amount on the RHS.
+ //
+ // On many AArch64 processors (Cortex-A78, Neoverse N1/N2/V1, etc.), ADD with
+ // LSL shift (shift <= 4) has lower latency and higher throughput than ADD
+ // with a larger LSL shift (shift > 4). For the rest of the processors, this
+ // is a no-op for both performance and correctness.
+ if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
+ isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
+ RHSImm > 4 && LHS.hasOneUse())
+ return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
+
+ return SDValue();
+}
+
static SDValue performAddSubCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -16719,6 +16772,8 @@ static SDValue performAddSubCombine(SDNode *N,
return Val;
if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
return Val;
+ if (SDValue Val = performAddCombineForShiftedOperands(N, DAG))
+ return Val;
return performAddSubLongCombine(N, DCI, DAG);
}
diff --git a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
index 2e6f753e56f47..4078863301748 100644
--- a/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
+++ b/llvm/test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -292,8 +292,8 @@ ret:
define i64 @add_swap_rhs_lhs_i64(i64 %0, i64 %1) {
; CHECK-LABEL: add_swap_rhs_lhs_i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl x8, x1, #3
-; CHECK-NEXT: add x0, x8, x0, lsl #8
+; CHECK-NEXT: lsl x8, x0, #8
+; CHECK-NEXT: add x0, x8, x1, lsl #3
; CHECK-NEXT: ret
%3 = shl i64 %0, 8
%4 = shl i64 %1, 3
@@ -318,8 +318,8 @@ define i64 @add_swap_no_op_i64(i64 %0, i64 %1, i64* %2) {
define i32 @add_swap_rhs_lhs_i32(i32 %0, i32 %1) {
; CHECK-LABEL: add_swap_rhs_lhs_i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: lsl w8, w1, #3
-; CHECK-NEXT: add w0, w8, w0, lsl #8
+; CHECK-NEXT: lsl w8, w0, #8
+; CHECK-NEXT: add w0, w8, w1, lsl #3
; CHECK-NEXT: ret
%3 = shl i32 %0, 8
%4 = shl i32 %1, 3