[llvm] 0eb0a65 - [AArch64] Correctly determine if {ADD,SUB}{W,X}rs instructions are cheap
Momchil Velikov via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 21 10:48:34 PDT 2023
Author: Momchil Velikov
Date: 2023-09-21T18:44:24+01:00
New Revision: 0eb0a65d0f9c5735c51b7816d68d3200f54b5a3e
URL: https://github.com/llvm/llvm-project/commit/0eb0a65d0f9c5735c51b7816d68d3200f54b5a3e
DIFF: https://github.com/llvm/llvm-project/commit/0eb0a65d0f9c5735c51b7816d68d3200f54b5a3e.diff
LOG: [AArch64] Correctly determine if {ADD,SUB}{W,X}rs instructions are cheap
These instructions are currently marked as being "as cheap as a move".
However, according to publicly available Software Optimization Guides,
they have one-cycle latency and maximum throughput only on some
microarchitectures, only for `LSL`, and only for certain shift amounts.
This patch instead uses the subtarget feature `FeatureALULSLFast` to
determine whether these instructions are actually cheap.
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D152827
Change-Id: I8f0d7e79bcf277ebf959719991c29a1bc7829486
Added:
llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
Modified:
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index e5e2eb2bfdc7c3a..60cdea9e8384e54 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -904,6 +904,13 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   default:
     return MI.isAsCheapAsAMove();
+
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+    return Subtarget.hasALULSLFast() && MI.getOperand(3).getImm() <= 4;
+
   // If MOVi32imm or MOVi64imm can be expanded into ORRWri or
   // ORRXri, it is as cheap as MOV.
   // Likewise if it can be expanded to MOVZ/MOVN/MOVK.
diff --git a/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll b/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
new file mode 100644
index 000000000000000..6b1c6fa674602d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/addsub-shifted-reg-cheap-as-move.ll
@@ -0,0 +1,132 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -o - | FileCheck %s
+; RUN: llc -mattr=+alu-lsl-fast < %s -o - | FileCheck %s -check-prefix=LSLFAST
+target triple = "aarch64-linux"
+
+declare void @g(...)
+
+; Check that ADDWrs/ADDXrs with shift > 4 is considered relatively
+; slow, thus CSE-d.
+define void @f0(i1 %c0, i1 %c1, ptr %a, i64 %i) {
+; CHECK-LABEL: f0:
+; CHECK: // %bb.0: // %E
+; CHECK-NEXT: tbz w0, #0, .LBB0_5
+; CHECK-NEXT: // %bb.1: // %A
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, x2, x3, lsl #5
+; CHECK-NEXT: tbz w1, #0, .LBB0_3
+; CHECK-NEXT: // %bb.2: // %B
+; CHECK-NEXT: bl g
+; CHECK-NEXT: b .LBB0_4
+; CHECK-NEXT: .LBB0_3: // %C
+; CHECK-NEXT: mov x1, x0
+; CHECK-NEXT: bl g
+; CHECK-NEXT: .LBB0_4:
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB0_5: // %X
+; CHECK-NEXT: ret
+;
+; LSLFAST-LABEL: f0:
+; LSLFAST: // %bb.0: // %E
+; LSLFAST-NEXT: tbz w0, #0, .LBB0_5
+; LSLFAST-NEXT: // %bb.1: // %A
+; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; LSLFAST-NEXT: .cfi_def_cfa_offset 16
+; LSLFAST-NEXT: .cfi_offset w30, -16
+; LSLFAST-NEXT: add x0, x2, x3, lsl #5
+; LSLFAST-NEXT: tbz w1, #0, .LBB0_3
+; LSLFAST-NEXT: // %bb.2: // %B
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: b .LBB0_4
+; LSLFAST-NEXT: .LBB0_3: // %C
+; LSLFAST-NEXT: mov x1, x0
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: .LBB0_4:
+; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; LSLFAST-NEXT: .LBB0_5: // %X
+; LSLFAST-NEXT: ret
+E:
+ %p0 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
+ br i1 %c0, label %A, label %X
+
+A:
+ br i1 %c1, label %B, label %C
+
+B:
+ call void @g(ptr %p0)
+ br label %X
+
+C:
+ %p1 = getelementptr {i64, i64, i64, i64}, ptr %a, i64 %i
+ call void @g(ptr %p1, ptr %p0)
+ br label %X
+
+X:
+ ret void
+}
+
+; Check that ADDWrs/ADDXrs with shift <= 4 is considered relatively fast on sub-targets
+; with feature +alu-lsl-fast, thus *not* CSE-d.
+define void @f1(i1 %c0, i1 %c1, ptr %a, i64 %i) {
+; CHECK-LABEL: f1:
+; CHECK: // %bb.0: // %E
+; CHECK-NEXT: tbz w0, #0, .LBB1_5
+; CHECK-NEXT: // %bb.1: // %A
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: add x0, x2, x3, lsl #4
+; CHECK-NEXT: tbz w1, #0, .LBB1_3
+; CHECK-NEXT: // %bb.2: // %B
+; CHECK-NEXT: bl g
+; CHECK-NEXT: b .LBB1_4
+; CHECK-NEXT: .LBB1_3: // %C
+; CHECK-NEXT: mov x1, x0
+; CHECK-NEXT: bl g
+; CHECK-NEXT: .LBB1_4:
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .LBB1_5: // %X
+; CHECK-NEXT: ret
+;
+; LSLFAST-LABEL: f1:
+; LSLFAST: // %bb.0: // %E
+; LSLFAST-NEXT: tbz w0, #0, .LBB1_5
+; LSLFAST-NEXT: // %bb.1: // %A
+; LSLFAST-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; LSLFAST-NEXT: .cfi_def_cfa_offset 16
+; LSLFAST-NEXT: .cfi_offset w30, -16
+; LSLFAST-NEXT: add x8, x2, x3, lsl #4
+; LSLFAST-NEXT: tbz w1, #0, .LBB1_3
+; LSLFAST-NEXT: // %bb.2: // %B
+; LSLFAST-NEXT: mov x0, x8
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: b .LBB1_4
+; LSLFAST-NEXT: .LBB1_3: // %C
+; LSLFAST-NEXT: add x0, x2, x3, lsl #4
+; LSLFAST-NEXT: mov x1, x8
+; LSLFAST-NEXT: bl g
+; LSLFAST-NEXT: .LBB1_4:
+; LSLFAST-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; LSLFAST-NEXT: .LBB1_5: // %X
+; LSLFAST-NEXT: ret
+E:
+ %p0 = getelementptr {i64, i64}, ptr %a, i64 %i
+ br i1 %c0, label %A, label %X
+
+A:
+ br i1 %c1, label %B, label %C
+
+B:
+ call void @g(ptr %p0)
+ br label %X
+
+C:
+ %p1 = getelementptr {i64, i64}, ptr %a, i64 %i
+ call void @g(ptr %p1, ptr %p0)
+ br label %X
+
+X:
+ ret void
+}