[llvm] cb375e8 - [AArch64] Enable LSLFast for modern OoO cpus
David Green via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 20 09:09:25 PDT 2022
Author: David Green
Date: 2022-09-20T17:09:14+01:00
New Revision: cb375e8c1f393b53d6516950969a5caac42bf178
URL: https://github.com/llvm/llvm-project/commit/cb375e8c1f393b53d6516950969a5caac42bf178
DIFF: https://github.com/llvm/llvm-project/commit/cb375e8c1f393b53d6516950969a5caac42bf178.diff
LOG: [AArch64] Enable LSLFast for modern OoO cpus
This patch enables the LSLFast feature for Cortex-A76, Cortex-A77,
Cortex-A78, Cortex-A78C, Cortex-A710, Cortex-X1, Cortex-X2, Neoverse N1,
Neoverse N2, Neoverse V1 and the Neoverse 512-TVB pseudo-CPU, in line with
the software optimization guides for those CPUs.
Differential revision: https://reviews.llvm.org/D134273
Added:
Modified:
llvm/lib/Target/AArch64/AArch64.td
llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index a156b8f95db9..5ba54986d223 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -717,17 +717,20 @@ def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
- FeatureFuseAES]>;
+ FeatureFuseAES,
+ FeatureLSLFast]>;
def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
"Cortex-A77 ARM processors", [
FeatureCmpBccFusion,
- FeatureFuseAES]>;
+ FeatureFuseAES,
+ FeatureLSLFast]>;
def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
"Cortex-A78 ARM processors", [
FeatureCmpBccFusion,
FeatureFuseAES,
+ FeatureLSLFast,
FeaturePostRAScheduler]>;
def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
@@ -735,13 +738,15 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
"Cortex-A78C ARM processors", [
FeatureCmpBccFusion,
FeatureFuseAES,
+ FeatureLSLFast,
FeaturePostRAScheduler]>;
def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
"Cortex-A710 ARM processors", [
+ FeatureCmpBccFusion,
FeatureFuseAES,
- FeaturePostRAScheduler,
- FeatureCmpBccFusion]>;
+ FeatureLSLFast,
+ FeaturePostRAScheduler]>;
def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
"CortexR82",
@@ -752,13 +757,15 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
"Cortex-X1 ARM processors", [
FeatureCmpBccFusion,
FeatureFuseAES,
+ FeatureLSLFast,
FeaturePostRAScheduler]>;
def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
"Cortex-X2 ARM processors", [
+ FeatureCmpBccFusion,
FeatureFuseAES,
- FeaturePostRAScheduler,
- FeatureCmpBccFusion]>;
+ FeatureLSLFast,
+ FeaturePostRAScheduler]>;
def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
"Fujitsu A64FX processors", [
@@ -901,31 +908,32 @@ def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
def TuneNeoverseE1 : SubtargetFeature<"neoversee1", "ARMProcFamily", "NeoverseE1",
"Neoverse E1 ARM processors", [
- FeaturePostRAScheduler,
- FeatureFuseAES
- ]>;
+ FeatureFuseAES,
+ FeaturePostRAScheduler]>;
def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1",
"Neoverse N1 ARM processors", [
- FeaturePostRAScheduler,
FeatureFuseAES,
- FeatureFuseAdrpAdd
- ]>;
+ FeatureFuseAdrpAdd,
+ FeatureLSLFast,
+ FeaturePostRAScheduler]>;
def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2",
"Neoverse N2 ARM processors", [
- FeaturePostRAScheduler,
- FeatureFuseAES
- ]>;
+ FeatureFuseAES,
+ FeatureLSLFast,
+ FeaturePostRAScheduler]>;
+
def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
"Neoverse 512-TVB ARM processors", [
- FeaturePostRAScheduler,
- FeatureFuseAES
- ]>;
+ FeatureFuseAES,
+ FeatureLSLFast,
+ FeaturePostRAScheduler]>;
def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
"Neoverse V1 ARM processors", [
FeatureFuseAES,
+ FeatureLSLFast,
FeaturePostRAScheduler]>;
def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 7c65884bb2df..25ea3933c006 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK0
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s --check-prefixes=CHECK,CHECK3
%struct.a = type [256 x i16]
%struct.b = type [256 x i32]
@@ -7,20 +8,36 @@
declare void @foo()
define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind {
-; CHECK-LABEL: halfword:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: ubfx x21, x1, #9, #8
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: ldrh w20, [x0, x21, lsl #1]
-; CHECK-NEXT: bl foo
-; CHECK-NEXT: mov w0, w20
-; CHECK-NEXT: strh w20, [x19, x21, lsl #1]
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK0-LABEL: halfword:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK0-NEXT: ubfx x8, x1, #9, #8
+; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT: lsl x21, x8, #1
+; CHECK0-NEXT: mov x19, x0
+; CHECK0-NEXT: ldrh w20, [x0, x21]
+; CHECK0-NEXT: bl foo
+; CHECK0-NEXT: mov w0, w20
+; CHECK0-NEXT: strh w20, [x19, x21]
+; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK3-LABEL: halfword:
+; CHECK3: // %bb.0:
+; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK3-NEXT: ubfx x21, x1, #9, #8
+; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK3-NEXT: mov x19, x0
+; CHECK3-NEXT: ldrh w20, [x0, x21, lsl #1]
+; CHECK3-NEXT: bl foo
+; CHECK3-NEXT: mov w0, w20
+; CHECK3-NEXT: strh w20, [x19, x21, lsl #1]
+; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK3-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -32,20 +49,36 @@ define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind {
}
define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind {
-; CHECK-LABEL: word:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: ubfx x21, x1, #9, #8
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: ldr w20, [x0, x21, lsl #2]
-; CHECK-NEXT: bl foo
-; CHECK-NEXT: mov w0, w20
-; CHECK-NEXT: str w20, [x19, x21, lsl #2]
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK0-LABEL: word:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK0-NEXT: ubfx x8, x1, #9, #8
+; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT: lsl x21, x8, #2
+; CHECK0-NEXT: mov x19, x0
+; CHECK0-NEXT: ldr w20, [x0, x21]
+; CHECK0-NEXT: bl foo
+; CHECK0-NEXT: mov w0, w20
+; CHECK0-NEXT: str w20, [x19, x21]
+; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK3-LABEL: word:
+; CHECK3: // %bb.0:
+; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK3-NEXT: ubfx x21, x1, #9, #8
+; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK3-NEXT: mov x19, x0
+; CHECK3-NEXT: ldr w20, [x0, x21, lsl #2]
+; CHECK3-NEXT: bl foo
+; CHECK3-NEXT: mov w0, w20
+; CHECK3-NEXT: str w20, [x19, x21, lsl #2]
+; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK3-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -57,20 +90,36 @@ define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind {
}
define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
-; CHECK-LABEL: doubleword:
-; CHECK: // %bb.0:
-; CHECK-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: ubfx x21, x1, #9, #8
-; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov x19, x0
-; CHECK-NEXT: ldr x20, [x0, x21, lsl #3]
-; CHECK-NEXT: bl foo
-; CHECK-NEXT: mov x0, x20
-; CHECK-NEXT: str x20, [x19, x21, lsl #3]
-; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
-; CHECK-NEXT: ret
+; CHECK0-LABEL: doubleword:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK0-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK0-NEXT: ubfx x8, x1, #9, #8
+; CHECK0-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT: lsl x21, x8, #3
+; CHECK0-NEXT: mov x19, x0
+; CHECK0-NEXT: ldr x20, [x0, x21]
+; CHECK0-NEXT: bl foo
+; CHECK0-NEXT: mov x0, x20
+; CHECK0-NEXT: str x20, [x19, x21]
+; CHECK0-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK3-LABEL: doubleword:
+; CHECK3: // %bb.0:
+; CHECK3-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK3-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK3-NEXT: ubfx x21, x1, #9, #8
+; CHECK3-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK3-NEXT: mov x19, x0
+; CHECK3-NEXT: ldr x20, [x0, x21, lsl #3]
+; CHECK3-NEXT: bl foo
+; CHECK3-NEXT: mov x0, x20
+; CHECK3-NEXT: str x20, [x19, x21, lsl #3]
+; CHECK3-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK3-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK3-NEXT: ret
%shr81 = lshr i32 %xor72, 9
%conv82 = zext i32 %shr81 to i64
%idxprom83 = and i64 %conv82, 255
@@ -112,3 +161,67 @@ exitbb:
endbb:
ret i64 %mul2
}
+
+define i64 @gep3(i64 *%p, i64 %b) {
+; CHECK0-LABEL: gep3:
+; CHECK0: // %bb.0:
+; CHECK0-NEXT: lsl x9, x1, #3
+; CHECK0-NEXT: mov x8, x0
+; CHECK0-NEXT: ldr x0, [x0, x9]
+; CHECK0-NEXT: str x1, [x8, x9]
+; CHECK0-NEXT: ret
+;
+; CHECK3-LABEL: gep3:
+; CHECK3: // %bb.0:
+; CHECK3-NEXT: mov x8, x0
+; CHECK3-NEXT: ldr x0, [x0, x1, lsl #3]
+; CHECK3-NEXT: str x1, [x8, x1, lsl #3]
+; CHECK3-NEXT: ret
+ %g = getelementptr inbounds i64, i64* %p, i64 %b
+ %l = load i64, i64* %g
+ store i64 %b, i64* %g
+ ret i64 %l
+}
+
+define i128 @gep4(i128 *%p, i128 %a, i64 %b) {
+; CHECK-LABEL: gep4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, x4, lsl #4
+; CHECK-NEXT: ldp x0, x1, [x8]
+; CHECK-NEXT: stp x2, x3, [x8]
+; CHECK-NEXT: ret
+ %g = getelementptr inbounds i128, i128* %p, i64 %b
+ %l = load i128, i128* %g
+ store i128 %a, i128* %g
+ ret i128 %l
+}
+
+define i64 @addlsl3(i64 %a, i64 %b) {
+; CHECK-LABEL: addlsl3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl x8, x0, #3
+; CHECK-NEXT: add x9, x1, x8
+; CHECK-NEXT: sub x8, x1, x8
+; CHECK-NEXT: eor x0, x9, x8
+; CHECK-NEXT: ret
+ %x = shl i64 %a, 3
+ %y = add i64 %b, %x
+ %z = sub i64 %b, %x
+ %r = xor i64 %y, %z
+ ret i64 %r
+}
+
+define i64 @addlsl4(i64 %a, i64 %b) {
+; CHECK-LABEL: addlsl4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: lsl x8, x0, #4
+; CHECK-NEXT: add x9, x1, x8
+; CHECK-NEXT: sub x8, x1, x8
+; CHECK-NEXT: eor x0, x9, x8
+; CHECK-NEXT: ret
+ %x = shl i64 %a, 4
+ %y = add i64 %b, %x
+ %z = sub i64 %b, %x
+ %r = xor i64 %y, %z
+ ret i64 %r
+}
More information about the llvm-commits
mailing list