[llvm] 22f2173 - [AArch64] Add PredictableSelectIsExpensive feature to all the cpus that have FeatureEnableSelectOptimize
Aleksandr Popov via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 3 03:33:32 PDT 2023
Author: Aleksandr Popov
Date: 2023-07-02T09:23:43+02:00
New Revision: 22f21738370c15fbdd26006583027f8667e0461d
URL: https://github.com/llvm/llvm-project/commit/22f21738370c15fbdd26006583027f8667e0461d
DIFF: https://github.com/llvm/llvm-project/commit/22f21738370c15fbdd26006583027f8667e0461d.diff
LOG: [AArch64] Add PredictableSelectIsExpensive feature to all the cpus that have FeatureEnableSelectOptimize
The select optimize pass was enabled for AArch64 in revision
https://reviews.llvm.org/D138990.
We were doing some benchmarking on the Neoverse V1 and were
experimenting with select optimize heuristics. We found out that there
are some additional profitable transformations to predictable branches
(with prediction rate > 75% according to Agner Fog's rule of thumb) that
can be done by the base heuristic of the SelectOptimize pass or by
optimizeSelectInst from the CodeGenPrepare pass. But they are blocked on
the Neoverse V1, since the PredictableSelectIsExpensive feature is not
set for that subtarget.
Note that to achieve these results we also changed the predictable
branch threshold from 99% to 75%.
It looks like it makes sense to add this feature to all targets for which
the select optimize pass was enabled in https://reviews.llvm.org/D138990.
Reviewed By: dmgreen
Differential Revision: https://reviews.llvm.org/D143162
Added:
llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll
Modified:
llvm/lib/Target/AArch64/AArch64.td
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 1a87e9b9d989bd..0acb7bc0bc2a33 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -792,33 +792,38 @@ def TuneA65 : SubtargetFeature<"a65", "ARMProcFamily", "CortexA65",
FeatureFuseAddress,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72",
"Cortex-A72 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureFuseLiterals,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
"Cortex-A73 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", [
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureLSLFast,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
"Cortex-A77 ARM processors", [
@@ -826,7 +831,8 @@ def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
FeatureFuseAES,
FeatureFuseAdrpAdd,
FeatureLSLFast,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
"Cortex-A78 ARM processors", [
@@ -835,7 +841,8 @@ def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78",
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
"CortexA78C",
@@ -845,7 +852,8 @@ def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily",
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
"Cortex-A710 ARM processors", [
@@ -854,7 +862,8 @@ def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710",
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
"Cortex-A715 ARM processors", [
@@ -863,7 +872,8 @@ def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715",
FeatureCmpBccFusion,
FeatureLSLFast,
FeatureFuseAdrpAdd,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily",
"CortexR82",
@@ -877,7 +887,8 @@ def TuneX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
"Cortex-X2 ARM processors", [
@@ -886,7 +897,8 @@ def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2",
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
"Cortex-X3 ARM processors", [
@@ -894,7 +906,8 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3",
FeatureFuseAdrpAdd,
FeatureFuseAES,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
"Fujitsu A64FX processors", [
@@ -1079,7 +1092,8 @@ def TuneNeoverseN1 : SubtargetFeature<"neoversen1", "ARMProcFamily", "NeoverseN1
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2",
"Neoverse N2 ARM processors", [
@@ -1087,7 +1101,8 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB",
"Neoverse 512-TVB ARM processors", [
@@ -1095,7 +1110,8 @@ def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Ne
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1",
"Neoverse V1 ARM processors", [
@@ -1103,14 +1119,16 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1
FeatureFuseAdrpAdd,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2",
"Neoverse V2 ARM processors", [
FeatureFuseAES,
FeatureLSLFast,
FeaturePostRAScheduler,
- FeatureEnableSelectOptimize]>;
+ FeatureEnableSelectOptimize,
+ FeaturePredictableSelectIsExpensive]>;
def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
diff --git a/llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll b/llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll
new file mode 100644
index 00000000000000..156ec400d5e7f1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=generic -S < %s | FileCheck %s --check-prefix=CHECK-GENERIC
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-n1 -S < %s | FileCheck %s
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s
+; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 -S < %s | FileCheck %s
+
+; Test has an unpredictable select, which should not be transformed to a branch
+define i32 @test1(i32 %a) {
+; CHECK-GENERIC-LABEL: @test1(
+; CHECK-GENERIC-NEXT: entry:
+; CHECK-GENERIC-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1
+; CHECK-GENERIC-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1
+; CHECK-GENERIC-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF0:![0-9]+]]
+; CHECK-GENERIC-NEXT: ret i32 [[RES]]
+;
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1
+; CHECK-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1
+; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF0:![0-9]+]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cmp = icmp slt i32 %a, 1
+ %dec = sub i32 %a, 1
+ %res = select i1 %cmp, i32 0, i32 %dec, !prof !0
+ ret i32 %res
+}
+
+; Test has highly predictable select according to profile data,
+; which should be transformed to a branch on cores with enabled FeaturePredictableSelectIsExpensive
+define i32 @test2(i32 %a) {
+; CHECK-GENERIC-LABEL: @test2(
+; CHECK-GENERIC-NEXT: entry:
+; CHECK-GENERIC-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1
+; CHECK-GENERIC-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1
+; CHECK-GENERIC-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF1:![0-9]+]]
+; CHECK-GENERIC-NEXT: ret i32 [[RES]]
+;
+; CHECK-LABEL: @test2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1
+; CHECK-NEXT: [[RES_FROZEN:%.*]] = freeze i1 [[CMP]]
+; CHECK-NEXT: br i1 [[RES_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE_SINK:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: select.false.sink:
+; CHECK-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[DEC]], [[SELECT_FALSE_SINK]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cmp = icmp slt i32 %a, 1
+ %dec = sub i32 %a, 1
+ %res = select i1 %cmp, i32 0, i32 %dec, !prof !1
+ ret i32 %res
+}
+
+!0 = !{!"branch_weights", i32 1, i32 1}
+!1 = !{!"branch_weights", i32 1, i32 1000}
More information about the llvm-commits
mailing list