[llvm] [AArch64] Improve cost model for legal subvec insert/extract (PR #81135)
Graham Hunter via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 8 05:34:39 PST 2024
https://github.com/huntergr-arm created https://github.com/llvm/llvm-project/pull/81135
Currently we model subvector inserts and extracts as shuffles, potentially going as far as scalarizing them. If the types are legal, then they can just be simple zip/unzip operations, or possibly even no-ops. Change the cost to a relatively small one so that simple loops featuring such operations between fixed and scalable vector types (which are effectively the same at a given SVE vector width) can be unrolled and further optimized.
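
For illustration, here is a minimal standalone sketch of the kind of round trip this affects (the function is made up for this example and is not part of the patch). With a 256-bit SVE register, <8 x float> and <vscale x 4 x float> are the same size, so the insert/extract pair is effectively a bitcast; with this change each call should be costed as 1 when both types are legal, rather than as a scalarized shuffle:

; Hypothetical example, assuming -mattr=+sve -aarch64-sve-vector-bits-min=256.
define <8 x float> @round_trip(<8 x float> %v) {
  ; Place a fixed 256-bit vector into a scalable register; a no-op at this width.
  %ins = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %v, i64 0)
  ; Extracting the fixed vector back out is likewise effectively free.
  %ext = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %ins, i64 0)
  ret <8 x float> %ext
}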
From d25b58f0a0df5913a9f78d331b724864709a0423 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 7 Feb 2024 17:22:35 +0000
Subject: [PATCH 1/2] [NFC][AArch64] Precommit tests for guarding unrolling
with scalable subvec ins/ext
---
.../AArch64/scalable-vec-ins-ext.ll | 87 +++++++++++++++++++
1 file changed, 87 insertions(+)
create mode 100644 llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
new file mode 100644
index 0000000000000..4e9885750008f
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=128 %s | FileCheck %s -check-prefix=UNROLL-128
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck %s -check-prefix=UNROLL-256
+
+;; This test contains IR similar to what would be generated when SVE ACLE
+;; routines are used with fixed-width vector types -- lots of subvector inserts
+;; and extracts that are effectively just bitcasts since the types are the
+;; same at a given SVE bit size. We want to make sure that they are not a
+;; barrier to unrolling simple loops with a fixed trip count which could be
+;; further optimized.
+
+define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c, ptr noalias %d) {
+; UNROLL-128-LABEL: define void @test_ins_ext_cost(
+; UNROLL-128-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-128-NEXT: entry:
+; UNROLL-128-NEXT: br label [[FOR_BODY:%.*]]
+; UNROLL-128: for.body:
+; UNROLL-128-NEXT: [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-128-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-128-NEXT: [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-128-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-128-NEXT: [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-128-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-128-NEXT: [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-128-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-128-NEXT: [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-128-NEXT: [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-128-NEXT: [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-128-NEXT: [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-128-NEXT: [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-128-NEXT: [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-128-NEXT: [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-128-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-128-NEXT: br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-128: exit:
+; UNROLL-128-NEXT: ret void
+;
+; UNROLL-256-LABEL: define void @test_ins_ext_cost(
+; UNROLL-256-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-256-NEXT: entry:
+; UNROLL-256-NEXT: br label [[FOR_BODY:%.*]]
+; UNROLL-256: for.body:
+; UNROLL-256-NEXT: [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-256-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-256-NEXT: [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-256-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-256-NEXT: [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-256-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-256-NEXT: [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-256-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-256-NEXT: [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-256-NEXT: [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-256-NEXT: [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-256-NEXT: [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-256-NEXT: [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-256-NEXT: [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-256-NEXT: [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-256-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-256-NEXT: br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-256: exit:
+; UNROLL-256-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body:
+ %exit.cond = phi i1 [ true, %entry ], [ false, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ 1, %for.body ]
+ %gep.a = getelementptr inbounds <8 x float>, ptr %a, i64 %iv
+ %load.a = load <8 x float>, ptr %gep.a, align 16
+ %gep.b = getelementptr inbounds <8 x float>, ptr %b, i64 %iv
+ %load.b = load <8 x float>, ptr %gep.b, align 16
+ %gep.c = getelementptr inbounds <8 x float>, ptr %c, i64 %iv
+ %load.c = load <8 x float>, ptr %gep.c, align 16
+ %cast.scalable.b = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.b, i64 0)
+ %cast.scalable.c = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.c, i64 0)
+ %add = fadd <vscale x 4 x float> %cast.scalable.b, %cast.scalable.c
+ %cast.scalable.a = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.a, i64 0)
+ %mul = fmul <vscale x 4 x float> %cast.scalable.a, %add
+ %cast.fixed.d = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %mul, i64 0)
+ %gep.d = getelementptr inbounds <8 x float>, ptr %d, i64 0, i64 %iv
+ store <8 x float> %cast.fixed.d, ptr %gep.d, align 16
+ br i1 %exit.cond, label %for.body, label %exit
+
+exit:
+ ret void
+}
From 0394c212cb63470927801c70a12458a7cba2c236 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Wed, 7 Feb 2024 15:27:41 +0000
Subject: [PATCH 2/2] [AArch64] Don't model legal subvector insert/extract as
scalarization
---
.../AArch64/AArch64TargetTransformInfo.cpp | 26 +++++++++++++++
.../AArch64/scalable-vec-ins-ext.ll | 32 +++++++++++--------
2 files changed, 44 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cdd2750521d2c..0add50e506788 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -568,6 +568,32 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
return Cost;
}
+ case Intrinsic::vector_extract: {
+ // If both the vector argument and the return type are legal types, then
+ // this should be a no-op or simple operation; return a relatively low cost.
+ LLVMContext &C = RetTy->getContext();
+ EVT MRTy = getTLI()->getValueType(DL, RetTy);
+ EVT MPTy = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ TargetLoweringBase::LegalizeKind RLK = getTLI()->getTypeConversion(C, MRTy);
+ TargetLoweringBase::LegalizeKind PLK = getTLI()->getTypeConversion(C, MPTy);
+ if (RLK.first == TargetLoweringBase::TypeLegal &&
+ PLK.first == TargetLoweringBase::TypeLegal)
+ return InstructionCost(1);
+ break;
+ }
+ case Intrinsic::vector_insert: {
+ // If both the vector and subvector arguments are legal types, then this
+ // should be a no-op or simple operation; return a relatively low cost.
+ LLVMContext &C = RetTy->getContext();
+ EVT MTy0 = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
+ EVT MTy1 = getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
+ TargetLoweringBase::LegalizeKind LK0 = getTLI()->getTypeConversion(C, MTy0);
+ TargetLoweringBase::LegalizeKind LK1 = getTLI()->getTypeConversion(C, MTy1);
+ if (LK0.first == TargetLoweringBase::TypeLegal &&
+ LK1.first == TargetLoweringBase::TypeLegal)
+ return InstructionCost(1);
+ break;
+ }
case Intrinsic::bitreverse: {
static const CostTblEntry BitreverseTbl[] = {
{Intrinsic::bitreverse, MVT::i32, 1},
diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
index 4e9885750008f..c69757abd4c1d 100644
--- a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -38,26 +38,30 @@ define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c
; UNROLL-256-LABEL: define void @test_ins_ext_cost(
; UNROLL-256-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
; UNROLL-256-NEXT: entry:
-; UNROLL-256-NEXT: br label [[FOR_BODY:%.*]]
-; UNROLL-256: for.body:
-; UNROLL-256-NEXT: [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
-; UNROLL-256-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
-; UNROLL-256-NEXT: [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
-; UNROLL-256-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
-; UNROLL-256-NEXT: [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
-; UNROLL-256-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
-; UNROLL-256-NEXT: [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
-; UNROLL-256-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-256-NEXT: [[LOAD_A:%.*]] = load <8 x float>, ptr [[A]], align 16
+; UNROLL-256-NEXT: [[LOAD_B:%.*]] = load <8 x float>, ptr [[B]], align 16
+; UNROLL-256-NEXT: [[LOAD_C:%.*]] = load <8 x float>, ptr [[C]], align 16
; UNROLL-256-NEXT: [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
; UNROLL-256-NEXT: [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
; UNROLL-256-NEXT: [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
; UNROLL-256-NEXT: [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
; UNROLL-256-NEXT: [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
; UNROLL-256-NEXT: [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
-; UNROLL-256-NEXT: [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
-; UNROLL-256-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
-; UNROLL-256-NEXT: br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
-; UNROLL-256: exit:
+; UNROLL-256-NEXT: store <8 x float> [[CAST_FIXED_D]], ptr [[D]], align 16
+; UNROLL-256-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 1
+; UNROLL-256-NEXT: [[LOAD_A_1:%.*]] = load <8 x float>, ptr [[GEP_A_1]], align 16
+; UNROLL-256-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 1
+; UNROLL-256-NEXT: [[LOAD_B_1:%.*]] = load <8 x float>, ptr [[GEP_B_1]], align 16
+; UNROLL-256-NEXT: [[GEP_C_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 1
+; UNROLL-256-NEXT: [[LOAD_C_1:%.*]] = load <8 x float>, ptr [[GEP_C_1]], align 16
+; UNROLL-256-NEXT: [[CAST_SCALABLE_B_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B_1]], i64 0)
+; UNROLL-256-NEXT: [[CAST_SCALABLE_C_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C_1]], i64 0)
+; UNROLL-256-NEXT: [[ADD_1:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B_1]], [[CAST_SCALABLE_C_1]]
+; UNROLL-256-NEXT: [[CAST_SCALABLE_A_1:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A_1]], i64 0)
+; UNROLL-256-NEXT: [[MUL_1:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A_1]], [[ADD_1]]
+; UNROLL-256-NEXT: [[CAST_FIXED_D_1:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL_1]], i64 0)
+; UNROLL-256-NEXT: [[GEP_D_1:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 1
+; UNROLL-256-NEXT: store <8 x float> [[CAST_FIXED_D_1]], ptr [[GEP_D_1]], align 16
; UNROLL-256-NEXT: ret void
;
entry: