[llvm] [NFC][AArch64] Tests for guarding unrolling with scalable vec ins/ext (PR #81132)

Thu Feb 8 05:17:22 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: Graham Hunter (huntergr-arm)

<details>
<summary>Changes</summary>



---
Full diff: https://github.com/llvm/llvm-project/pull/81132.diff


1 Files Affected:

- (added) llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll (+87) 


``````````diff

diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
new file mode 100644
index 00000000000000..4e9885750008f8
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/AArch64/scalable-vec-ins-ext.ll
@@ -0,0 +1,87 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=128 %s | FileCheck %s -check-prefix=UNROLL-128
+; RUN: opt -passes=loop-unroll,simplifycfg -S -mtriple aarch64 -mattr=+sve -aarch64-sve-vector-bits-min=256 %s | FileCheck %s -check-prefix=UNROLL-256
+
+;; This test contains IR similar to what would be generated when SVE ACLE
+;; routines are used with fixed-width vector types -- lots of subvector inserts
+;; and extracts that are effectively just bitcasts since the types are the
+;; same at a given SVE bit size. We want to make sure that they are not a
+;; barrier to unrolling simple loops with a fixed trip count which could be
+;; further optimized.
+
+define void @test_ins_ext_cost(ptr readonly %a, ptr readonly %b, ptr readonly %c, ptr noalias %d) {
+; UNROLL-128-LABEL: define void @test_ins_ext_cost(
+; UNROLL-128-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-128-NEXT:  entry:
+; UNROLL-128-NEXT:    br label [[FOR_BODY:%.*]]
+; UNROLL-128:       for.body:
+; UNROLL-128-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-128-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-128-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-128-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-128-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-128-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-128-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-128-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-128-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-128-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-128-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-128-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-128-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-128-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-128-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-128-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-128-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-128:       exit:
+; UNROLL-128-NEXT:    ret void
+;
+; UNROLL-256-LABEL: define void @test_ins_ext_cost(
+; UNROLL-256-SAME: ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], ptr readonly [[C:%.*]], ptr noalias [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; UNROLL-256-NEXT:  entry:
+; UNROLL-256-NEXT:    br label [[FOR_BODY:%.*]]
+; UNROLL-256:       for.body:
+; UNROLL-256-NEXT:    [[EXIT_COND:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[FOR_BODY]] ]
+; UNROLL-256-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 1, [[FOR_BODY]] ]
+; UNROLL-256-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds <8 x float>, ptr [[A]], i64 [[IV]]
+; UNROLL-256-NEXT:    [[LOAD_A:%.*]] = load <8 x float>, ptr [[GEP_A]], align 16
+; UNROLL-256-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds <8 x float>, ptr [[B]], i64 [[IV]]
+; UNROLL-256-NEXT:    [[LOAD_B:%.*]] = load <8 x float>, ptr [[GEP_B]], align 16
+; UNROLL-256-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds <8 x float>, ptr [[C]], i64 [[IV]]
+; UNROLL-256-NEXT:    [[LOAD_C:%.*]] = load <8 x float>, ptr [[GEP_C]], align 16
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_B:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_B]], i64 0)
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_C:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_C]], i64 0)
+; UNROLL-256-NEXT:    [[ADD:%.*]] = fadd <vscale x 4 x float> [[CAST_SCALABLE_B]], [[CAST_SCALABLE_C]]
+; UNROLL-256-NEXT:    [[CAST_SCALABLE_A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[LOAD_A]], i64 0)
+; UNROLL-256-NEXT:    [[MUL:%.*]] = fmul <vscale x 4 x float> [[CAST_SCALABLE_A]], [[ADD]]
+; UNROLL-256-NEXT:    [[CAST_FIXED_D:%.*]] = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[MUL]], i64 0)
+; UNROLL-256-NEXT:    [[GEP_D:%.*]] = getelementptr inbounds <8 x float>, ptr [[D]], i64 0, i64 [[IV]]
+; UNROLL-256-NEXT:    store <8 x float> [[CAST_FIXED_D]], ptr [[GEP_D]], align 16
+; UNROLL-256-NEXT:    br i1 [[EXIT_COND]], label [[FOR_BODY]], label [[EXIT:%.*]]
+; UNROLL-256:       exit:
+; UNROLL-256-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %exit.cond = phi i1 [ true, %entry ], [ false, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ 1, %for.body ]
+  %gep.a = getelementptr inbounds <8 x float>, ptr %a, i64 %iv
+  %load.a = load <8 x float>, ptr %gep.a, align 16
+  %gep.b = getelementptr inbounds <8 x float>, ptr %b, i64 %iv
+  %load.b = load <8 x float>, ptr %gep.b, align 16
+  %gep.c = getelementptr inbounds <8 x float>, ptr %c, i64 %iv
+  %load.c = load <8 x float>, ptr %gep.c, align 16
+  %cast.scalable.b = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.b, i64 0)
+  %cast.scalable.c = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.c, i64 0)
+  %add = fadd <vscale x 4 x float> %cast.scalable.b, %cast.scalable.c
+  %cast.scalable.a = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> %load.a, i64 0)
+  %mul = fmul <vscale x 4 x float> %cast.scalable.a, %add
+  %cast.fixed.d = tail call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> %mul, i64 0)
+  %gep.d = getelementptr inbounds <8 x float>, ptr %d, i64 0, i64 %iv
+  store <8 x float> %cast.fixed.d, ptr %gep.d, align 16
+  br i1 %exit.cond, label %for.body, label %exit
+
+exit:
+  ret void
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/81132