[llvm] d3932c6 - [LoopVectorize] Add tests with reductions that are stored in invariant address
Igor Kirillov via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 24 13:28:54 PST 2022
Author: Igor Kirillov
Date: 2022-01-24T21:26:38Z
New Revision: d3932c690d97df83f030f527456ddadf098b3d04
URL: https://github.com/llvm/llvm-project/commit/d3932c690d97df83f030f527456ddadf098b3d04
DIFF: https://github.com/llvm/llvm-project/commit/d3932c690d97df83f030f527456ddadf098b3d04.diff
LOG: [LoopVectorize] Add tests with reductions that are stored in invariant address
This patch adds tests for functionality that is to be implemented in D110235.
Differential Revision: https://reviews.llvm.org/D117213
Added:
llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
Modified:
llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
index 03bd5b71af947..52e604a00df32 100644
--- a/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
+++ b/llvm/test/Transforms/LoopInterchange/reductions-across-inner-and-outer-loop.ll
@@ -151,6 +151,44 @@ for1.loopexit: ; preds = %for1.inc
ret i64 %sum.inc.lcssa2
}
+; Check that we do not interchange if reduction is stored in an invariant address inside inner loop
+; REMARKS: --- !Missed
+; REMARKS-NEXT: Pass: loop-interchange
+; REMARKS-NEXT: Name: UnsupportedPHIOuter
+; REMARKS-NEXT: Function: test4
+
+define i64 @test4([100 x [100 x i64]]* %Arr, i64* %dst) {
+entry:
+ %gep.dst = getelementptr inbounds i64, i64* %dst, i64 42
+ br label %for1.header
+
+for1.header: ; preds = %for1.inc, %entry
+ %indvars.iv23 = phi i64 [ 0, %entry ], [ %indvars.iv.next24, %for1.inc ]
+ %sum.outer = phi i64 [ 0, %entry ], [ %sum.inc.lcssa, %for1.inc ]
+ br label %for2
+
+for2: ; preds = %for2, %for1.header
+ %indvars.iv = phi i64 [ 0, %for1.header ], [ %indvars.iv.next.3, %for2 ]
+ %sum.inner = phi i64 [ %sum.outer, %for1.header ], [ %sum.inc, %for2 ]
+ %arrayidx = getelementptr inbounds [100 x [100 x i64]], [100 x [100 x i64]]* %Arr, i64 0, i64 %indvars.iv, i64 %indvars.iv23
+ %lv = load i64, i64* %arrayidx, align 4
+ %sum.inc = add i64 %sum.inner, %lv
+ store i64 %sum.inc, i64* %gep.dst, align 4
+ %indvars.iv.next.3 = add nuw nsw i64 %indvars.iv, 1
+ %exit1 = icmp eq i64 %indvars.iv.next.3, 100
+ br i1 %exit1, label %for1.inc, label %for2
+
+for1.inc: ; preds = %for2
+ %sum.inc.lcssa = phi i64 [ %sum.inc, %for2 ]
+ %indvars.iv.next24 = add nuw nsw i64 %indvars.iv23, 1
+ %exit2 = icmp eq i64 %indvars.iv.next24, 100
+ br i1 %exit2, label %for1.loopexit, label %for1.header
+
+for1.loopexit: ; preds = %for1.inc
+ %sum.inc.lcssa2 = phi i64 [ %sum.inc.lcssa, %for1.inc ]
+ ret i64 %sum.inc.lcssa2
+}
+
; Check that we do not interchange or crash if the PHI in the outer loop gets a
; constant from the inner loop.
; REMARKS: --- !Missed
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
index 8a0a71ff63365..feafd8831b668 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-reductions.ll
@@ -318,6 +318,31 @@ for.end:
ret float %.sroa.speculated
}
+; ADD (with reduction stored in invariant address)
+
+; CHECK-REMARK: loop not vectorized: value that could not be identified as reduction is used outside the loop
+define void @invariant_store(i32* %dst, i32* readonly %src) {
+; CHECK-LABEL: @invariant_store
+; CHECK-NOT: vector.body
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ store i32 0, i32* %gep.dst, align 4
+ br label %for.body
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %indvars.iv
+ %0 = load i32, i32* %gep.src, align 4
+ %add = add nsw i32 %sum, %0
+ store i32 %add, i32* %gep.dst, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
; Reduction cannot be vectorized
; MUL
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index e890468f06ddf..3aa88d7fc18c1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -1309,6 +1309,37 @@ for.end:
declare float @llvm.fmuladd.f32(float, float, float)
+; Test case with invariant store where fadd is strict.
+define void @reduction_store_to_invariant_address(float* %dst, float* readonly %src) {
+; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-UNORDERED-NOT: vector.body
+
+; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-NOT-VECTORIZED-NOT: vector.body
+
+entry:
+ %arrayidx = getelementptr inbounds float, float* %dst, i64 42
+ store float 0.000000e+00, float* %arrayidx, align 4
+ br label %for.body
+
+for.body:
+ %0 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx1 = getelementptr inbounds float, float* %src, i64 %indvars.iv
+ %1 = load float, float* %arrayidx1, align 4
+ %add = fadd float %0, %1
+ store float %add, float* %arrayidx, align 4
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond = icmp eq i64 %indvars.iv.next, 1000
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+ ret void
+}
+
!0 = distinct !{!0, !5, !9, !11}
!1 = distinct !{!1, !5, !10, !11}
!2 = distinct !{!2, !6, !9, !11}
diff --git a/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
new file mode 100644
index 0000000000000..d002b1b289ea0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll
@@ -0,0 +1,333 @@
+; RUN: opt < %s -passes="loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; This test checks that we can vectorize loop with reduction variable
+; stored in an invariant address.
+;
+; int sum = 0;
+; for(i=0..N) {
+; sum += src[i];
+; dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store
+; CHECK-NOT: vector.body
+define void @reduc_store(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ store i32 0, i32* %gep.dst, align 4
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %add = add nsw i32 %sum, %0
+ store i32 %add, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; Same as above but with floating point numbers instead.
+;
+; float sum = 0;
+; for(i=0..N) {
+; sum += src[i];
+; dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store_fadd_fast
+; CHECK-NOT: vector.body
+define void @reduc_store_fadd_fast(float* %dst, float* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds float, float* %dst, i64 42
+ store float 0.000000e+00, float* %gep.dst, align 4
+ br label %for.body
+
+for.body:
+ %sum = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.src = getelementptr inbounds float, float* %src, i64 %iv
+ %0 = load float, float* %gep.src, align 4
+ %add = fadd fast float %sum, %0
+ store float %add, float* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; Check that if we have a read from an invariant address, we do not vectorize.
+;
+; int sum = 0;
+; for(i=0..N) {
+; sum += src[i];
+; dst.2[i] = dst[42];
+; dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store_load
+; CHECK-NOT: vector.body
+define void @reduc_store_load(i32* %dst, i32* readonly %src, i32* noalias %dst.2) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ store i32 0, i32* %gep.dst, align 4
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %add = add nsw i32 %sum, %0
+ %lv = load i32, i32* %gep.dst
+ %gep.dst.2 = getelementptr inbounds i32, i32* %dst.2, i64 %iv
+ store i32 %lv, i32* %gep.dst.2, align 4
+ store i32 %add, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ ret void
+}
+
+; Final value is not guaranteed to be stored in an invariant address.
+; We don't vectorize in that case.
+;
+; int sum = 0;
+; for(i=0..N) {
+; int diff = y[i] - x[i];
+; if (diff > 0) {
+; sum = += diff;
+; *t = sum;
+; }
+; }
+; CHECK-LABEL: @reduc_cond_store
+; CHECK-NOT: vector.body
+define void @reduc_cond_store(i32* %t, i32* readonly %x, i32* readonly %y) {
+entry:
+ store i32 0, i32* %t, align 4
+ br label %for.body
+
+for.body:
+ %sum = phi i32 [ 0, %entry ], [ %sum.2, %if.end ]
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ]
+ %gep.y = getelementptr inbounds i32, i32* %y, i64 %iv
+ %0 = load i32, i32* %gep.y, align 4
+ %gep.x = getelementptr inbounds i32, i32* %x, i64 %iv
+ %1 = load i32, i32* %gep.x, align 4
+ %diff = sub nsw i32 %0, %1
+ %cmp2 = icmp sgt i32 %diff, 0
+ br i1 %cmp2, label %if.then, label %if.end
+
+if.then:
+ %sum.1 = add nsw i32 %diff, %sum
+ store i32 %sum.1, i32* %t, align 4
+ br label %if.end
+
+if.end:
+ %sum.2 = phi i32 [ %sum.1, %if.then ], [ %0, %for.body ]
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+; Check that we can vectorize code with several stores to an invariant address
+; with condition that final reduction value is stored too.
+;
+; int sum = 0;
+; for(int i=0; i < 1000; i+=2) {
+; sum += src[i];
+; dst[42] = sum;
+; sum += src[i+1];
+; dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store_inside_unrolled
+; CHECK-NOT: vector.body
+define void @reduc_store_inside_unrolled(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %sum.1 = add nsw i32 %0, %sum
+ store i32 %sum.1, i32* %gep.dst, align 4
+ %1 = or i64 %iv, 1
+ %gep.src.1 = getelementptr inbounds i32, i32* %src, i64 %1
+ %2 = load i32, i32* %gep.src.1, align 4
+ %sum.2 = add nsw i32 %2, %sum.1
+ store i32 %sum.2, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 2
+ %cmp = icmp slt i64 %iv.next, 1000
+ br i1 %cmp, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+; We cannot vectorize if two (or more) invariant stores exist in a loop.
+;
+; int sum = 0;
+; for(int i=0; i < 1000; i+=2) {
+; sum += src[i];
+; dst[42] = sum;
+; sum += src[i+1];
+; other_dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_double_invariant_store
+; CHECK-NOT: vector.body:
+define void @reduc_double_invariant_store(i32* %dst, i32* %other_dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ %gep.other_dst = getelementptr inbounds i32, i32* %other_dst, i64 42
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %sum.1 = add nsw i32 %0, %sum
+ store i32 %sum.1, i32* %gep.dst, align 4
+ %1 = or i64 %iv, 1
+ %arrayidx4 = getelementptr inbounds i32, i32* %src, i64 %1
+ %2 = load i32, i32* %arrayidx4, align 4
+ %sum.2 = add nsw i32 %2, %sum.1
+ store i32 %sum.2, i32* %gep.other_dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 2
+ %cmp = icmp slt i64 %iv.next, 1000
+ br i1 %cmp, label %for.body, label %exit
+
+exit:
+ ret void
+}
+
+; int sum = 0;
+; for(int i=0; i < 1000; i+=2) {
+; sum += src[i];
+; if (src[i+1] > 0)
+; dst[42] = sum;
+; sum += src[i+1];
+; dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store_middle_store_predicated
+; CHECK-NOT: vector.body
+define void @reduc_store_middle_store_predicated(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ br label %for.body
+
+for.body: ; preds = %latch, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.2, %latch ]
+ %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %gep.src, align 4
+ %sum.1 = add nsw i32 %0, %sum
+ %cmp = icmp sgt i32 %0, 0
+ br i1 %cmp, label %predicated, label %latch
+
+predicated: ; preds = %for.body
+ store i32 %sum.1, i32* %gep.dst, align 4
+ br label %latch
+
+latch: ; preds = %predicated, %for.body
+ %1 = or i64 %iv, 1
+ %gep.src.1 = getelementptr inbounds i32, i32* %src, i64 %1
+ %2 = load i32, i32* %gep.src.1, align 4
+ %sum.2 = add nsw i32 %2, %sum.1
+ store i32 %sum.2, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 2
+ %cmp.1 = icmp slt i64 %iv.next, 1000
+ br i1 %cmp.1, label %for.body, label %exit
+
+exit: ; preds = %latch
+ ret void
+}
+
+; int sum = 0;
+; for(int i=0; i < 1000; i+=2) {
+; sum += src[i];
+; dst[42] = sum;
+; sum += src[i+1];
+; if (src[i+1] > 0)
+; dst[42] = sum;
+; }
+; CHECK-LABEL: @reduc_store_final_store_predicated
+; CHECK-NOT: vector.body:
+define void @reduc_store_final_store_predicated(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ br label %for.body
+
+for.body: ; preds = %latch, %entry
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.1, %latch ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %sum.1 = add nsw i32 %0, %sum
+ store i32 %sum.1, i32* %gep.dst, align 4
+ %1 = or i64 %iv, 1
+ %gep.src.1 = getelementptr inbounds i32, i32* %src, i64 %1
+ %2 = load i32, i32* %gep.src.1, align 4
+ %sum.2 = add nsw i32 %2, %sum.1
+ %cmp1 = icmp sgt i32 %2, 0
+ br i1 %cmp1, label %predicated, label %latch
+
+predicated: ; preds = %for.body
+ store i32 %sum.2, i32* %gep.dst, align 4
+ br label %latch
+
+latch: ; preds = %predicated, %for.body
+ %iv.next = add nuw nsw i64 %iv, 2
+ %cmp = icmp slt i64 %iv.next, 1000
+ br i1 %cmp, label %for.body, label %exit
+
+exit: ; preds = %latch
+ ret void
+}
+
+; Final value used outside of loop does not prevent vectorization
+;
+; int sum = 0;
+; for(int i=0; i < 1000; i++) {
+; sum += src[i];
+; dst[42] = sum;
+; }
+; dst[43] = sum;
+; CHECK-LABEL: @reduc_store_inoutside
+; CHECK-NOT: vector.body
+define void @reduc_store_inoutside(i32* %dst, i32* readonly %src) {
+entry:
+ %gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
+ %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv
+ %0 = load i32, i32* %arrayidx, align 4
+ %sum.1 = add nsw i32 %0, %sum
+ store i32 %sum.1, i32* %gep.dst, align 4
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, 1000
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ %sum.lcssa = phi i32 [ %sum.1, %for.body ]
+ %gep.dst.1 = getelementptr inbounds i32, i32* %dst, i64 43
+ store i32 %sum.lcssa, i32* %gep.dst.1, align 4
+ ret void
+}
More information about the llvm-commits
mailing list