[PATCH] D106646: [LoopVectorize] Don't interleave scalar ordered reductions for inner loops

Fri Jul 23 03:49:40 PDT 2021

david-arm created this revision.
david-arm added reviewers: sdesmalen, kmclaughlin, dmgreen, c-rhodes, peterwaller-arm.
Herald added subscribers: hiraditya, kristof.beyls.
david-arm requested review of this revision.
Herald added a project: LLVM.
Herald added a subscriber: llvm-commits.

Consider the following loop:

  void foo(float *dst, float *src, int N) {
    for (int i = 0; i < N; i++) {
      dst[i] = 0.0;
      for (int j = 0; j < N; j++) {
        dst[i] += src[(i * N) + j];
      }
    }
  }

When we are not building with -Ofast we may attempt to vectorise the
inner loop using ordered reductions instead. We also try to select an
appropriate interleave count for the inner loop. However, when VF=1 is
chosen the inner loop remains scalar, and existing code in
selectInterleaveCount limits the interleave count to 2 for reductions
because of concerns about increasing the critical path.
For ordered reductions this problem is even worse due to the additional
data dependency, so I've added code to simply disable interleaving
for scalar ordered reductions for now.

Test added here:

  Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D106646

Files:
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll


Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
===================================================================

--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
@@ -0,0 +1,55 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -enable-strict-reductions=true -force-vector-width=1 -S < %s -debug 2>%t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=CHECK-DEBUG
+
+; Check that the loop vectorizer does not interleave the scalar (VF=1) inner
+; loop of a loop nest when that loop contains an ordered (strict) fadd
+; reduction. The -debug output inspected here is only produced by builds with
+; assertions enabled, which is why the test is restricted to such builds.
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-DEBUG: LV: Not interleaving scalar ordered reductions.
+
+; The IR below zeroes M floats of dst with a memset and then, for each outer
+; index i, accumulates src[i * N + j] over the inner index j into dst[i].
+define void @foo(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %M, i64 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NOT: vector.body
+
+entry:
+  %dst27 = bitcast float* %dst to i8*
+  %0 = shl nuw i64 %M, 2
+  call void @llvm.memset.p0i8.i64(i8* align 4 %dst27, i8 0, i64 %0, i1 false)
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %entry, %for.cond3
+  %i.023.us = phi i64 [ %inc8.us, %for.cond3 ], [ 0, %entry ]
+  %arrayidx.us = getelementptr inbounds float, float* %dst, i64 %i.023.us
+  %mul.us = mul nsw i64 %i.023.us, %N
+  br label %for.body3.us
+
+for.body3.us:                                     ; preds = %for.body.us, %for.body3.us
+  %1 = phi float [ 0.000000e+00, %for.body.us ], [ %add6.us, %for.body3.us ]
+  %j.021.us = phi i64 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ]
+  %add.us = add nsw i64 %j.021.us, %mul.us
+  %arrayidx4.us = getelementptr inbounds float, float* %src, i64 %add.us
+  %2 = load float, float* %arrayidx4.us, align 4
+  %add6.us = fadd float %2, %1
+  %inc.us = add nuw nsw i64 %j.021.us, 1
+  %exitcond.not = icmp eq i64 %inc.us, %N
+  br i1 %exitcond.not, label %for.cond3, label %for.body3.us
+
+for.cond3:                                        ; preds = %for.body3.us
+  %add6.us.lcssa = phi float [ %add6.us, %for.body3.us ]
+  store float %add6.us.lcssa, float* %arrayidx.us, align 4
+  %inc8.us = add nuw nsw i64 %i.023.us, 1
+  %exitcond26.not = icmp eq i64 %inc8.us, %M
+  br i1 %exitcond26.not, label %exit, label %for.body.us
+
+exit:                                             ; preds = %for.cond3
+  ret void
+}
+
+; Function Attrs: argmemonly nofree nounwind willreturn writeonly
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6469,9 +6469,21 @@
 
     // If we have a scalar reduction (vector reductions are already dealt with
     // by this point), we can increase the critical path length if the loop
-    // we're interleaving is inside another loop. Limit, by default to 2, so the
-    // critical path only gets increased by one reduction operation.
+    // we're interleaving is inside another loop. For tree-wise reductions
+    // set the limit to 2, and for ordered reductions it's best to disable
+    // interleaving entirely.
     if (HasReductions && TheLoop->getLoopDepth() > 1) {
+      bool HasOrderedReductions =
+          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+            const RecurrenceDescriptor &RdxDesc = Reduction.second;
+            return RdxDesc.isOrdered();
+          });
+      if (HasOrderedReductions) {
+        LLVM_DEBUG(
+            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
+        return 1;
+      }
+
       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
       SmallIC = std::min(SmallIC, F);
       StoresIC = std::min(StoresIC, F);


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D106646.361151.patch
Type: text/x-patch
Size: 3665 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210723/28c7ecb7/attachment.bin>