[llvm] 5a4de84 - [LoopVectorize] Fix crash for predicated instruction with scalable VF

Thu Jul 22 04:50:37 PDT 2021

Author: Caroline Concatto
Date: 2021-07-22T12:48:27+01:00
New Revision: 5a4de84d55faa5502de38b4f7ec6c6ed43d90043

URL: https://github.com/llvm/llvm-project/commit/5a4de84d55faa5502de38b4f7ec6c6ed43d90043
DIFF: https://github.com/llvm/llvm-project/commit/5a4de84d55faa5502de38b4f7ec6c6ed43d90043.diff

LOG: [LoopVectorize] Fix crash for predicated instruction with scalable VF

This patch avoids computing discounts for predicated instructions  when the
VF is scalable.
There is no support for vectorization of loops with division because the
vectorizer cannot guarantee that zero divisions will not happen.

This loop now does not use VF scalable

```
for (long long i = 0; i < n; i++)
    if (cond[i])
      a[i] /= b[i];
```

Differential Revision: https://reviews.llvm.org/D101916

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 001f1e45e488a..9e645cd696d0c 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6745,9 +6745,11 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
     for (Instruction &I : *BB)
       if (isScalarWithPredication(&I)) {
         ScalarCostsTy ScalarCosts;
+        // Do not apply discount if scalable, because that would lead to
+        // invalid scalarization costs.
         // Do not apply discount logic if hacked cost is needed
         // for emulated masked memrefs.
-        if (!useEmulatedMaskMemRefHack(&I) &&
+        if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
         // Remember that BB will remain after vectorization.
@@ -7547,6 +7549,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
       ScalarPredicatedBB = true;
 
     if (ScalarPredicatedBB) {
+      // Not possible to scalarize scalable vector with predicated instructions.
+      if (VF.isScalable())
+        return InstructionCost::getInvalid();
       // Return cost for branches around scalarized and predicated blocks.
       auto *Vec_i1Ty =
           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
new file mode 100644
index 0000000000000..16d7393c15063
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll
@@ -0,0 +1,95 @@
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; The test predication_in_loop corresponds
+; to the following function
+;   for (long long i = 0; i < 1024; i++) {
+;    if (cond[i])
+;      a[i] /= b[i];
+;  }
+
+; Scalarizing the division cannot be done for scalable vectors at the moment
+; when the loop needs predication
+; Future implementation of llvm.vp could allow this to happen
+
+define void  @predication_in_loop(i32* %a, i32* %b, i32* %cond) #0 {
+; CHECK-LABEL: @predication_in_loop
+; CHECK-NOT:  sdiv <vscale x 4 x i32>
+;
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.inc, %entry
+  ret void
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.09 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %cond, i64 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %tobool.not = icmp eq i32 %0, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %i.09
+  %2 = load i32, i32* %arrayidx2, align 4
+  %div = sdiv i32 %2, %1
+  store i32 %div, i32* %arrayidx2, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.09, 1
+  %exitcond.not = icmp eq i64 %inc, 1024
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+}
+
+
+;
+; This test unpredicated_loop_predication_through_tailfolding corresponds
+; to the following function
+;   for (long long i = 0; i < 1024; i++) {
+;      a[i] /= b[i];
+;  }
+
+; Scalarization not possible in the main loop when there is no predication and
+; epilogue should not be able to allow scalarization
+; otherwise it could  be able to vectorize, but will not because
+; "Max legal vector width too small, scalable vectorization unfeasible.."
+
+define void @unpredicated_loop_predication_through_tailfolding(i32* %a, i32* %b) #0 {
+; CHECK-LABEL: @unpredicated_loop_predication_through_tailfolding
+; CHECK-NOT:  sdiv <vscale x 4 x i32>
+
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %sdiv = sdiv i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 8
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %sdiv, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!3 = !{!"llvm.loop.interleave.count", i32 1}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}