[llvm] b446ec5 - [LV] Make sure the MaxVF is a power-of-2 by rounding down.

Tue Jun 2 02:41:22 PDT 2020

Author: Florian Hahn
Date: 2020-06-02T10:40:49+01:00
New Revision: b446ec56a2987ba02dc2c80d42100be4a4689215

URL: https://github.com/llvm/llvm-project/commit/b446ec56a2987ba02dc2c80d42100be4a4689215
DIFF: https://github.com/llvm/llvm-project/commit/b446ec56a2987ba02dc2c80d42100be4a4689215.diff

LOG: [LV] Make sure the MaxVF is a power-of-2 by rounding down.

LV currently only supports power of 2 vectorization factors, which has
been made explicit with the assertion added in
840450549c9199150cbdee29acef756c19660ca1.

However, if the widest type is not a power-of-2, the computed MaxVF may
not be a power-of-2 either. This patch updates computeFeasibleMaxVF to
ensure the returned value is a power-of-2 by rounding down to the
nearest power-of-2.

Fixes PR46139.

Reviewers: Ayal, gilr, rengolin

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D80870

Added: 
    llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5e5f029578f0..4f40d8d529f3 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1323,8 +1323,9 @@ class LoopVectorizationCostModel {
 private:
   unsigned NumPredStores = 0;
 
-  /// \return An upper bound for the vectorization factor, larger than zero.
-  /// One is returned if vectorization should best be avoided due to cost.
+  /// \return An upper bound for the vectorization factor, a power-of-2 larger
+  /// than zero. One is returned if vectorization should best be avoided due
+  /// to cost.
   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
 
   /// The vectorization cost is a combination of the cost itself and a boolean
@@ -5058,9 +5059,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
 
   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
-  WidestRegister = PowerOf2Floor(WidestRegister);
-
-  unsigned MaxVectorSize = WidestRegister / WidestType;
+  // Note that neither WidestRegister nor WidestType need be a power of 2.
+  unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
 
   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                     << " / " << WidestType << " bits.\n");

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
new file mode 100644
index 000000000000..2e12e31342b5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/fp80-widest-type.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -S %s -mattr=+avx512f | FileCheck %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.15.0"
+
+; Make sure non-power-of-2 types are handled correctly, i.e., MaxVF is still a power-of-2.
+
+; Test case from PR46139.
+
+define x86_fp80 @test() {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  foo.exit:
+; CHECK-NEXT:    br label [[FOR_BODY3_I_3:%.*]]
+; CHECK:       for.body3.i.3:
+; CHECK-NEXT:    [[N_ADDR_112_I_3:%.*]] = phi i64 [ [[DEC_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ 24, [[FOO_EXIT:%.*]] ]
+; CHECK-NEXT:    [[X_ADDR_111_I_3:%.*]] = phi x86_fp80 [ [[MUL_I_3:%.*]], [[FOR_BODY3_I_3]] ], [ undef, [[FOO_EXIT]] ]
+; CHECK-NEXT:    [[MUL_I_3]] = fmul x86_fp80 [[X_ADDR_111_I_3]], 0xK40008000000000000000
+; CHECK-NEXT:    [[DEC_I_3]] = add nsw i64 [[N_ADDR_112_I_3]], -1
+; CHECK-NEXT:    [[CMP2_I_3:%.*]] = icmp sgt i64 [[N_ADDR_112_I_3]], 1
+; CHECK-NEXT:    br i1 [[CMP2_I_3]], label [[FOR_BODY3_I_3]], label [[FOO_EXIT_3:%.*]]
+; CHECK:       foo.exit.3:
+; CHECK-NEXT:    [[MUL_LCSSA:%.*]] = phi x86_fp80 [ [[MUL_I_3]], [[FOR_BODY3_I_3]] ]
+; CHECK-NEXT:    ret x86_fp80 [[MUL_LCSSA]]
+;
+foo.exit:
+  br label %for.body3.i.3
+
+for.body3.i.3:                                    ; preds = %for.body3.i.3, %foo.exit
+  %n.addr.112.i.3 = phi i64 [ %dec.i.3, %for.body3.i.3 ], [ 24, %foo.exit ]
+  %x.addr.111.i.3 = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ], [ undef, %foo.exit ]
+  %mul.i.3 = fmul x86_fp80 %x.addr.111.i.3, 0xK40008000000000000000
+  %dec.i.3 = add nsw i64 %n.addr.112.i.3, -1
+  %cmp2.i.3 = icmp sgt i64 %n.addr.112.i.3, 1
+  br i1 %cmp2.i.3, label %for.body3.i.3, label %foo.exit.3
+
+foo.exit.3:                                       ; preds = %for.body3.i.3
+  %mul.lcssa = phi x86_fp80 [ %mul.i.3, %for.body3.i.3 ]
+  ret x86_fp80 %mul.lcssa
+}