[llvm] b65267c - [LV] Invalidate widening decisions after maximizing vector bandwidth

Thu Mar 31 01:19:35 PDT 2022

Author: David Green
Date: 2022-03-31T09:19:31+01:00
New Revision: b65267ca7bd160e4eb4853650c158058ec7a3ccb

URL: https://github.com/llvm/llvm-project/commit/b65267ca7bd160e4eb4853650c158058ec7a3ccb
DIFF: https://github.com/llvm/llvm-project/commit/b65267ca7bd160e4eb4853650c158058ec7a3ccb.diff

LOG: [LV] Invalidate widening decisions after maximizing vector bandwidth

When MaximizeVectorBandwidth is enabled, we can end up (via calls to
collectUniformsAndScalars/setCostBasedWideningDecision through
calculateRegisterUsage) making widening decisions before we have decided
whether to fold the tail by masking. These decisions will be wrong if we
later decide to fold the tail, for example when the trip count is very
low. The cost model will then use incorrect costs for loads that should
get masked, using standard memory operation costs instead.

At the moment this still uses the EmulatedMaskMemRefHack costs (somewhat
unfortunately), but without this change the old costs were 1, leading to
overly optimistic vectorization.

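This patch also slightly changes the way that the MaximizeVectorBandwidth
option works to make it easier to test: the option is now always honoured
if it is explicitly set, and the target's preference is only consulted
when the option is not given on the command line.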

Differential Revision: https://reviews.llvm.org/D120215

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 911d36e5c028e..772b276df124a 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5222,8 +5222,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   }
 
   ElementCount MaxVF = MaxVectorElementCount;
-  if (TTI.shouldMaximizeVectorBandwidth() ||
-      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+  if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
+                            TTI.shouldMaximizeVectorBandwidth())) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
         ComputeScalableMaxVF);
@@ -5261,6 +5261,11 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
         MaxVF = MinVF;
       }
     }
+
+    // Invalidate any widening decisions we might have made, in case the loop
+    // requires predication (decided later), but we have already made some
+    // load/store widening decisions.
+    invalidateCostModelingDecisions();
   }
   return MaxVF;
 }

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
new file mode 100644
index 0000000000000..6f14dbd033fda
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll
@@ -0,0 +1,129 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -vectorizer-maximize-bandwidth -S -debug-only=loop-vectorize 2>&1 -disable-output | FileCheck %s --check-prefix=COST
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-eabi"
+
+; Check that the maximize vector bandwidth option does not give incorrect costs
+; due to invalid cost decisions. The loop below has a low maximum trip count,
+; so will be masked.
+
+; COST: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %0 = load
+; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %0 = load
+; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %0 = load
+; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %0 = load
+; COST: LV: Selecting VF: 1.
+
+define i32 @test(i8* nocapture noundef readonly %pInVec, i8* nocapture noundef readonly %pInA1, i8* nocapture noundef readonly %pInA2, i8* nocapture noundef readonly %pInA3, i8* nocapture noundef readonly %pInA4, i32 noundef %numCols) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[NUMCOLS:%.*]], 3
+; CHECK-NEXT:    [[CMP_NOT32:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT32]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
+; CHECK:       while.body.preheader:
+; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
+; CHECK:       while.body:
+; CHECK-NEXT:    [[PINVEC_ADDR_042:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PINVEC:%.*]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM4_041:%.*]] = phi i32 [ [[ADD14:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM3_040:%.*]] = phi i32 [ [[ADD10:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM2_039:%.*]] = phi i32 [ [[ADD6:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM1_038:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[COLCNT_037:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[AND]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[PINA1_ADDR_036:%.*]] = phi i8* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PINA1:%.*]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[PINA4_ADDR_035:%.*]] = phi i8* [ [[INCDEC_PTR11:%.*]], [[WHILE_BODY]] ], [ [[PINA4:%.*]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[PINA3_ADDR_034:%.*]] = phi i8* [ [[INCDEC_PTR7:%.*]], [[WHILE_BODY]] ], [ [[PINA3:%.*]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[PINA2_ADDR_033:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[PINA2:%.*]], [[WHILE_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PINVEC_ADDR_042]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[PINVEC_ADDR_042]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[TMP0]] to i32
+; CHECK-NEXT:    [[INCDEC_PTR1]] = getelementptr inbounds i8, i8* [[PINA1_ADDR_036]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[PINA1_ADDR_036]], align 1
+; CHECK-NEXT:    [[CONV2:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[SUM1_038]]
+; CHECK-NEXT:    [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PINA2_ADDR_033]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i8, i8* [[PINA2_ADDR_033]], align 1
+; CHECK-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP2]] to i32
+; CHECK-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[CONV4]], [[CONV]]
+; CHECK-NEXT:    [[ADD6]] = add nsw i32 [[MUL5]], [[SUM2_039]]
+; CHECK-NEXT:    [[INCDEC_PTR7]] = getelementptr inbounds i8, i8* [[PINA3_ADDR_034]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[PINA3_ADDR_034]], align 1
+; CHECK-NEXT:    [[CONV8:%.*]] = sext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[CONV8]], [[CONV]]
+; CHECK-NEXT:    [[ADD10]] = add nsw i32 [[MUL9]], [[SUM3_040]]
+; CHECK-NEXT:    [[INCDEC_PTR11]] = getelementptr inbounds i8, i8* [[PINA4_ADDR_035]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[PINA4_ADDR_035]], align 1
+; CHECK-NEXT:    [[CONV12:%.*]] = sext i8 [[TMP4]] to i32
+; CHECK-NEXT:    [[MUL13:%.*]] = mul nsw i32 [[CONV12]], [[CONV]]
+; CHECK-NEXT:    [[ADD14]] = add nsw i32 [[MUL13]], [[SUM4_041]]
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[COLCNT_037]], -1
+; CHECK-NEXT:    [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_NOT]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
+; CHECK:       while.end.loopexit:
+; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[ADD6_LCSSA:%.*]] = phi i32 [ [[ADD6]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[ADD10_LCSSA:%.*]] = phi i32 [ [[ADD10]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[ADD14_LCSSA:%.*]] = phi i32 [ [[ADD14]], [[WHILE_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[ADD6_LCSSA]], [[ADD_LCSSA]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i32 [[TMP5]], [[ADD10_LCSSA]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i32 [[TMP6]], [[ADD14_LCSSA]]
+; CHECK-NEXT:    br label [[WHILE_END]]
+; CHECK:       while.end:
+; CHECK-NEXT:    [[ADD17:%.*]] = phi i32 [ [[TMP7]], [[WHILE_END_LOOPEXIT]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[ADD17]]
+;
+entry:
+  %and = and i32 %numCols, 3
+  %cmp.not32 = icmp eq i32 %and, 0
+  br i1 %cmp.not32, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %pInVec.addr.042 = phi i8* [ %incdec.ptr, %while.body ], [ %pInVec, %entry ]
+  %sum4.041 = phi i32 [ %add14, %while.body ], [ 0, %entry ]
+  %sum3.040 = phi i32 [ %add10, %while.body ], [ 0, %entry ]
+  %sum2.039 = phi i32 [ %add6, %while.body ], [ 0, %entry ]
+  %sum1.038 = phi i32 [ %add, %while.body ], [ 0, %entry ]
+  %colCnt.037 = phi i32 [ %dec, %while.body ], [ %and, %entry ]
+  %pInA1.addr.036 = phi i8* [ %incdec.ptr1, %while.body ], [ %pInA1, %entry ]
+  %pInA4.addr.035 = phi i8* [ %incdec.ptr11, %while.body ], [ %pInA4, %entry ]
+  %pInA3.addr.034 = phi i8* [ %incdec.ptr7, %while.body ], [ %pInA3, %entry ]
+  %pInA2.addr.033 = phi i8* [ %incdec.ptr3, %while.body ], [ %pInA2, %entry ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %pInVec.addr.042, i64 1
+  %0 = load i8, i8* %pInVec.addr.042, align 1
+  %conv = sext i8 %0 to i32
+  %incdec.ptr1 = getelementptr inbounds i8, i8* %pInA1.addr.036, i64 1
+  %1 = load i8, i8* %pInA1.addr.036, align 1
+  %conv2 = sext i8 %1 to i32
+  %mul = mul nsw i32 %conv2, %conv
+  %add = add nsw i32 %mul, %sum1.038
+  %incdec.ptr3 = getelementptr inbounds i8, i8* %pInA2.addr.033, i64 1
+  %2 = load i8, i8* %pInA2.addr.033, align 1
+  %conv4 = sext i8 %2 to i32
+  %mul5 = mul nsw i32 %conv4, %conv
+  %add6 = add nsw i32 %mul5, %sum2.039
+  %incdec.ptr7 = getelementptr inbounds i8, i8* %pInA3.addr.034, i64 1
+  %3 = load i8, i8* %pInA3.addr.034, align 1
+  %conv8 = sext i8 %3 to i32
+  %mul9 = mul nsw i32 %conv8, %conv
+  %add10 = add nsw i32 %mul9, %sum3.040
+  %incdec.ptr11 = getelementptr inbounds i8, i8* %pInA4.addr.035, i64 1
+  %4 = load i8, i8* %pInA4.addr.035, align 1
+  %conv12 = sext i8 %4 to i32
+  %mul13 = mul nsw i32 %conv12, %conv
+  %add14 = add nsw i32 %mul13, %sum4.041
+  %dec = add nsw i32 %colCnt.037, -1
+  %cmp.not = icmp eq i32 %dec, 0
+  br i1 %cmp.not, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:                               ; preds = %while.body
+  %5 = add nsw i32 %add6, %add
+  %6 = add nsw i32 %5, %add10
+  %7 = add nsw i32 %6, %add14
+  br label %while.end
+
+while.end:                                        ; preds = %while.end.loopexit, %entry
+  %add17 = phi i32 [ %7, %while.end.loopexit ], [ 0, %entry ]
+  ret i32 %add17
+}