[llvm] [LV] Increase max VF if vectorized function variants exist (PR #66639)

via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 18 05:40:21 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-vectorizers

<details>
<summary>Changes</summary>

If there are function calls in the candidate loop and we have vectorized
variants available, try some wider VFs in case the conservative initial
maximum based on the widest types in the loop won't actually allow us
to make use of those function variants.

---
Full diff: https://github.com/llvm/llvm-project/pull/66639.diff


4 Files Affected:

- (modified) llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h (+10) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp (+11) 
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+2-1) 
- (added) llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll (+92) 


``````````diff
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 20cfc680e8f90b3..a97cc014a3dae45 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -380,6 +380,10 @@ class LoopVectorizationLegality {
     return MaskedOp.contains(I);
   }
 
+  /// Returns true if there is at least one function call in the loop which
+  /// has a vectorized variant available.
+  bool hasVectorVariants() const { return VecVariantsFound; }
+
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
@@ -538,6 +542,12 @@ class LoopVectorizationLegality {
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
+
+  /// If we discover function calls within the loop which have a valid
+  /// vectorized variant, record that fact so that LoopVectorize can
+  /// (potentially) make a better decision on the maximum VF and enable
+  /// the use of those function variants.
+  bool VecVariantsFound = false;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 35d69df56dc7220..d8d44b4cf96b601 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,6 +78,11 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
                 "Scalable vectorization is available and favored when the "
                 "cost is inconclusive.")));
 
+static cl::opt<bool> UseWiderVFIfVariantsPresent(
+    "vectorizer-maximize-bandwidth-if-variant-present", cl::init(true),
+    cl::Hidden,
+    cl::desc("Try wider VFs if they enable the use of vector variants"));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -943,6 +948,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           }
       }
 
+      // If we found a vectorized variant of a function, note that so LV can
+      // make better decisions about maximum VF.
+      if (CI && !VFDatabase::getMappings(*CI).empty() &&
+          UseWiderVFIfVariantsPresent)
+        VecVariantsFound = true;
+
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(I.getType()) &&
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a203c4794eac943..e8842fcf56da49a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5152,7 +5152,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
                            : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
-                            TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+                            (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+                             Legal->hasVectorVariants()))) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
new file mode 100644
index 000000000000000..1d40a7c5fbe96b9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-if-variant-present=false -S | FileCheck %s --check-prefixes=NARROW
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
+; WIDE-LABEL: @test_widen(
+; WIDE-NEXT:  entry:
+; WIDE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; WIDE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
+; WIDE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; WIDE:       vector.ph:
+; WIDE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; WIDE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
+; WIDE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; WIDE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; WIDE:       vector.body:
+; WIDE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; WIDE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x ptr>, ptr [[TMP4]], align 8
+; WIDE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; WIDE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; WIDE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    store <vscale x 4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; WIDE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; WIDE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; WIDE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; WIDE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; WIDE:       middle.block:
+; WIDE-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; WIDE:       scalar.ph:
+; WIDE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; WIDE-NEXT:    br label [[FOR_BODY:%.*]]
+; WIDE:       for.body:
+; WIDE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; WIDE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
+; WIDE-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
+; WIDE-NEXT:    [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR3:[0-9]+]]
+; WIDE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; WIDE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; WIDE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; WIDE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; WIDE:       for.cond.cleanup:
+; WIDE-NEXT:    ret void
+;
+; NARROW-LABEL: @test_widen(
+; NARROW-NEXT:  entry:
+; NARROW-NEXT:    br label [[FOR_BODY:%.*]]
+; NARROW:       for.body:
+; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
+; NARROW-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
+; NARROW-NEXT:    [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; NARROW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NARROW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; NARROW:       for.cond.cleanup:
+; NARROW-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gep = getelementptr i64, ptr %b, i64 %indvars.iv
+  %load = load ptr, ptr %gep
+  %load2 = load i32, ptr %load
+  %call = call i32 @foo(i32 %load2) #0
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %call, ptr %arrayidx
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1025
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i32 @foo(i32)
+declare <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
+attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }

``````````

</details>


https://github.com/llvm/llvm-project/pull/66639


More information about the llvm-commits mailing list