[llvm] b070629 - [LV] Increase max VF if vectorized function variants exist (#66639)

Mon Nov 13 02:27:14 PST 2023

Author: Graham Hunter
Date: 2023-11-13T10:27:10Z
New Revision: b070629c10431fef4923fdb158a46a6c74dc431a

URL: https://github.com/llvm/llvm-project/commit/b070629c10431fef4923fdb158a46a6c74dc431a
DIFF: https://github.com/llvm/llvm-project/commit/b070629c10431fef4923fdb158a46a6c74dc431a.diff

LOG: [LV] Increase max VF if vectorized function variants exist (#66639)

If there are function calls in the candidate loop and we have vectorized
variants available, try some wider VFs in case the conservative initial
maximum based on the widest types in the loop won't actually allow us
to make use of those function variants.

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll

Modified: 
    llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
    llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 20cfc680e8f90b3..a509ebf6a7e1b38 100644

--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -380,6 +380,10 @@ class LoopVectorizationLegality {
     return MaskedOp.contains(I);
   }
 
+  /// Returns true if there is at least one function call in the loop which
+  /// has a vectorized variant available.
+  bool hasVectorCallVariants() const { return VecCallVariantsFound; }
+
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
@@ -538,6 +542,12 @@ class LoopVectorizationLegality {
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
+
+  /// If we discover function calls within the loop which have a valid
+  /// vectorized variant, record that fact so that LoopVectorize can
+  /// (potentially) make a better decision on the maximum VF and enable
+  /// the use of those function variants.
+  bool VecCallVariantsFound = false;
 };
 
 } // namespace llvm

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ca3c8d524d1b53b..37a356c43e29a41 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -943,6 +943,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           }
       }
 
+      // If we found a vectorized variant of a function, note that so LV can
+      // make better decisions about maximum VF.
+      if (CI && !VFDatabase::getMappings(*CI).empty())
+        VecCallVariantsFound = true;
+
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(I.getType()) &&

diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 65f0662320de8a5..e9d0315d114f65c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -391,6 +391,11 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
     cl::desc(
         "Override cost based safe divisor widening for div/rem instructions"));
 
+static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
+    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
+    cl::Hidden,
+    cl::desc("Try wider VFs if they enable the use of vector variants"));
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type.
@@ -5095,8 +5100,10 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
-  if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
-                            TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+  if (MaximizeBandwidth ||
+      (MaximizeBandwidth.getNumOccurrences() == 0 &&
+       (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
new file mode 100644
index 000000000000000..00e8881426fd8f9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
+; WIDE-LABEL: @test_widen(
+; WIDE-NEXT:  entry:
+; WIDE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; WIDE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
+; WIDE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; WIDE:       vector.ph:
+; WIDE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; WIDE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
+; WIDE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; WIDE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; WIDE:       vector.body:
+; WIDE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; WIDE-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP4]], align 8
+; WIDE-NEXT:    [[TMP5:%.*]] = fptrunc <vscale x 4 x double> [[WIDE_LOAD]] to <vscale x 4 x float>
+; WIDE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x float> @foo_vector(<vscale x 4 x float> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; WIDE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    store <vscale x 4 x float> [[TMP6]], ptr [[TMP7]], align 4
+; WIDE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; WIDE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; WIDE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; WIDE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; WIDE:       middle.block:
+; WIDE-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; WIDE:       scalar.ph:
+; WIDE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; WIDE-NEXT:    br label [[FOR_BODY:%.*]]
+; WIDE:       for.body:
+; WIDE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; WIDE-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
+; WIDE-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
+; WIDE-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR2:[0-9]+]]
+; WIDE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
+; WIDE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; WIDE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; WIDE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; WIDE:       for.cond.cleanup:
+; WIDE-NEXT:    ret void
+;
+; NARROW-LABEL: @test_widen(
+; NARROW-NEXT:  entry:
+; NARROW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NARROW:       vector.ph:
+; NARROW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NARROW:       vector.body:
+; NARROW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NARROW-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; NARROW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; NARROW-NEXT:    [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
+; NARROW-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NARROW-NEXT:    [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
+; NARROW-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
+; NARROW-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
+; NARROW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; NARROW-NEXT:    store <2 x float> [[TMP7]], ptr [[TMP8]], align 4
+; NARROW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; NARROW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; NARROW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NARROW:       middle.block:
+; NARROW-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NARROW:       scalar.ph:
+; NARROW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NARROW-NEXT:    br label [[FOR_BODY:%.*]]
+; NARROW:       for.body:
+; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
+; NARROW-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
+; NARROW-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]]
+; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
+; NARROW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NARROW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NARROW:       for.cond.cleanup:
+; NARROW-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gep = getelementptr double, ptr %b, i64 %indvars.iv
+  %load = load double, ptr %gep
+  %trunc = fptrunc double %load to float
+  %call = call float @foo(float %trunc) #0
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  store float %call, ptr %arrayidx
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1025
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare float @foo(float)
+declare <vscale x 4 x float> @foo_vector(<vscale x 4 x float>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
+attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }