[llvm] [LV] Increase max VF if vectorized function variants exist (PR #66639)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 8 06:57:53 PST 2023


https://github.com/huntergr-arm updated https://github.com/llvm/llvm-project/pull/66639

From 090c3f31ff59f06a9ce0b7c17ff447615997ab92 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Mon, 18 Sep 2023 13:12:26 +0100
Subject: [PATCH 1/3] [LV] Increase max VF if vectorized function variants
 exist

If the candidate loop contains function calls for which vectorized variants
are available, try some wider VFs: the conservative initial maximum, based
on the widest types in the loop, may be too narrow to allow us to make use
of those function variants.
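
In essence, the legality analysis now records whether any call in the loop
has a mapping in the VFDatabase. A condensed sketch of the check (a
hypothetical standalone helper; the actual hunk below folds it into
canVectorizeInstrs):

  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/VectorUtils.h" // VFDatabase
  #include "llvm/IR/Instructions.h"

  // Returns true if any call in the loop has a vector variant registered,
  // e.g. via the "vector-function-abi-variant" attribute.
  static bool loopHasVectorCallVariants(const llvm::Loop &L) {
    for (llvm::BasicBlock *BB : L.blocks())
      for (llvm::Instruction &I : *BB)
        if (auto *CI = llvm::dyn_cast<llvm::CallInst>(&I))
          if (!llvm::VFDatabase::getMappings(*CI).empty())
            return true;
    return false;
  }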
---
 .../Vectorize/LoopVectorizationLegality.h     | 10 ++
 .../Vectorize/LoopVectorizationLegality.cpp   | 11 +++
 .../Transforms/Vectorize/LoopVectorize.cpp    |  3 +-
 .../AArch64/wider-VF-for-callinst.ll          | 92 +++++++++++++++++++
 4 files changed, 115 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 20cfc680e8f90b3..a97cc014a3dae45 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -380,6 +380,10 @@ class LoopVectorizationLegality {
     return MaskedOp.contains(I);
   }
 
+  /// Returns true if there is at least one function call in the loop which
+  /// has a vectorized variant available.
+  bool hasVectorVariants() const { return VecVariantsFound; }
+
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
 
@@ -538,6 +542,12 @@ class LoopVectorizationLegality {
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
+
+  /// If we discover function calls within the loop which have a valid
+  /// vectorized variant, record that fact so that LoopVectorize can
+  /// (potentially) make a better decision on the maximum VF and enable
+  /// the use of those function variants.
+  bool VecVariantsFound = false;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 35d69df56dc7220..d8d44b4cf96b601 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,6 +78,11 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
                 "Scalable vectorization is available and favored when the "
                 "cost is inconclusive.")));
 
+static cl::opt<bool> UseWiderVFIfVariantsPresent(
+    "vectorizer-maximize-bandwidth-if-variant-present", cl::init(true),
+    cl::Hidden,
+    cl::desc("Try wider VFs if they enable the use of vector variants"));
+
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -943,6 +948,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           }
       }
 
+      // If we found a vectorized variant of a function, note that so LV can
+      // make better decisions about maximum VF.
+      if (CI && !VFDatabase::getMappings(*CI).empty() &&
+          UseWiderVFIfVariantsPresent)
+        VecVariantsFound = true;
+
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
       if ((!VectorType::isValidElementType(I.getType()) &&
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index a203c4794eac943..e8842fcf56da49a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5152,7 +5152,8 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
                            : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
-                            TTI.shouldMaximizeVectorBandwidth(RegKind))) {
+                            (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+                             Legal->hasVectorVariants()))) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
new file mode 100644
index 000000000000000..1d40a7c5fbe96b9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-if-variant-present=false -S | FileCheck %s --check-prefixes=NARROW
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
+; WIDE-LABEL: @test_widen(
+; WIDE-NEXT:  entry:
+; WIDE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; WIDE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1025, [[TMP1]]
+; WIDE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; WIDE:       vector.ph:
+; WIDE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; WIDE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]]
+; WIDE-NEXT:    [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
+; WIDE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; WIDE:       vector.body:
+; WIDE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; WIDE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x ptr>, ptr [[TMP4]], align 8
+; WIDE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
+; WIDE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; WIDE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    store <vscale x 4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; WIDE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
+; WIDE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
+; WIDE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; WIDE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; WIDE:       middle.block:
+; WIDE-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; WIDE:       scalar.ph:
+; WIDE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; WIDE-NEXT:    br label [[FOR_BODY:%.*]]
+; WIDE:       for.body:
+; WIDE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; WIDE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
+; WIDE-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
+; WIDE-NEXT:    [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR3:[0-9]+]]
+; WIDE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; WIDE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; WIDE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; WIDE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; WIDE:       for.cond.cleanup:
+; WIDE-NEXT:    ret void
+;
+; NARROW-LABEL: @test_widen(
+; NARROW-NEXT:  entry:
+; NARROW-NEXT:    br label [[FOR_BODY:%.*]]
+; NARROW:       for.body:
+; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
+; NARROW-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
+; NARROW-NEXT:    [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; NARROW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; NARROW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
+; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; NARROW:       for.cond.cleanup:
+; NARROW-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %gep = getelementptr i64, ptr %b, i64 %indvars.iv
+  %load = load ptr, ptr %gep
+  %load2 = load i32, ptr %load
+  %call = call i32 @foo(i32 %load2) #0
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
+  store i32 %call, ptr %arrayidx
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1025
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret void
+}
+
+declare i32 @foo(i32)
+declare <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32>, <vscale x 4 x i1>)
+
+attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
+attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }
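
For reference, the variant mapping in attributes #0 decodes, per LLVM's
VFABI name-mangling rules, roughly as: "_ZGV" prefix, "_LLVM_" ISA token,
"M" masked, "x" scalable vector length, "v" one vector parameter, scalar
name "foo", vector implementation "foo_vector"; for the scalable "x" case
the VF comes from the declared signature of @foo_vector, i.e.
<vscale x 4 x i32>.

To make the VF numbers concrete, here is a back-of-the-envelope version of
the getMaximizedVFForTarget computation with values implied by this test
(illustrative constants only, not the compiler's API):

  #include "llvm/ADT/bit.h"

  // +sve implies a 128-bit known-minimum vector register. The loop's widest
  // type is 64 bits (the i64/ptr loads); its smallest is 32 bits (the call).
  constexpr unsigned WidestRegKnownMin = 128;
  constexpr unsigned WidestType = 64, SmallestType = 32;
  constexpr unsigned ConservativeMaxVF =
      WidestRegKnownMin / WidestType;                    // VF = vscale x 2
  constexpr unsigned MaxBandwidthVF =
      llvm::bit_floor(WidestRegKnownMin / SmallestType); // VF = vscale x 4
  // @foo_vector operates on <vscale x 4 x i32>, so only the wider VF can
  // replace the scalar call with the vector variant.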

From 782a78bc67564185e6f76337cce559141854052e Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Fri, 27 Oct 2023 08:55:47 +0100
Subject: [PATCH 2/3] Improve naming and test case

---
 .../Vectorize/LoopVectorizationLegality.h     |  4 +-
 .../Vectorize/LoopVectorizationLegality.cpp   | 10 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    | 12 ++-
 .../AArch64/wider-VF-for-callinst.ll          | 91 ++++++++++++-------
 4 files changed, 70 insertions(+), 47 deletions(-)
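
Note on the test change: the loop now loads double and truncates to float
before the call, so the widest type in the loop (64 bits) caps the
conservative maximum VF at vscale x 2 while @foo_vector requires
<vscale x 4 x float>; dropping the pointer-indirection gather also lets the
NARROW baseline still vectorize, at a fixed VF of 2 with the calls
scalarized.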

diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index a97cc014a3dae45..a509ebf6a7e1b38 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -382,7 +382,7 @@ class LoopVectorizationLegality {
 
   /// Returns true if there is at least one function call in the loop which
   /// has a vectorized variant available.
-  bool hasVectorVariants() const { return VecVariantsFound; }
+  bool hasVectorCallVariants() const { return VecCallVariantsFound; }
 
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
@@ -547,7 +547,7 @@ class LoopVectorizationLegality {
   /// vectorized variant, record that fact so that LoopVectorize can
   /// (potentially) make a better decision on the maximum VF and enable
   /// the use of those function variants.
-  bool VecVariantsFound = false;
+  bool VecCallVariantsFound = false;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index d8d44b4cf96b601..298d2c0bc610794 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -78,11 +78,6 @@ static cl::opt<LoopVectorizeHints::ScalableForceKind>
                 "Scalable vectorization is available and favored when the "
                 "cost is inconclusive.")));
 
-static cl::opt<bool> UseWiderVFIfVariantsPresent(
-    "vectorizer-maximize-bandwidth-if-variant-present", cl::init(true),
-    cl::Hidden,
-    cl::desc("Try wider VFs if they enable the use of vector variants"));
-
 /// Maximum vectorization interleave count.
 static const unsigned MaxInterleaveFactor = 16;
 
@@ -950,9 +945,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
       // If we found a vectorized variant of a function, note that so LV can
       // make better decisions about maximum VF.
-      if (CI && !VFDatabase::getMappings(*CI).empty() &&
-          UseWiderVFIfVariantsPresent)
-        VecVariantsFound = true;
+      if (CI && !VFDatabase::getMappings(*CI).empty())
+        VecCallVariantsFound = true;
 
       // Check that the instruction return type is vectorizable.
       // Also, we can't vectorize extractelement instructions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e8842fcf56da49a..fcb7ba9cc59c7c6 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -390,6 +390,11 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
     cl::desc(
         "Override cost based safe divisor widening for div/rem instructions"));
 
+static cl::opt<bool> UseWiderVFIfVariantsPresent(
+    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
+    cl::Hidden,
+    cl::desc("Try wider VFs if they enable the use of vector variants"));
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type.
@@ -5151,9 +5156,10 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector;
   ElementCount MaxVF = MaxVectorElementCount;
-  if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
-                            (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
-                             Legal->hasVectorVariants()))) {
+  if (MaximizeBandwidth ||
+      (MaximizeBandwidth.getNumOccurrences() == 0 &&
+       (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
+        (UseWiderVFIfVariantsPresent && Legal->hasVectorCallVariants())))) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
index 1d40a7c5fbe96b9..00e8881426fd8f9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE
-; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-if-variant-present=false -S | FileCheck %s --check-prefixes=NARROW
+; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW
 
 target triple = "aarch64-unknown-linux-gnu"
 
@@ -19,17 +19,17 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ; WIDE-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; WIDE:       vector.body:
 ; WIDE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; WIDE-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]]
-; WIDE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x ptr>, ptr [[TMP4]], align 8
-; WIDE-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[WIDE_LOAD]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; WIDE-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; WIDE-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; WIDE-NEXT:    store <vscale x 4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; WIDE-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; WIDE-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; WIDE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
-; WIDE-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; WIDE-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; WIDE-NEXT:    [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP4]], align 8
+; WIDE-NEXT:    [[TMP5:%.*]] = fptrunc <vscale x 4 x double> [[WIDE_LOAD]] to <vscale x 4 x float>
+; WIDE-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x float> @foo_vector(<vscale x 4 x float> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; WIDE-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; WIDE-NEXT:    store <vscale x 4 x float> [[TMP6]], ptr [[TMP7]], align 4
+; WIDE-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; WIDE-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; WIDE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; WIDE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; WIDE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; WIDE:       middle.block:
 ; WIDE-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; WIDE:       scalar.ph:
@@ -37,12 +37,12 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ; WIDE-NEXT:    br label [[FOR_BODY:%.*]]
 ; WIDE:       for.body:
 ; WIDE-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; WIDE-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]]
-; WIDE-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
-; WIDE-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
-; WIDE-NEXT:    [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR3:[0-9]+]]
-; WIDE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]]
-; WIDE-NEXT:    store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; WIDE-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
+; WIDE-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
+; WIDE-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR2:[0-9]+]]
+; WIDE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; WIDE-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
 ; WIDE-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; WIDE-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
 ; WIDE-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
@@ -51,18 +51,41 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 {
 ;
 ; NARROW-LABEL: @test_widen(
 ; NARROW-NEXT:  entry:
+; NARROW-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NARROW:       vector.ph:
+; NARROW-NEXT:    br label [[VECTOR_BODY:%.*]]
+; NARROW:       vector.body:
+; NARROW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NARROW-NEXT:    [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]]
+; NARROW-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
+; NARROW-NEXT:    [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float>
+; NARROW-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; NARROW-NEXT:    [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]]
+; NARROW-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; NARROW-NEXT:    [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]]
+; NARROW-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0
+; NARROW-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1
+; NARROW-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]]
+; NARROW-NEXT:    store <2 x float> [[TMP7]], ptr [[TMP8]], align 4
+; NARROW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; NARROW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; NARROW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NARROW:       middle.block:
+; NARROW-NEXT:    br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NARROW:       scalar.ph:
+; NARROW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; NARROW-NEXT:    br label [[FOR_BODY:%.*]]
 ; NARROW:       for.body:
-; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NARROW-NEXT:    [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]]
-; NARROW-NEXT:    [[LOAD:%.*]] = load ptr, ptr [[GEP]], align 8
-; NARROW-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[LOAD]], align 4
-; NARROW-NEXT:    [[CALL:%.*]] = call i32 @foo(i32 [[LOAD2]]) #[[ATTR1:[0-9]+]]
-; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]]
-; NARROW-NEXT:    store i32 [[CALL]], ptr [[ARRAYIDX]], align 4
+; NARROW-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NARROW-NEXT:    [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    [[LOAD:%.*]] = load double, ptr [[GEP]], align 8
+; NARROW-NEXT:    [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float
+; NARROW-NEXT:    [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]]
+; NARROW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]]
+; NARROW-NEXT:    store float [[CALL]], ptr [[ARRAYIDX]], align 4
 ; NARROW-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; NARROW-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025
-; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
+; NARROW-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; NARROW:       for.cond.cleanup:
 ; NARROW-NEXT:    ret void
 ;
@@ -71,12 +94,12 @@ entry:
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %gep = getelementptr i64, ptr %b, i64 %indvars.iv
-  %load = load ptr, ptr %gep
-  %load2 = load i32, ptr %load
-  %call = call i32 @foo(i32 %load2) #0
-  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
-  store i32 %call, ptr %arrayidx
+  %gep = getelementptr double, ptr %b, i64 %indvars.iv
+  %load = load double, ptr %gep
+  %trunc = fptrunc double %load to float
+  %call = call float @foo(float %trunc) #0
+  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
+  store float %call, ptr %arrayidx
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, 1025
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -85,8 +108,8 @@ for.cond.cleanup:
   ret void
 }
 
-declare i32 @foo(i32)
-declare <vscale x 4 x i32> @foo_vector(<vscale x 4 x i32>, <vscale x 4 x i1>)
+declare float @foo(float)
+declare <vscale x 4 x float> @foo_vector(<vscale x 4 x float>, <vscale x 4 x i1>)
 
 attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_Mxv_foo(foo_vector)" }
 attributes #1 = { "target-features"="+sve" vscale_range(1,16) "no-trapping-math"="false" }

From e07a77ba08bb658a56ae0436bc1f34e9220952c5 Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter@arm.com>
Date: Wed, 8 Nov 2023 14:48:31 +0000
Subject: [PATCH 3/3] Rename internal flag variable

---
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index fcb7ba9cc59c7c6..7f9fe6e6fc05664 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -390,7 +390,7 @@ static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
     cl::desc(
         "Override cost based safe divisor widening for div/rem instructions"));
 
-static cl::opt<bool> UseWiderVFIfVariantsPresent(
+static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
     "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true),
     cl::Hidden,
     cl::desc("Try wider VFs if they enable the use of vector variants"));
@@ -5159,7 +5159,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
   if (MaximizeBandwidth ||
       (MaximizeBandwidth.getNumOccurrences() == 0 &&
        (TTI.shouldMaximizeVectorBandwidth(RegKind) ||
-        (UseWiderVFIfVariantsPresent && Legal->hasVectorCallVariants())))) {
+        (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
     auto MaxVectorElementCountMaxBW = ElementCount::get(
         llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
         ComputeScalableMaxVF);
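
For comparing the two behaviors locally, the hidden flag introduced above
can be flipped on the opt command line, mirroring the test's RUN lines
(illustrative invocation; input.ll stands in for any module):

  opt -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S input.ll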


