[llvm] [SLP] Prefer segmented/deinterleaved loads to strided and fix codegen (PR #135058)
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 9 10:45:56 PDT 2025
https://github.com/alexey-bataev created https://github.com/llvm/llvm-project/pull/135058
Need to estimate which one is preferable, deinterleaved/segmented
loads or strided ones. Segmented loads can be combined, improving
the overall performance.
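
To make the shape of the change concrete, here is the stride-2 case from the RISC-V test below (sum_of_abs_stride_2 in segmented-loads-simple.ll), copied from the updated CHECK lines rather than written fresh. The previous strided form:

  %0 = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 %a, i64 2, <8 x i1> splat (i1 true), i32 8)

becomes a wide vp.load over the whole segment (the mask selects every second lane, EVL = 2 * (8 - 1) + 1 = 15) followed by a shufflevector that deinterleaves the lanes:

  %wide.masked.load = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr align 1 %a, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, i32 15)
  %0 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>

As the FIXME in the patch notes, vp.load is emitted (rather than masked.load) because codegen currently recognizes only vp.load as a segmented (deinterleaved) load.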
From 1fabde4b05ad586b155c1444fa287d0391f71fa0 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 9 Apr 2025 17:45:47 +0000
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.5
---
.../Transforms/Vectorize/SLPVectorizer.cpp | 64 ++++++++++++-------
.../SLPVectorizer/RISCV/complex-loads.ll | 6 +-
.../RISCV/segmented-loads-simple.ll | 6 +-
.../X86/reorder-reused-masked-gather.ll | 2 +-
4 files changed, 49 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 87fc617eddedc..3ed4bd226b372 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5525,9 +5525,9 @@ static bool isMaskedLoadCompress(
// Check for very large distances between elements.
if (*Diff / Sz >= MaxRegSize / 8)
return false;
- Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
LoadVecTy = getWidenedType(ScalarTy, *Diff + 1);
auto *LI = cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()]);
+ Align CommonAlignment = LI->getAlign();
IsMasked = !isSafeToLoadUnconditionally(
Ptr0, LoadVecTy, CommonAlignment, DL,
cast<LoadInst>(Order.empty() ? VL.back() : VL[Order.back()]), &AC, &DT,
@@ -5566,19 +5566,20 @@ static bool isMaskedLoadCompress(
TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
LI->getPointerAddressSpace(), CostKind);
} else {
- CommonAlignment = LI->getAlign();
LoadCost =
TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment,
LI->getPointerAddressSpace(), CostKind);
}
if (IsStrided) {
// Check for potential segmented(interleaved) loads.
- if (TTI.isLegalInterleavedAccessType(LoadVecTy, CompressMask[1],
+ auto *AlignedLoadVecTy = getWidenedType(
+ ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
+ if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
CommonAlignment,
LI->getPointerAddressSpace())) {
InstructionCost InterleavedCost =
VectorGEPCost + TTI.getInterleavedMemoryOpCost(
- Instruction::Load, LoadVecTy, CompressMask[1],
+ Instruction::Load, AlignedLoadVecTy, CompressMask[1],
std::nullopt, CommonAlignment,
LI->getPointerAddressSpace(), CostKind, IsMasked);
if (!Mask.empty())
@@ -5586,6 +5587,7 @@ static bool isMaskedLoadCompress(
VecTy, Mask, CostKind);
if (InterleavedCost < GatherCost) {
InterleaveFactor = CompressMask[1];
+ LoadVecTy = AlignedLoadVecTy;
return true;
}
}
@@ -5739,6 +5741,18 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
// Check that the sorted loads are consecutive.
if (static_cast<unsigned>(*Diff) == Sz - 1)
return LoadsState::Vectorize;
+ bool IsMasked;
+ unsigned InterleaveFactor;
+ SmallVector<int> CompressMask;
+ VectorType *LoadVecTy;
+ if (isMaskedLoadCompress(
+ VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
+ [&](Value *V) {
+ return areAllUsersVectorized(cast<Instruction>(V),
+ UserIgnoreList);
+ },
+ IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
+ return LoadsState::CompressVectorize;
// Simple check if not a strided access - clear order.
bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
// Try to generate strided load node.
@@ -5752,18 +5766,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
isStridedLoad(VL, PointerOps, Order, *TTI, *DL, *SE,
IsAnyPointerUsedOutGraph, *Diff))
return LoadsState::StridedVectorize;
- bool IsMasked;
- unsigned InterleaveFactor;
- SmallVector<int> CompressMask;
- VectorType *LoadVecTy;
- if (isMaskedLoadCompress(
- VL, PointerOps, Order, *TTI, *DL, *SE, *AC, *DT, *TLI,
- [&](Value *V) {
- return areAllUsersVectorized(cast<Instruction>(V),
- UserIgnoreList);
- },
- IsMasked, InterleaveFactor, CompressMask, LoadVecTy))
- return LoadsState::CompressVectorize;
}
if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
@@ -17558,20 +17560,34 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
*TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
CompressMask, LoadVecTy);
assert(IsVectorized && "Expected to be vectorized");
- Align CommonAlignment;
- if (IsMasked)
- CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
- else
- CommonAlignment = LI->getAlign();
+ Align CommonAlignment = LI->getAlign();
if (IsMasked) {
+ unsigned VF = getNumElements(LoadVecTy);
SmallVector<Constant *> MaskValues(
- getNumElements(LoadVecTy) / getNumElements(LI->getType()),
+ VF / getNumElements(LI->getType()),
ConstantInt::getFalse(VecTy->getContext()));
for (int I : CompressMask)
MaskValues[I] = ConstantInt::getTrue(VecTy->getContext());
Constant *MaskValue = ConstantVector::get(MaskValues);
- NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
- MaskValue);
+ if (InterleaveFactor) {
+ // FIXME: codegen currently recognizes only vp.load, not
+ // masked.load, as segmented (deinterleaved) loads.
+ Value *Operands[] = {
+ PO, MaskValue,
+ Builder.getInt32(InterleaveFactor * (E->Scalars.size() - 1) +
+ 1)};
+ Type *Types[] = {LoadVecTy, Operands[0]->getType()};
+ CallInst *WideLoad =
+ Builder.CreateIntrinsic(Intrinsic::vp_load, Types, Operands,
+ nullptr, "wide.masked.load");
+ WideLoad->addParamAttr(
+ 0, Attribute::getWithAlignment(WideLoad->getContext(),
+ CommonAlignment));
+ NewLI = WideLoad;
+ } else {
+ NewLI = Builder.CreateMaskedLoad(LoadVecTy, PO, CommonAlignment,
+ MaskValue);
+ }
} else {
NewLI = Builder.CreateAlignedLoad(LoadVecTy, PO, CommonAlignment);
}
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 0f56862446a9d..a2ad021ee3f19 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -79,7 +79,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; CHECK-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
; CHECK-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr align 1 null, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, i32 5)
+; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <8 x i8> [[WIDE_MASKED_LOAD]], <8 x i8> poison, <2 x i32> <i32 0, i32 4>
; CHECK-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
; CHECK-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
; CHECK-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
@@ -210,7 +211,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP59:%.*]] = add <4 x i32> [[TMP57]], [[TMP58]]
; THR15-NEXT: [[TMP60:%.*]] = sub <4 x i32> [[TMP57]], [[TMP58]]
; THR15-NEXT: [[TMP61:%.*]] = shufflevector <4 x i32> [[TMP59]], <4 x i32> [[TMP60]], <4 x i32> <i32 0, i32 1, i32 6, i32 7>
-; THR15-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2)
+; THR15-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.vp.load.v8i8.p0(ptr align 1 null, <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, i32 5)
+; THR15-NEXT: [[TMP62:%.*]] = shufflevector <8 x i8> [[WIDE_MASKED_LOAD]], <8 x i8> poison, <2 x i32> <i32 0, i32 4>
; THR15-NEXT: [[TMP63:%.*]] = load <4 x i8>, ptr null, align 1
; THR15-NEXT: [[TMP64:%.*]] = zext <4 x i8> [[TMP63]] to <4 x i32>
; THR15-NEXT: [[TMP65:%.*]] = load <4 x i8>, ptr null, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll
index 0718cc25fd80c..c8d06ba1c8355 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads-simple.ll
@@ -5,7 +5,8 @@ define i32 @sum_of_abs_stride_2(ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: define i32 @sum_of_abs_stride_2
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 2, <8 x i1> splat (i1 true), i32 8)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.vp.load.v16i8.p0(ptr align 1 [[A]], <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, i32 15)
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_LOAD]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
@@ -57,7 +58,8 @@ define i32 @sum_of_abs_stride_3(ptr noalias %a, ptr noalias %b) {
; CHECK-LABEL: define i32 @sum_of_abs_stride_3
; CHECK-SAME: (ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr align 1 [[A]], i64 3, <8 x i1> splat (i1 true), i32 8)
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <32 x i8> @llvm.vp.load.v32i8.p0(ptr align 1 [[A]], <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, i32 22)
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <32 x i8> [[WIDE_MASKED_LOAD]], <32 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i8> @llvm.abs.v8i8(<8 x i8> [[TMP0]], i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
index 9369a5962e643..7bb436b9543bf 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll
@@ -4,7 +4,7 @@
define void @test(ptr noalias %0, ptr %p) {
; CHECK-LABEL: @test(
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 4, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison)
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 16, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <16 x float> poison)
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> <i32 15, i32 4, i32 5, i32 0, i32 2, i32 6, i32 7, i32 8>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 15, i32 4, i32 5, i32 15, i32 4, i32 5, i32 15, i32 0, i32 5, i32 2, i32 6, i32 7, i32 8, i32 6, i32 7, i32 8>
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float poison, float 0.000000e+00, float poison, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 4, i32 24, i32 15, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>