[llvm] 02ad623 - [RISCV] Prefer strided store for interleave store with one lane active (#119027)

via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 6 16:45:05 PST 2024


Author: Philip Reames
Date: 2024-12-06T16:45:01-08:00
New Revision: 02ad623bb560afa1a789b49f715c9a0e48ea9b16

URL: https://github.com/llvm/llvm-project/commit/02ad623bb560afa1a789b49f715c9a0e48ea9b16
DIFF: https://github.com/llvm/llvm-project/commit/02ad623bb560afa1a789b49f715c9a0e48ea9b16.diff

LOG: [RISCV] Prefer strided store for interleave store with one lane active (#119027)

If we're performing a segment store and all but one of the segments are
undefined, that's equivalent to performing a strided store of the one
active segment.

This is the store-side counterpart of a905203b, which handled the load
case. As before, this only covers fixed vectors.
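
To illustrate the transform (a minimal sketch, not copied from the
patch; rv64 shown, where the stride type is XLen), a factor-4
interleaved store with only lane 0 active:

  %v0 = shufflevector <4 x i32> %v, <4 x i32> poison,
          <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                      i32 1, i32 undef, i32 undef, i32 undef,
                      i32 2, i32 undef, i32 undef, i32 undef,
                      i32 3, i32 undef, i32 undef, i32 undef>
  store <16 x i32> %v0, ptr %ptr

can instead be emitted as a single strided store:

  call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
          <4 x i32> %v, ptr %ptr, i64 16,
          <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)

where the 16-byte stride is the factor (4) times the element size
(4 bytes).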

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 83308682835394..5cd3a3f1c32e66 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21906,6 +21906,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                 ShuffleVectorInst *SVI,
                                                 unsigned Factor) const {
   IRBuilder<> Builder(SI);
+  auto Mask = SVI->getShuffleMask();
   auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
   // Given SVI : <n*factor x ty>, then VTy : <n x ty>
   auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
@@ -21917,11 +21918,35 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
 
   auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
 
+  unsigned Index;
+  // If the segment store only has one active lane (i.e. the interleave is
+  // just a spread shuffle), we can use a strided store instead.  This will
+  // be equally fast, and create less vector register pressure.
+  if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
+      isSpreadMask(Mask, Factor, Index)) {
+    unsigned ScalarSizeInBytes = ShuffleVTy->getScalarSizeInBits() / 8;
+    Value *Data = SVI->getOperand(0);
+    auto *DataVTy = cast<FixedVectorType>(Data->getType());
+    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+    Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
+    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
+    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
+    Value *VL = Builder.getInt32(VTy->getNumElements());
+
+    CallInst *CI = Builder.CreateIntrinsic(
+        Intrinsic::experimental_vp_strided_store,
+        {Data->getType(), BasePtr->getType(), Stride->getType()},
+        {Data, BasePtr, Stride, Mask, VL});
+    CI->addParamAttr(
+        1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
+
+    return true;
+  }
+
   Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
       SI->getModule(), FixedVssegIntrIds[Factor - 2],
       {VTy, SI->getPointerOperandType(), XLenTy});
 
-  auto Mask = SVI->getShuffleMask();
   SmallVector<Value *, 10> Ops;
 
   for (unsigned i = 0; i < Factor; i++) {

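As a sanity check on the stride and offset arithmetic above (a worked
example, assuming Factor = 4 and i32 elements): ScalarSizeInBytes =
32 / 8 = 4, so Stride = 4 * 4 = 16 bytes, and an active lane at
Index = 1 offsets the base pointer by 1 * 4 = 4 bytes. Those are
exactly the "li a1, 16" and "addi a0, a0, 4" values in the updated
tests below.
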
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 5649ee20a47092..8833634be1a0ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1285,17 +1285,55 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
   ret void
 }
 
-; TODO: This should be a strided store
-define void @store_factor4_one_active_storeback(ptr %ptr, <4 x i32> %v) {
-; CHECK-LABEL: store_factor4_one_active_storeback:
+define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vmv.v.v v10, v9
-; CHECK-NEXT:    vmv.v.v v11, v9
-; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3,  i32 undef, i32 undef, i32 undef>
   store <16 x i32> %v0, ptr %ptr
   ret void
 }
+
+define void @store_factor4_one_active_idx1(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_idx1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, 4
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3,  i32 undef, i32 undef>
+  store <16 x i32> %v0, ptr %ptr
+  ret void
+}
+
+define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_fullwidth:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3,  i32 undef, i32 undef, i32 undef>
+  store <16 x i32> %v0, ptr %ptr
+  ret void
+}
+
+; TODO: This could be a vslidedown followed by a strided store
+define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_slidedown:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-NEXT:    vmv.v.v v11, v10
+; CHECK-NEXT:    vmv.v.v v12, v10
+; CHECK-NEXT:    vsseg4e32.v v9, (a0)
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 4,  i32 undef, i32 undef, i32 undef>
+  store <16 x i32> %v0, ptr %ptr
+  ret void
+}
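
For the slidedown TODO above: the envisioned lowering would shift the
active elements down by one and then reuse the strided-store path.
Roughly, at the IR level (a sketch under the same rv64 assumptions as
above, not what the patch currently emits):

  %tail = shufflevector <4 x i32> %v, <4 x i32> poison,
          <4 x i32> <i32 1, i32 2, i32 3, i32 undef>
  call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
          <4 x i32> %tail, ptr %ptr, i64 16,
          <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)

where the <1,2,3,undef> shuffle is what would become the vslidedown.vi.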