[llvm] 73245b0 - [RISCV] Rewrite deinterleave load as vlse optimization as DAG combine (#150049)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 29 07:52:43 PDT 2025
Author: Philip Reames
Date: 2025-07-29T07:52:39-07:00
New Revision: 73245b06b3da19ef70e04cf0f0a0d0df1ba82a57
URL: https://github.com/llvm/llvm-project/commit/73245b06b3da19ef70e04cf0f0a0d0df1ba82a57
DIFF: https://github.com/llvm/llvm-project/commit/73245b06b3da19ef70e04cf0f0a0d0df1ba82a57.diff
LOG: [RISCV] Rewrite deinterleave load as vlse optimization as DAG combine (#150049)
This reworks an existing optimization on the fixed-vector (shuffle-based)
deinterleave lowering into a DAG combine. This has the effect of making it
kick in much more widely: in particular on the deinterleave intrinsic
(i.e. scalable) path and on deinterleaveN (without load) lowering, but also
on the intrinsic lowering paths.
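As a rough illustration of the kind of input the combine now catches, here is a
sketch adapted from the vector-deinterleave-load.ll test updated below (the
function and value names are illustrative, not taken from the patch). Only one
field of a factor-4 deinterleave of a loaded <vscale x 32 x i8> is used:

  ; Sketch: factor-4 deinterleave where only field 0 is consumed.
  define <vscale x 8 x i8> @deinterleave4_one_field_used(ptr %p) {
    %vec = load <vscale x 32 x i8>, ptr %p
    %d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
    %t0 = extractvalue { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } %d0, 0
    ret <vscale x 8 x i8> %t0
  }

On a target without optimized segment load/store support, this now selects a
single strided load (vlse8.v with a 4-byte stride) instead of vlseg4e8.v, as
the test diffs below show.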
Added:
Modified:
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
llvm/test/CodeGen/RISCV/rvv/pr141907.ll
llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 607edd3d859f8..43e4f8e469905 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20843,6 +20843,62 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case RISCVISD::TUPLE_EXTRACT: {
+ EVT VT = N->getValueType(0);
+ SDValue Tuple = N->getOperand(0);
+ unsigned Idx = N->getConstantOperandVal(1);
+ if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ break;
+
+ unsigned NF = 0;
+ switch (Tuple.getConstantOperandVal(1)) {
+ default:
+ break;
+ case Intrinsic::riscv_vlseg2_mask:
+ case Intrinsic::riscv_vlseg3_mask:
+ case Intrinsic::riscv_vlseg4_mask:
+ case Intrinsic::riscv_vlseg5_mask:
+ case Intrinsic::riscv_vlseg6_mask:
+ case Intrinsic::riscv_vlseg7_mask:
+ case Intrinsic::riscv_vlseg8_mask:
+ NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
+ break;
+ }
+
+ if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF))
+ break;
+
+ unsigned SEW = VT.getScalarSizeInBits();
+ assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) &&
+ "Type mismatch without bitcast?");
+ unsigned Stride = SEW / 8 * NF;
+ unsigned Offset = SEW / 8 * Idx;
+
+ SDValue Ops[] = {
+ /*Chain=*/Tuple.getOperand(0),
+ /*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT),
+ /*Passthru=*/Tuple.getOperand(2),
+ /*Ptr=*/
+ DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3),
+ DAG.getConstant(Offset, DL, XLenVT)),
+ /*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
+ /*Mask=*/Tuple.getOperand(4),
+ /*VL=*/Tuple.getOperand(5),
+ /*Policy=*/Tuple.getOperand(6)};
+
+ auto TupleMemSD = cast<MemIntrinsicSDNode>(Tuple);
+ // Match getTgtMemIntrinsic for non-unit stride case
+ EVT MemVT = TupleMemSD->getMemoryVT().getScalarType();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
+
+ SDVTList VTs = DAG.getVTList({VT, MVT::Other});
+ SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
+ Ops, MemVT, MMO);
+ DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
+ return Result.getValue(0);
+ }
}
return SDValue();
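To make the stride/offset arithmetic in the new combine concrete: in the
load_factor5_oneactive test updated below, the element type is i32, so SEW = 32
and NF = 5, and the extracted field index is 3. That gives
Stride = 32/8 * 5 = 20 bytes and Offset = 32/8 * 3 = 12 bytes, which matches the
addi a0, a0, 12 plus vlse32.v with a stride of 20 in the expected output. The
memory operand is rebuilt at that byte offset with an unknown size, which (per
the comment in the patch) matches what getTgtMemIntrinsic produces for other
non-unit-stride accesses.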
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index 3cbe668b08244..17e2f01687bcc 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
return false;
- // If the segment load is going to be performed segment at a time anyways
- // and there's only one element used, use a strided load instead. This
- // will be equally fast, and create less vector register pressure.
- if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
- unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
- Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
- Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
- Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
- // For rv64, need to truncate i64 to i32 to match signature. As VL is at most
- // the number of active lanes (which is bounded by i32) this is safe.
- VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
-
- CallInst *CI =
- Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
- {VTy, BasePtr->getType(), Stride->getType()},
- {BasePtr, Stride, Mask, VL});
- Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
- CI->addParamAttr(0,
- Attribute::getWithAlignment(CI->getContext(), Alignment));
- Shuffles[0]->replaceAllUsesWith(CI);
- return true;
- };
-
CallInst *VlsegN = Builder.CreateIntrinsic(
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
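For context, the block removed above handled this case only on the fixed-vector
(shuffle-based) path, by emitting a masked VP strided load directly at IR
level. A minimal sketch of the IR it produced, assuming a factor-4 deinterleave
of i32 elements where only index 1 is used (values and names are illustrative,
not taken from the patch):

  %base = getelementptr i8, ptr %ptr, i64 4    ; Indices[0] * element size = 1 * 4
  %vl32 = trunc i64 %vl to i32                 ; VL is bounded by i32, so the trunc is safe
  %v = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr align 4 %base, i64 16, <8 x i1> %mask, i32 %vl32)

The DAG combine added in RISCVISelLowering.cpp now performs the equivalent
rewrite at the SelectionDAG level, so it covers the scalable and VP intrinsic
paths as well, not just this shuffle-based one.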
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr141907.ll b/llvm/test/CodeGen/RISCV/rvv/pr141907.ll
index 648b47dc440c3..f93f88a5bc06c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pr141907.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pr141907.ll
@@ -9,27 +9,29 @@ define void @pr141907(ptr %0) nounwind {
; CHECK-NEXT: slli a1, a1, 2
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.i v9, 0
+; CHECK-NEXT: vmv.v.i v8, 0
; CHECK-NEXT: vmclr.m v0
; CHECK-NEXT: li a1, 0
-; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: addi a3, sp, 20
+; CHECK-NEXT: li a4, 12
; CHECK-NEXT: .LBB0_1: # %vector.body
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vs4r.v v8, (a2)
; CHECK-NEXT: vsetvli a1, a1, e8, mf8, ta, ma
; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
-; CHECK-NEXT: vnsrl.wi v11, v9, 0, v0.t
-; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; CHECK-NEXT: vlseg3e32.v v8, (a2)
+; CHECK-NEXT: vnsrl.wi v9, v8, 0, v0.t
+; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlse32.v v8, (a3), a4
; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
-; CHECK-NEXT: vsseg2e16.v v11, (zero)
+; CHECK-NEXT: vsseg2e16.v v9, (zero)
; CHECK-NEXT: bnez a1, .LBB0_1
; CHECK-NEXT: .LBB0_2: # %while.body5
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v9, (a0)
+; CHECK-NEXT: vse16.v v8, (a0)
; CHECK-NEXT: j .LBB0_2
entry:
br label %vector.body
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index fba592d87bcd8..c4284bf0b9f21 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -407,8 +407,9 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), a1
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
@@ -419,8 +420,10 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
-; CHECK-NEXT: vlseg4e8.v v5, (a0)
+; CHECK-NEXT: addi a0, a0, 3
+; CHECK-NEXT: li a1, 4
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlse8.v v8, (a0), a1
; CHECK-NEXT: ret
%vec = load <vscale x 32 x i8>, ptr %p
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index 5b1746d38d03f..ac9f26314a9ab 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -3712,8 +3712,9 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive(<vsca
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v8, (a0)
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlse32.v v8, (a0), a1
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
@@ -3732,9 +3733,11 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive2(<vsc
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: addi a1, sp, 36
; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg8e32.v v3, (a0)
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlse32.v v8, (a1), a0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
index 23c0c826e85e3..2afb72fc71b39 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll
@@ -674,16 +674,20 @@ define <vscale x 2 x i32> @load_factor2_oneactive(ptr %ptr, i32 %evl) {
define <vscale x 2 x i32> @load_factor5_oneactive(ptr %ptr, i32 %evl) {
; RV32-LABEL: load_factor5_oneactive:
; RV32: # %bb.0:
+; RV32-NEXT: addi a0, a0, 12
+; RV32-NEXT: li a2, 20
; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; RV32-NEXT: vlseg5e32.v v5, (a0)
+; RV32-NEXT: vlse32.v v8, (a0), a2
; RV32-NEXT: ret
;
; RV64-LABEL: load_factor5_oneactive:
; RV64: # %bb.0:
; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: addi a0, a0, 12
; RV64-NEXT: srli a1, a1, 32
+; RV64-NEXT: li a2, 20
; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
-; RV64-NEXT: vlseg5e32.v v5, (a0)
+; RV64-NEXT: vlse32.v v8, (a0), a2
; RV64-NEXT: ret
%rvl = mul nuw i32 %evl, 5
%wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl)