[llvm] [DAGCombiner] Spill dynamic insertelt chain in one go (PR #162368)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 7 14:00:04 PDT 2025
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/162368
From 36150fd8a35b0e7ca37e56c18f327e7174c20f11 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 3 Oct 2025 20:43:38 -0700
Subject: [PATCH] [DAGCombiner] Spill dynamic insertelt chain in one go
A chain of dynamic insertelts (that is, insertelt (insertelt (...)) with
dynamic indices) can be spilled to the stack in one go. This avoids each
insertelt being spilled individually by DAGTypeLegalizer, which reduces
both code size and compile time.
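For illustration, a sketch of the targeted pattern (hypothetical IR; any
chain of insertelements with non-constant indices qualifies):

  %v0 = insertelement <8 x double> %vec, double %a, i32 %i
  %v1 = insertelement <8 x double> %v0, double %b, i32 %j

Rather than spilling and reloading the vector once per insertelement
during type legalization, the combine emits a single spill of %vec, a
per-element store for each dynamic index, and a single reload.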
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 54 +++++
llvm/test/CodeGen/NVPTX/vector-spill.ll | 207 ++++++++++++++++++
2 files changed, 261 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/vector-spill.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 309f1bea8b77c..be8daa51f1701 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23445,6 +23445,60 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
return DAG.getSplat(VT, DL, InVal);
+
+  // For a dynamic insertelt, the type legalizer spills the entire vector.
+  // For a chain of dynamic insertelts this happens once per link, which is
+  // wasteful in both code size and compile time. If each insertelt is only
+  // fed into the next, the vector is write-only across the chain, so the
+  // whole chain can be spilled in one go.
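+  // For example, a chain like
+  //   t1 = insert_vector_elt t0, x, i
+  //   t2 = insert_vector_elt t1, y, j   (i, j not constant)
+  // becomes one stack spill of t0, a store per element, and one reload,
+  // rather than a spill/reload pair per insert.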
+  SmallVector<SDNode *> Seq{N};
+  while (true) {
+    SDValue Vec = Seq.back()->getOperand(0);
+    // Check the opcode first: only an INSERT_VECTOR_ELT is guaranteed to
+    // have an index operand at position 2.
+    if (Vec.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+        isa<ConstantSDNode>(Vec.getOperand(2)))
+      break;
+    Seq.push_back(Vec.getNode());
+  }
+
+  // We only care about chains of two or more insertelts; a single dynamic
+  // insertelt is handled just fine by the type legalizer.
+ if (Seq.size() > 1) {
+    // If the vector type is illegal, it will be broken down and stored in
+    // parts, so use the alignment of the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+    // Begin by spilling the chain's input vector.
+    SDValue BaseVec = Seq.back()->getOperand(0);
+    SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, BaseVec, StackPtr,
+                                 PtrInfo, SmallestAlign);
+
+    // Lower each dynamic insertelt in the chain to a store of its element.
+    for (SDNode *Insert : reverse(Seq)) {
+      SDValue Elt = Insert->getOperand(1);
+      SDValue Index = Insert->getOperand(2);
+
+      // Store the new element. Its type may be larger than the vector
+      // element type, so use a truncating store.
+      SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VT, Index);
+      EVT EltVT = Elt.getValueType();
+      Store = DAG.getTruncStore(
+          Store, DL, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF),
+          EltVT,
+          commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8));
+    }
+
+    // Reload the spilled vector.
+    return DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/NVPTX/vector-spill.ll b/llvm/test/CodeGen/NVPTX/vector-spill.ll
new file mode 100644
index 0000000000000..a0b00d340ec03
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/vector-spill.ll
@@ -0,0 +1,207 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; COM: Spill the vector once.
+define ptx_kernel void @spill_once(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
+; CHECK-LABEL: spill_once(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot0[64];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b64 %rd<39>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b64 %rd1, [spill_once_param_0];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_once_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_once_param_1+16];
+; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_once_param_1+32];
+; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_once_param_1+48];
+; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
+; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
+; CHECK-NEXT: ld.param.b32 %rd14, [spill_once_param_2];
+; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
+; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
+; CHECK-NEXT: add.u64 %rd17, %SP, 0;
+; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
+; CHECK-NEXT: ld.param.b32 %rd19, [spill_once_param_3];
+; CHECK-NEXT: and.b64 %rd20, %rd19, 7;
+; CHECK-NEXT: shl.b64 %rd21, %rd20, 3;
+; CHECK-NEXT: add.s64 %rd22, %rd17, %rd21;
+; CHECK-NEXT: ld.param.b32 %rd23, [spill_once_param_4];
+; CHECK-NEXT: and.b64 %rd24, %rd23, 7;
+; CHECK-NEXT: shl.b64 %rd25, %rd24, 3;
+; CHECK-NEXT: add.s64 %rd26, %rd17, %rd25;
+; CHECK-NEXT: st.b64 [%SP+56], %rd11;
+; CHECK-NEXT: st.b64 [%SP+48], %rd10;
+; CHECK-NEXT: st.b64 [%SP+40], %rd9;
+; CHECK-NEXT: st.b64 [%SP+32], %rd8;
+; CHECK-NEXT: st.b64 [%SP+24], %rd7;
+; CHECK-NEXT: st.b64 [%SP+16], %rd6;
+; CHECK-NEXT: st.b64 [%SP+8], %rd5;
+; CHECK-NEXT: st.b64 [%SP], %rd4;
+; CHECK-NEXT: st.b64 [%rd18], %rd2;
+; CHECK-NEXT: st.b64 [%rd22], %rd3;
+; CHECK-NEXT: st.b64 [%rd26], %rd12;
+; CHECK-NEXT: ld.param.b32 %rd27, [spill_once_param_5];
+; CHECK-NEXT: and.b64 %rd28, %rd27, 7;
+; CHECK-NEXT: shl.b64 %rd29, %rd28, 3;
+; CHECK-NEXT: add.s64 %rd30, %rd17, %rd29;
+; CHECK-NEXT: st.b64 [%rd30], %rd13;
+; CHECK-NEXT: ld.b64 %rd31, [%SP+8];
+; CHECK-NEXT: ld.b64 %rd32, [%SP];
+; CHECK-NEXT: ld.b64 %rd33, [%SP+24];
+; CHECK-NEXT: ld.b64 %rd34, [%SP+16];
+; CHECK-NEXT: ld.b64 %rd35, [%SP+40];
+; CHECK-NEXT: ld.b64 %rd36, [%SP+32];
+; CHECK-NEXT: ld.b64 %rd37, [%SP+56];
+; CHECK-NEXT: ld.b64 %rd38, [%SP+48];
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd38, %rd37};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd36, %rd35};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd34, %rd33};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd32, %rd31};
+; CHECK-NEXT: ret;
+entry:
+ %offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
+ %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+ %offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
+ %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+ %offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
+ %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+ %offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
+ %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+ %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
+ %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
+ %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+ %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
+ %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+ store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
+ ret void
+}
+
+; COM: Spill the vector twice. Because each chain is spilled to its own
+; stack slot, the spill code for the shared prefix is emitted twice, even
+; though the two insertelt sequences overlap.
+define ptx_kernel void @spill_twice(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
+; CHECK-LABEL: spill_twice(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot1[128];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b64 %rd<51>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b64 %rd1, [spill_twice_param_0];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_twice_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_twice_param_1+16];
+; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_twice_param_1+32];
+; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_twice_param_1+48];
+; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
+; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
+; CHECK-NEXT: ld.param.b32 %rd14, [spill_twice_param_2];
+; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
+; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
+; CHECK-NEXT: add.u64 %rd17, %SP, 0;
+; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
+; CHECK-NEXT: add.u64 %rd19, %SP, 64;
+; CHECK-NEXT: add.s64 %rd20, %rd19, %rd16;
+; CHECK-NEXT: ld.param.b32 %rd21, [spill_twice_param_3];
+; CHECK-NEXT: and.b64 %rd22, %rd21, 7;
+; CHECK-NEXT: shl.b64 %rd23, %rd22, 3;
+; CHECK-NEXT: add.s64 %rd24, %rd17, %rd23;
+; CHECK-NEXT: add.s64 %rd25, %rd19, %rd23;
+; CHECK-NEXT: st.b64 [%SP+120], %rd11;
+; CHECK-NEXT: st.b64 [%SP+112], %rd10;
+; CHECK-NEXT: st.b64 [%SP+104], %rd9;
+; CHECK-NEXT: st.b64 [%SP+96], %rd8;
+; CHECK-NEXT: st.b64 [%SP+88], %rd7;
+; CHECK-NEXT: st.b64 [%SP+80], %rd6;
+; CHECK-NEXT: st.b64 [%SP+72], %rd5;
+; CHECK-NEXT: st.b64 [%SP+64], %rd4;
+; CHECK-NEXT: st.b64 [%rd20], %rd2;
+; CHECK-NEXT: st.b64 [%rd25], %rd3;
+; CHECK-NEXT: ld.param.b32 %rd26, [spill_twice_param_4];
+; CHECK-NEXT: and.b64 %rd27, %rd26, 7;
+; CHECK-NEXT: shl.b64 %rd28, %rd27, 3;
+; CHECK-NEXT: add.s64 %rd29, %rd19, %rd28;
+; CHECK-NEXT: st.b64 [%rd29], %rd12;
+; CHECK-NEXT: add.s64 %rd30, %rd17, %rd28;
+; CHECK-NEXT: ld.b64 %rd31, [%SP+72];
+; CHECK-NEXT: ld.b64 %rd32, [%SP+64];
+; CHECK-NEXT: ld.b64 %rd33, [%SP+88];
+; CHECK-NEXT: ld.b64 %rd34, [%SP+80];
+; CHECK-NEXT: ld.b64 %rd35, [%SP+104];
+; CHECK-NEXT: ld.b64 %rd36, [%SP+96];
+; CHECK-NEXT: ld.b64 %rd37, [%SP+120];
+; CHECK-NEXT: ld.b64 %rd38, [%SP+112];
+; CHECK-NEXT: st.b64 [%SP+56], %rd11;
+; CHECK-NEXT: st.b64 [%SP+48], %rd10;
+; CHECK-NEXT: st.b64 [%SP+40], %rd9;
+; CHECK-NEXT: st.b64 [%SP+32], %rd8;
+; CHECK-NEXT: st.b64 [%SP+24], %rd7;
+; CHECK-NEXT: st.b64 [%SP+16], %rd6;
+; CHECK-NEXT: st.b64 [%SP+8], %rd5;
+; CHECK-NEXT: st.b64 [%SP], %rd4;
+; CHECK-NEXT: st.b64 [%rd18], %rd2;
+; CHECK-NEXT: st.b64 [%rd24], %rd3;
+; CHECK-NEXT: st.b64 [%rd30], %rd12;
+; CHECK-NEXT: ld.param.b32 %rd39, [spill_twice_param_5];
+; CHECK-NEXT: and.b64 %rd40, %rd39, 7;
+; CHECK-NEXT: shl.b64 %rd41, %rd40, 3;
+; CHECK-NEXT: add.s64 %rd42, %rd17, %rd41;
+; CHECK-NEXT: st.b64 [%rd42], %rd13;
+; CHECK-NEXT: ld.b64 %rd43, [%SP+8];
+; CHECK-NEXT: ld.b64 %rd44, [%SP];
+; CHECK-NEXT: ld.b64 %rd45, [%SP+24];
+; CHECK-NEXT: ld.b64 %rd46, [%SP+16];
+; CHECK-NEXT: ld.b64 %rd47, [%SP+40];
+; CHECK-NEXT: ld.b64 %rd48, [%SP+32];
+; CHECK-NEXT: ld.b64 %rd49, [%SP+56];
+; CHECK-NEXT: ld.b64 %rd50, [%SP+48];
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd50, %rd49};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd48, %rd47};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd46, %rd45};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd44, %rd43};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1144], {%rd38, %rd37};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1128], {%rd36, %rd35};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1112], {%rd34, %rd33};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1096], {%rd32, %rd31};
+; CHECK-NEXT: ret;
+entry:
+ %offset.0 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 0
+ %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+ %offset.1 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 8
+ %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+ %offset.2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 16
+ %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+ %offset.3 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 24
+ %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+
+; COM: begin chain 1
+ %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
+ %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
+
+; COM: interleave a second chain of insertelements
+ %vector.build1-2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+
+; COM: continue chain 1
+ %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+ %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
+
+; COM: save chain 1
+ %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+ store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
+
+; COM: save chain 2
+ %location-2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1096
+ store <8 x double> %vector.build1-2, ptr addrspace(3) %location-2, align 64
+ ret void
+}