[llvm] [DAGCombiner] Spill dynamic insertelt chain in one go (PR #162368)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 8 15:24:01 PDT 2025
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/162368
>From 74eb28a49d158f60814261121876fe67fd5ca33d Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Fri, 3 Oct 2025 20:43:38 -0700
Subject: [PATCH] [DAGCombiner] Spill dynamic insertelt chain in one go
A chain of dynamic insertelts can be spilled in one go. This avoids each
insertelt being spilled separately in DAGTypeLegalizer, which reduces code
size and compile time.
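For example, in an (illustrative) chain like the following, where every index
is dynamic, the whole chain is now lowered to one stack spill, one element
store per insertelt, and one reload, instead of a full spill per insertelt:

  %v0 = insertelement <8 x double> %v,  double %a, i32 %i0
  %v1 = insertelement <8 x double> %v0, double %b, i32 %i1
  %v2 = insertelement <8 x double> %v1, double %c, i32 %i2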
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 62 +++
llvm/test/CodeGen/NVPTX/vector-spill.ll | 358 ++++++++++++++++++
2 files changed, 420 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/vector-spill.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 309f1bea8b77c..0d426e70cf87f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23445,6 +23445,68 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT))
return DAG.getSplat(VT, DL, InVal);
+
+ if (TLI.getTypeAction(*DAG.getContext(), VT) ==
+ TargetLowering::TypeSplitVector) {
+ // For a dynamic insertelt, the type legalizer may spill the entire
+ // vector. For a chain of dynamic insertelts this can be very inefficient
+ // and hurt compile time. If each insertelt feeds only the next one, the
+ // vector is write-only across the chain, so we can spill it just once.
+ SmallVector<SDNode *> Seq{N};
+ unsigned NumDynamic = 1;
+ while (true) {
+ SDValue InVec = Seq.back()->getOperand(0);
+ if (InVec.getOpcode() != ISD::INSERT_VECTOR_ELT)
+ break;
+ Seq.push_back(InVec.getNode());
+ NumDynamic += !isa<ConstantSDNode>(InVec.getOperand(2));
+ }
+
+ // We will lower every insertelt in the sequence to a store. In the
+ // default handling, only dynamic insertelts in the sequence will be
+ // lowered to a store (+ vector spilling for each). Check that our
+ // approach reduces the total number of loads and stores over the default.
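+ // For example, with <8 x double> and a chain of 4 insertelts that all use
+ // dynamic indices, spilling once costs 2*8 + 4 = 20 loads/stores versus
+ // 4 * 2*8 = 64 for the default handling, so we combine. With only one
+ // dynamic index the default (2*8 = 16) is cheaper, and we bail out.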
+ if (2 * VT.getVectorMinNumElements() + Seq.size() <
+ NumDynamic * 2 * VT.getVectorMinNumElements()) {
+ // In cases where the vector is illegal it will be broken down into
+ // parts and stored in parts - we should use the alignment for the
+ // smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VT.getStoreSize(), SmallestAlign);
+ auto &MF = DAG.getMachineFunction();
+ int FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ // Begin spilling
+ SDValue InVec = Seq.back()->getOperand(0);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr,
+ PtrInfo, SmallestAlign);
+
+ // Lower each insertelt in the sequence to a store
+ for (SDNode *N : reverse(Seq)) {
+ SDValue Elmnt = N->getOperand(1);
+ SDValue Index = N->getOperand(2);
+
+ // Store the new element. This may be larger than the vector element
+ // type, so use a truncating store.
+ SDValue EltPtr =
+ TLI.getVectorElementPointer(DAG, StackPtr, VT, Index);
+ EVT EltVT = Elmnt.getValueType();
+ Store = DAG.getTruncStore(
+ Store, DL, Elmnt, EltPtr, MachinePointerInfo::getUnknownStack(MF),
+ EltVT,
+ commonAlignment(SmallestAlign, EltVT.getFixedSizeInBits() / 8));
+ }
+
+ // Load the spilled vector
+ SDValue Load =
+ DAG.getLoad(VT, DL, Store, StackPtr, PtrInfo, SmallestAlign);
+ return Load.getValue(0);
+ }
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/NVPTX/vector-spill.ll b/llvm/test/CodeGen/NVPTX/vector-spill.ll
new file mode 100644
index 0000000000000..63dc668e49a99
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/vector-spill.ll
@@ -0,0 +1,358 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O3 < %s -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mcpu=sm_20 | %ptxas-verify %}
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; COM: Spill the vector once.
+define ptx_kernel void @spill_once(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
+; CHECK-LABEL: spill_once(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot0[64];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b64 %rd<39>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b64 %rd1, [spill_once_param_0];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_once_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_once_param_1+16];
+; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_once_param_1+32];
+; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_once_param_1+48];
+; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
+; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
+; CHECK-NEXT: ld.param.b32 %rd14, [spill_once_param_2];
+; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
+; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
+; CHECK-NEXT: add.u64 %rd17, %SP, 0;
+; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
+; CHECK-NEXT: ld.param.b32 %rd19, [spill_once_param_3];
+; CHECK-NEXT: and.b64 %rd20, %rd19, 7;
+; CHECK-NEXT: shl.b64 %rd21, %rd20, 3;
+; CHECK-NEXT: add.s64 %rd22, %rd17, %rd21;
+; CHECK-NEXT: ld.param.b32 %rd23, [spill_once_param_4];
+; CHECK-NEXT: and.b64 %rd24, %rd23, 7;
+; CHECK-NEXT: shl.b64 %rd25, %rd24, 3;
+; CHECK-NEXT: add.s64 %rd26, %rd17, %rd25;
+; CHECK-NEXT: st.b64 [%SP+56], %rd11;
+; CHECK-NEXT: st.b64 [%SP+48], %rd10;
+; CHECK-NEXT: st.b64 [%SP+40], %rd9;
+; CHECK-NEXT: st.b64 [%SP+32], %rd8;
+; CHECK-NEXT: st.b64 [%SP+24], %rd7;
+; CHECK-NEXT: st.b64 [%SP+16], %rd6;
+; CHECK-NEXT: st.b64 [%SP+8], %rd5;
+; CHECK-NEXT: st.b64 [%SP], %rd4;
+; CHECK-NEXT: st.b64 [%rd18], %rd2;
+; CHECK-NEXT: st.b64 [%rd22], %rd3;
+; CHECK-NEXT: st.b64 [%rd26], %rd12;
+; CHECK-NEXT: ld.param.b32 %rd27, [spill_once_param_5];
+; CHECK-NEXT: and.b64 %rd28, %rd27, 7;
+; CHECK-NEXT: shl.b64 %rd29, %rd28, 3;
+; CHECK-NEXT: add.s64 %rd30, %rd17, %rd29;
+; CHECK-NEXT: st.b64 [%rd30], %rd13;
+; CHECK-NEXT: ld.b64 %rd31, [%SP+8];
+; CHECK-NEXT: ld.b64 %rd32, [%SP];
+; CHECK-NEXT: ld.b64 %rd33, [%SP+24];
+; CHECK-NEXT: ld.b64 %rd34, [%SP+16];
+; CHECK-NEXT: ld.b64 %rd35, [%SP+40];
+; CHECK-NEXT: ld.b64 %rd36, [%SP+32];
+; CHECK-NEXT: ld.b64 %rd37, [%SP+56];
+; CHECK-NEXT: ld.b64 %rd38, [%SP+48];
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd38, %rd37};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd36, %rd35};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd34, %rd33};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd32, %rd31};
+; CHECK-NEXT: ret;
+entry:
+ %offset.0 = getelementptr double, ptr addrspace(3) %shared.mem, i32 0
+ %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+ %offset.1 = getelementptr double, ptr addrspace(3) %shared.mem, i32 1
+ %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+ %offset.2 = getelementptr double, ptr addrspace(3) %shared.mem, i32 2
+ %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+ %offset.3 = getelementptr double, ptr addrspace(3) %shared.mem, i32 3
+ %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+ %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
+ %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
+ %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+ %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
+ %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+ store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
+ ret void
+}
+
+; COM: Spill the vector twice. Because the two chains spill to different
+; stack slots, the resulting spill code may not overlap even though the
+; insertelt sequences do.
+define ptx_kernel void @spill_twice(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx1, i32 %idx2, i32 %idx3) local_unnamed_addr {
+; CHECK-LABEL: spill_twice(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot1[128];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b64 %rd<51>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b64 %rd1, [spill_twice_param_0];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [spill_twice_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [spill_twice_param_1+16];
+; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [spill_twice_param_1+32];
+; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [spill_twice_param_1+48];
+; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
+; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
+; CHECK-NEXT: ld.param.b32 %rd14, [spill_twice_param_2];
+; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
+; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
+; CHECK-NEXT: add.u64 %rd17, %SP, 0;
+; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
+; CHECK-NEXT: add.u64 %rd19, %SP, 64;
+; CHECK-NEXT: add.s64 %rd20, %rd19, %rd16;
+; CHECK-NEXT: ld.param.b32 %rd21, [spill_twice_param_3];
+; CHECK-NEXT: and.b64 %rd22, %rd21, 7;
+; CHECK-NEXT: shl.b64 %rd23, %rd22, 3;
+; CHECK-NEXT: add.s64 %rd24, %rd17, %rd23;
+; CHECK-NEXT: add.s64 %rd25, %rd19, %rd23;
+; CHECK-NEXT: st.b64 [%SP+120], %rd11;
+; CHECK-NEXT: st.b64 [%SP+112], %rd10;
+; CHECK-NEXT: st.b64 [%SP+104], %rd9;
+; CHECK-NEXT: st.b64 [%SP+96], %rd8;
+; CHECK-NEXT: st.b64 [%SP+88], %rd7;
+; CHECK-NEXT: st.b64 [%SP+80], %rd6;
+; CHECK-NEXT: st.b64 [%SP+72], %rd5;
+; CHECK-NEXT: st.b64 [%SP+64], %rd4;
+; CHECK-NEXT: st.b64 [%rd20], %rd2;
+; CHECK-NEXT: st.b64 [%rd25], %rd3;
+; CHECK-NEXT: ld.param.b32 %rd26, [spill_twice_param_4];
+; CHECK-NEXT: and.b64 %rd27, %rd26, 7;
+; CHECK-NEXT: shl.b64 %rd28, %rd27, 3;
+; CHECK-NEXT: add.s64 %rd29, %rd19, %rd28;
+; CHECK-NEXT: st.b64 [%rd29], %rd12;
+; CHECK-NEXT: add.s64 %rd30, %rd17, %rd28;
+; CHECK-NEXT: ld.b64 %rd31, [%SP+72];
+; CHECK-NEXT: ld.b64 %rd32, [%SP+64];
+; CHECK-NEXT: ld.b64 %rd33, [%SP+88];
+; CHECK-NEXT: ld.b64 %rd34, [%SP+80];
+; CHECK-NEXT: ld.b64 %rd35, [%SP+104];
+; CHECK-NEXT: ld.b64 %rd36, [%SP+96];
+; CHECK-NEXT: ld.b64 %rd37, [%SP+120];
+; CHECK-NEXT: ld.b64 %rd38, [%SP+112];
+; CHECK-NEXT: st.b64 [%SP+56], %rd11;
+; CHECK-NEXT: st.b64 [%SP+48], %rd10;
+; CHECK-NEXT: st.b64 [%SP+40], %rd9;
+; CHECK-NEXT: st.b64 [%SP+32], %rd8;
+; CHECK-NEXT: st.b64 [%SP+24], %rd7;
+; CHECK-NEXT: st.b64 [%SP+16], %rd6;
+; CHECK-NEXT: st.b64 [%SP+8], %rd5;
+; CHECK-NEXT: st.b64 [%SP], %rd4;
+; CHECK-NEXT: st.b64 [%rd18], %rd2;
+; CHECK-NEXT: st.b64 [%rd24], %rd3;
+; CHECK-NEXT: st.b64 [%rd30], %rd12;
+; CHECK-NEXT: ld.param.b32 %rd39, [spill_twice_param_5];
+; CHECK-NEXT: and.b64 %rd40, %rd39, 7;
+; CHECK-NEXT: shl.b64 %rd41, %rd40, 3;
+; CHECK-NEXT: add.s64 %rd42, %rd17, %rd41;
+; CHECK-NEXT: st.b64 [%rd42], %rd13;
+; CHECK-NEXT: ld.b64 %rd43, [%SP+8];
+; CHECK-NEXT: ld.b64 %rd44, [%SP];
+; CHECK-NEXT: ld.b64 %rd45, [%SP+24];
+; CHECK-NEXT: ld.b64 %rd46, [%SP+16];
+; CHECK-NEXT: ld.b64 %rd47, [%SP+40];
+; CHECK-NEXT: ld.b64 %rd48, [%SP+32];
+; CHECK-NEXT: ld.b64 %rd49, [%SP+56];
+; CHECK-NEXT: ld.b64 %rd50, [%SP+48];
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd50, %rd49};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd48, %rd47};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd46, %rd45};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd44, %rd43};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1144], {%rd38, %rd37};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1128], {%rd36, %rd35};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1112], {%rd34, %rd33};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1096], {%rd32, %rd31};
+; CHECK-NEXT: ret;
+entry:
+ %offset.0 = getelementptr double, ptr addrspace(3) %shared.mem, i32 0
+ %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+ %offset.1 = getelementptr double, ptr addrspace(3) %shared.mem, i32 1
+ %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+ %offset.2 = getelementptr double, ptr addrspace(3) %shared.mem, i32 2
+ %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+ %offset.3 = getelementptr double, ptr addrspace(3) %shared.mem, i32 3
+ %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+
+; COM: begin chain 1
+ %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
+ %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 %idx1
+
+; COM: interleave a second chain of insertelements
+ %vector.build1-2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+
+; COM: continue chain 1
+ %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 %idx2
+ %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx3
+
+; COM: save chain 1
+ %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+ store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
+
+; COM: save chain 2
+ %location-2 = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1096
+ store <8 x double> %vector.build1-2, ptr addrspace(3) %location-2, align 64
+ ret void
+}
+
+; COM: For a chain mixing dynamic and constant insertelts, we can spill early
+; to avoid spilling individual dynamic insertelts later.
+define ptx_kernel void @mix_spill_early(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0, i32 %idx2) local_unnamed_addr {
+; CHECK-LABEL: mix_spill_early(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot2[64];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b64 %rd<31>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.b64 %SPL, __local_depot2;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b64 %rd1, [mix_spill_early_param_0];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [mix_spill_early_param_1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [mix_spill_early_param_1+16];
+; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [mix_spill_early_param_1+32];
+; CHECK-NEXT: ld.param.v2.b64 {%rd10, %rd11}, [mix_spill_early_param_1+48];
+; CHECK-NEXT: ld.shared.b64 %rd12, [%rd1+16];
+; CHECK-NEXT: ld.shared.b64 %rd13, [%rd1+24];
+; CHECK-NEXT: ld.param.b32 %rd14, [mix_spill_early_param_2];
+; CHECK-NEXT: and.b64 %rd15, %rd14, 7;
+; CHECK-NEXT: shl.b64 %rd16, %rd15, 3;
+; CHECK-NEXT: add.u64 %rd17, %SP, 0;
+; CHECK-NEXT: add.s64 %rd18, %rd17, %rd16;
+; CHECK-NEXT: st.b64 [%SP+56], %rd11;
+; CHECK-NEXT: st.b64 [%SP+48], %rd10;
+; CHECK-NEXT: st.b64 [%SP+40], %rd9;
+; CHECK-NEXT: st.b64 [%SP+32], %rd8;
+; CHECK-NEXT: st.b64 [%SP+24], %rd7;
+; CHECK-NEXT: st.b64 [%SP+16], %rd6;
+; CHECK-NEXT: st.b64 [%SP+8], %rd5;
+; CHECK-NEXT: st.b64 [%SP], %rd4;
+; CHECK-NEXT: st.b64 [%rd18], %rd2;
+; CHECK-NEXT: st.b64 [%SP+16], %rd12;
+; CHECK-NEXT: st.b64 [%SP+8], %rd3;
+; CHECK-NEXT: ld.param.b32 %rd19, [mix_spill_early_param_3];
+; CHECK-NEXT: and.b64 %rd20, %rd19, 7;
+; CHECK-NEXT: shl.b64 %rd21, %rd20, 3;
+; CHECK-NEXT: add.s64 %rd22, %rd17, %rd21;
+; CHECK-NEXT: st.b64 [%rd22], %rd13;
+; CHECK-NEXT: ld.b64 %rd23, [%SP+8];
+; CHECK-NEXT: ld.b64 %rd24, [%SP];
+; CHECK-NEXT: ld.b64 %rd25, [%SP+24];
+; CHECK-NEXT: ld.b64 %rd26, [%SP+16];
+; CHECK-NEXT: ld.b64 %rd27, [%SP+40];
+; CHECK-NEXT: ld.b64 %rd28, [%SP+32];
+; CHECK-NEXT: ld.b64 %rd29, [%SP+56];
+; CHECK-NEXT: ld.b64 %rd30, [%SP+48];
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd30, %rd29};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd28, %rd27};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd26, %rd25};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd24, %rd23};
+; CHECK-NEXT: ret;
+entry:
+ %offset.0 = getelementptr double, ptr addrspace(3) %shared.mem, i32 0
+ %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+ %offset.1 = getelementptr double, ptr addrspace(3) %shared.mem, i32 1
+ %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+ %offset.2 = getelementptr double, ptr addrspace(3) %shared.mem, i32 2
+ %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+ %offset.3 = getelementptr double, ptr addrspace(3) %shared.mem, i32 3
+ %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+ %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 %idx0
+ %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 1
+ %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 2
+ %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx2
+ %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+ store <8 x double> %vector.build3, ptr addrspace(3) %location, align 64
+ ret void
+}
+
+; COM: Let later phases spill the single dynamic insertelt without spilling
+; the other constant insertelts, reducing the number of stores and keeping
+; more elements in registers.
+define ptx_kernel void @mix_spill_late(ptr addrspace(3) %shared.mem, <8 x double> %vector, i32 %idx0) local_unnamed_addr {
+; CHECK-LABEL: mix_spill_late(
+; CHECK: {
+; CHECK-NEXT: .local .align 8 .b8 __local_depot3[64];
+; CHECK-NEXT: .reg .b64 %SP;
+; CHECK-NEXT: .reg .b64 %SPL;
+; CHECK-NEXT: .reg .b64 %rd<25>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0: // %entry
+; CHECK-NEXT: mov.b64 %SPL, __local_depot3;
+; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT: ld.param.b64 %rd1, [mix_spill_late_param_0];
+; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: ld.param.v2.b64 {%rd4, %rd5}, [mix_spill_late_param_1+16];
+; CHECK-NEXT: ld.param.v2.b64 {%rd6, %rd7}, [mix_spill_late_param_1+32];
+; CHECK-NEXT: ld.param.v2.b64 {%rd8, %rd9}, [mix_spill_late_param_1+48];
+; CHECK-NEXT: ld.param.b32 %rd10, [mix_spill_late_param_2];
+; CHECK-NEXT: and.b64 %rd11, %rd10, 7;
+; CHECK-NEXT: shl.b64 %rd12, %rd11, 3;
+; CHECK-NEXT: add.u64 %rd13, %SP, 0;
+; CHECK-NEXT: add.s64 %rd14, %rd13, %rd12;
+; CHECK-NEXT: ld.shared.b64 %rd15, [%rd1+16];
+; CHECK-NEXT: ld.shared.b64 %rd16, [%rd1+24];
+; CHECK-NEXT: ld.shared.b64 %rd17, [%rd1+32];
+; CHECK-NEXT: ld.shared.b64 %rd18, [%rd1+40];
+; CHECK-NEXT: ld.shared.b64 %rd19, [%rd1+48];
+; CHECK-NEXT: ld.shared.b64 %rd20, [%rd1+56];
+; CHECK-NEXT: st.b64 [%SP+56], %rd9;
+; CHECK-NEXT: st.b64 [%SP+48], %rd8;
+; CHECK-NEXT: st.b64 [%SP+40], %rd7;
+; CHECK-NEXT: st.b64 [%SP+32], %rd6;
+; CHECK-NEXT: st.b64 [%SP+24], %rd5;
+; CHECK-NEXT: st.b64 [%SP+16], %rd15;
+; CHECK-NEXT: st.b64 [%SP+8], %rd3;
+; CHECK-NEXT: st.b64 [%SP], %rd2;
+; CHECK-NEXT: st.b64 [%rd14], %rd16;
+; CHECK-NEXT: ld.b64 %rd21, [%SP+8];
+; CHECK-NEXT: ld.b64 %rd22, [%SP];
+; CHECK-NEXT: ld.b64 %rd23, [%SP+24];
+; CHECK-NEXT: ld.b64 %rd24, [%SP+16];
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1072], {%rd19, %rd20};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1056], {%rd17, %rd18};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1040], {%rd24, %rd23};
+; CHECK-NEXT: st.shared.v2.b64 [%rd1+1024], {%rd22, %rd21};
+; CHECK-NEXT: ret;
+entry:
+ %offset.0 = getelementptr double, ptr addrspace(3) %shared.mem, i32 0
+ %element.0 = load double, ptr addrspace(3) %offset.0, align 64
+ %offset.1 = getelementptr double, ptr addrspace(3) %shared.mem, i32 1
+ %element.1 = load double, ptr addrspace(3) %offset.1, align 8
+ %offset.2 = getelementptr double, ptr addrspace(3) %shared.mem, i32 2
+ %element.2 = load double, ptr addrspace(3) %offset.2, align 8
+ %offset.3 = getelementptr double, ptr addrspace(3) %shared.mem, i32 3
+ %element.3 = load double, ptr addrspace(3) %offset.3, align 8
+ %offset.4 = getelementptr double, ptr addrspace(3) %shared.mem, i32 4
+ %element.4 = load double, ptr addrspace(3) %offset.4, align 8
+ %offset.5 = getelementptr double, ptr addrspace(3) %shared.mem, i32 5
+ %element.5 = load double, ptr addrspace(3) %offset.5, align 8
+ %offset.6 = getelementptr double, ptr addrspace(3) %shared.mem, i32 6
+ %element.6 = load double, ptr addrspace(3) %offset.6, align 8
+ %offset.7 = getelementptr double, ptr addrspace(3) %shared.mem, i32 7
+ %element.7 = load double, ptr addrspace(3) %offset.7, align 8
+ %vector.build0 = insertelement <8 x double> %vector, double %element.0, i32 0
+ %vector.build1 = insertelement <8 x double> %vector.build0, double %element.1, i32 1
+ %vector.build2 = insertelement <8 x double> %vector.build1, double %element.2, i32 2
+ %vector.build3 = insertelement <8 x double> %vector.build2, double %element.3, i32 %idx0
+ %vector.build4 = insertelement <8 x double> %vector.build3, double %element.4, i32 4
+ %vector.build5 = insertelement <8 x double> %vector.build4, double %element.5, i32 5
+ %vector.build6 = insertelement <8 x double> %vector.build5, double %element.6, i32 6
+ %vector.build7 = insertelement <8 x double> %vector.build6, double %element.7, i32 7
+ %location = getelementptr i8, ptr addrspace(3) %shared.mem, i32 1024
+ store <8 x double> %vector.build7, ptr addrspace(3) %location, align 64
+ ret void
+}