[llvm] [NVPTX] fix type propagation when expanding Store[V4 -> V8] (PR #151576)
Princeton Ferro via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 31 14:11:09 PDT 2025
https://github.com/Prince781 updated https://github.com/llvm/llvm-project/pull/151576
From 8946daba950c38184ec6caecce94a3167d3dc7e4 Mon Sep 17 00:00:00 2001
From: Princeton Ferro <pferro at nvidia.com>
Date: Thu, 31 Jul 2025 11:28:10 -0700
Subject: [PATCH] [NVPTX] fix type propagation when expanding Store[V4 -> V8]
Propagate the correct memory type when expanding a StoreV4 of <2 x float>
operands into a StoreV8 of float operands.
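
For illustration, a minimal sketch (not taken from the patch; the function
name is made up): on a target where <2 x float> is a legal packed type
(e.g. sm_100, as in the new test), a store such as

  define void @example(<8 x float> %a, <8 x float> %b, ptr addrspace(1) %p) {
    %d = fdiv <8 x float> %a, %b
    store <8 x float> %d, ptr addrspace(1) %p, align 32
    ret void
  }

is legalized to a StoreV4 whose operands are <2 x float>. Since fdiv is not
available for f32x2, the lanes are produced as scalar f32 values and
reassembled with BUILD_VECTORs, which combinePackingMovIntoStore folds by
widening the node to a StoreV8 of f32 operands. When rebuilding the node,
the memory type should stay the original store's memory VT
(ST->getMemoryVT(), v8f32 here) rather than being re-derived from the
operand element type.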
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 12 +++----
llvm/test/CodeGen/NVPTX/fold-movs.ll | 38 +++++++++++++++++++++
2 files changed, 42 insertions(+), 8 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/fold-movs.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4fd362303b6e5..65d1be3a3847d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4917,7 +4917,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return SDValue();
auto *LD = cast<MemSDNode>(N);
- EVT MemVT = LD->getMemoryVT();
SDLoc DL(LD);
// the new opcode after we double the number of operands
@@ -4958,9 +4957,9 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
// Create the new load
- SDValue NewLoad =
- DCI.DAG.getMemIntrinsicNode(Opcode, DL, DCI.DAG.getVTList(NewVTs),
- Operands, MemVT, LD->getMemOperand());
+ SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
+ Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
+ LD->getMemOperand());
// Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
// the outputs the same. These nodes will be optimized away in later
@@ -5002,7 +5001,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
return SDValue();
auto *ST = cast<MemSDNode>(N);
- EVT MemVT = ElementVT.getVectorElementType();
// The new opcode after we double the number of operands.
NVPTXISD::NodeType Opcode;
@@ -5011,11 +5009,9 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
// Any packed type is legal, so the legalizer will not have lowered
// ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
// it here.
- MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV2;
break;
case NVPTXISD::StoreV2:
- MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV4;
break;
case NVPTXISD::StoreV4:
@@ -5066,7 +5062,7 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
// Now we replace the store
return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
- MemVT, ST->getMemOperand());
+ ST->getMemoryVT(), ST->getMemOperand());
}
static SDValue PerformStoreCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/NVPTX/fold-movs.ll b/llvm/test/CodeGen/NVPTX/fold-movs.ll
new file mode 100644
index 0000000000000..41e86cee766dd
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fold-movs.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefixes=CHECK-F32X2
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+target triple = "nvptx64-nvidia-cuda"
+
+; Since fdiv doesn't support f32x2, this will create BUILD_VECTORs that will be
+; folded into the store, turning it into st.global.v8.b32.
+define void @writevec(<8 x float> %v1, <8 x float> %v2, ptr addrspace(1) %p) {
+; CHECK-F32X2-LABEL: writevec(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<25>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [writevec_param_0];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [writevec_param_0+16];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [writevec_param_1];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [writevec_param_1+16];
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [writevec_param_2];
+; CHECK-F32X2-NEXT: div.rn.f32 %r17, %r8, %r16;
+; CHECK-F32X2-NEXT: div.rn.f32 %r18, %r7, %r15;
+; CHECK-F32X2-NEXT: div.rn.f32 %r19, %r6, %r14;
+; CHECK-F32X2-NEXT: div.rn.f32 %r20, %r5, %r13;
+; CHECK-F32X2-NEXT: div.rn.f32 %r21, %r4, %r12;
+; CHECK-F32X2-NEXT: div.rn.f32 %r22, %r3, %r11;
+; CHECK-F32X2-NEXT: div.rn.f32 %r23, %r2, %r10;
+; CHECK-F32X2-NEXT: div.rn.f32 %r24, %r1, %r9;
+; CHECK-F32X2-NEXT: st.global.v8.b32 [%rd1], {%r24, %r23, %r22, %r21, %r20, %r19, %r18, %r17};
+; CHECK-F32X2-NEXT: ret;
+ %v = fdiv <8 x float> %v1, %v2
+ store <8 x float> %v, ptr addrspace(1) %p, align 32
+ ret void
+}