[llvm] 92ca087 - [NVPTX] fix type propagation when expanding Store[V4 -> V8] (#151576)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 31 16:52:45 PDT 2025
Author: Princeton Ferro
Date: 2025-07-31T16:52:42-07:00
New Revision: 92ca087b456c6cb1b1005e7d225bdd4ad6f6eeec
URL: https://github.com/llvm/llvm-project/commit/92ca087b456c6cb1b1005e7d225bdd4ad6f6eeec
DIFF: https://github.com/llvm/llvm-project/commit/92ca087b456c6cb1b1005e7d225bdd4ad6f6eeec.diff
LOG: [NVPTX] fix type propagation when expanding Store[V4 -> V8] (#151576)
This was an edge case we missed: propagate the correct memory type when
expanding a StoreV4 of <2 x float> operands into a StoreV8 of float
operands.
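
For context, a simplified sketch of the combine after this change. This is
hedged: it compresses combinePackingMovIntoStore and elides the
operand-unpacking loop; see the diff below for the real code.

    // Sketch (simplified from combinePackingMovIntoStore): the memory type
    // of the widened store now always comes from the original MemSDNode, so
    // expanding StoreV4 x <2 x float> into StoreV8 x float keeps the v8f32
    // memory VT instead of decaying to the f32 element type.
    auto *ST = cast<MemSDNode>(N);
    NVPTXISD::NodeType Opcode;
    switch (N->getOpcode()) {
    case ISD::STORE:        Opcode = NVPTXISD::StoreV2; break;
    case NVPTXISD::StoreV2: Opcode = NVPTXISD::StoreV4; break;
    case NVPTXISD::StoreV4: Opcode = NVPTXISD::StoreV8; break;
    default:                return SDValue();
    }
    SmallVector<SDValue, 16> Operands;
    // ... Operands is filled by unpacking each packed value operand ...
    return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(),
                                       Operands, ST->getMemoryVT(),
                                       ST->getMemOperand());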
Added:
llvm/test/CodeGen/NVPTX/fold-movs.ll
Modified:
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 4fd362303b6e5..65d1be3a3847d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4917,7 +4917,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
return SDValue();
auto *LD = cast<MemSDNode>(N);
- EVT MemVT = LD->getMemoryVT();
SDLoc DL(LD);
// the new opcode after we double the number of operands
@@ -4958,9 +4957,9 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
// Create the new load
- SDValue NewLoad =
- DCI.DAG.getMemIntrinsicNode(Opcode, DL, DCI.DAG.getVTList(NewVTs),
- Operands, MemVT, LD->getMemOperand());
+ SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
+ Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
+ LD->getMemOperand());
// Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
// the outputs the same. These nodes will be optimized away in later
@@ -5002,7 +5001,6 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
return SDValue();
auto *ST = cast<MemSDNode>(N);
- EVT MemVT = ElementVT.getVectorElementType();
// The new opcode after we double the number of operands.
NVPTXISD::NodeType Opcode;
@@ -5011,11 +5009,9 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
// Any packed type is legal, so the legalizer will not have lowered
// ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
// it here.
- MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV2;
break;
case NVPTXISD::StoreV2:
- MemVT = ST->getMemoryVT();
Opcode = NVPTXISD::StoreV4;
break;
case NVPTXISD::StoreV4:
@@ -5066,7 +5062,7 @@ static SDValue combinePackingMovIntoStore(SDNode *N,
// Now we replace the store
return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
- MemVT, ST->getMemOperand());
+ ST->getMemoryVT(), ST->getMemOperand());
}
static SDValue PerformStoreCombine(SDNode *N,
diff --git a/llvm/test/CodeGen/NVPTX/fold-movs.ll b/llvm/test/CodeGen/NVPTX/fold-movs.ll
new file mode 100644
index 0000000000000..6ee0fb2eeed29
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fold-movs.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs \
+; RUN: | FileCheck %s --check-prefixes=CHECK-F32X2
+; RUN: %if ptxas-12.7 %{ \
+; RUN: llc < %s -mcpu=sm_100 -mattr=+ptx88 -O3 -disable-post-ra \
+; RUN: -frame-pointer=all -verify-machineinstrs | %ptxas-verify -arch=sm_100 \
+; RUN: %}
+target triple = "nvptx64-nvidia-cuda"
+
+; Since fdiv doesn't support f32x2, this will create BUILD_VECTORs that will be
+; folded into the store, turning it into st.global.v8.b32.
+define void @writevec(<8 x float> %v1, <8 x float> %v2, ptr addrspace(1) %p) {
+; CHECK-F32X2-LABEL: writevec(
+; CHECK-F32X2: {
+; CHECK-F32X2-NEXT: .reg .b32 %r<25>;
+; CHECK-F32X2-NEXT: .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT: // %bb.0:
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [writevec_param_0];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [writevec_param_0+16];
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [writevec_param_1+16];
+; CHECK-F32X2-NEXT: div.rn.f32 %r13, %r8, %r12;
+; CHECK-F32X2-NEXT: div.rn.f32 %r14, %r7, %r11;
+; CHECK-F32X2-NEXT: div.rn.f32 %r15, %r6, %r10;
+; CHECK-F32X2-NEXT: div.rn.f32 %r16, %r5, %r9;
+; CHECK-F32X2-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [writevec_param_1];
+; CHECK-F32X2-NEXT: div.rn.f32 %r21, %r4, %r20;
+; CHECK-F32X2-NEXT: div.rn.f32 %r22, %r3, %r19;
+; CHECK-F32X2-NEXT: div.rn.f32 %r23, %r2, %r18;
+; CHECK-F32X2-NEXT: div.rn.f32 %r24, %r1, %r17;
+; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [writevec_param_2];
+; CHECK-F32X2-NEXT: st.global.v8.b32 [%rd1], {%r24, %r23, %r22, %r21, %r16, %r15, %r14, %r13};
+; CHECK-F32X2-NEXT: ret;
+ %v = fdiv <8 x float> %v1, %v2
+ store <8 x float> %v, ptr addrspace(1) %p, align 32
+ ret void
+}
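
As the comment in the test above notes, the scalarized fdivs feed
BUILD_VECTORs of <2 x float>, which the store combine peels back apart. A
rough, illustrative sketch of that unpacking step follows; the operand
layout is simplified and this is not the exact upstream loop:

    // Illustrative only: forward the scalar lanes of each packed
    // BUILD_VECTOR operand straight into the widened store's operand list.
    SmallVector<SDValue, 16> Operands;
    Operands.push_back(ST->getChain());
    for (unsigned I = 1; I <= 4; ++I) {
      SDValue Val = ST->getOperand(I); // a BUILD_VECTOR of two fdiv results
      Operands.push_back(Val.getOperand(0)); // low f32 lane
      Operands.push_back(Val.getOperand(1)); // high f32 lane
    }
    // ... the remaining (address) operands are appended unchanged ...

With the memory type now taken from the original store, the resulting
StoreV8 keeps the v8f32 memory VT and selects to st.global.v8.b32, as
checked above.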