[llvm] 831592d - [NVPTX] Fixup under-aligned dynamic alloca lowering (#139628)
via llvm-commits
llvm-commits at lists.llvm.org
Tue May 13 09:56:44 PDT 2025
Author: Alex MacLean
Date: 2025-05-13T09:56:41-07:00
New Revision: 831592d6171bc62f6991387546512b9cf2ce1c9e
URL: https://github.com/llvm/llvm-project/commit/831592d6171bc62f6991387546512b9cf2ce1c9e
DIFF: https://github.com/llvm/llvm-project/commit/831592d6171bc62f6991387546512b9cf2ce1c9e.diff
LOG: [NVPTX] Fixup under-aligned dynamic alloca lowering (#139628)
The alignment on an ISD::DYNAMIC_STACKALLOC node may be 0 to indicate
that the default stack alignment should be used. Prior to this change,
we passed this alignment through unchanged, leading to an error in
ptxas. Now, we use the stack alignment in this case. Also did a little
cleanup while I'm here.
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 3769aae7b620f..8bf0723220093 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2056,18 +2056,28 @@ SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
return DAG.getMergeValues(Ops, SDLoc());
}
+ SDLoc DL(Op.getNode());
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- SDLoc DL(Op.getNode());
+ uint64_t Align = Op.getConstantOperandVal(2);
+
+ // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
+ // the default stack alignment should be used.
+ if (Align == 0)
+ Align = DAG.getSubtarget().getFrameLowering()->getStackAlign().value();
// The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
- MVT ValueSizeTy = nvTM->is64Bit() ? MVT::i64 : MVT::i32;
+ const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
+
+ SDValue Alloc =
+ DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
+ {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
+ DAG.getTargetConstant(Align, DL, MVT::i32)});
+
+ SDValue ASC = DAG.getAddrSpaceCast(
+ DL, Op.getValueType(), Alloc, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
- SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size, DL, ValueSizeTy),
- DAG.getTargetConstant(Align, DL, MVT::i32)};
- EVT RetTypes[] = {ValueSizeTy, MVT::Other};
- return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, RetTypes, AllocOps);
+ return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
}
SDValue NVPTXTargetLowering::LowerSTACKRESTORE(SDValue Op,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6639554e450f2..a90dfe7a0e6ca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3102,28 +3102,20 @@ def CALL_PROTOTYPE :
"$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
def SDTDynAllocaOp :
- SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
+ SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisVT<2, i32>]>;
def dyn_alloca :
SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
[SDNPHasChain, SDNPSideEffect]>;
-def DYNAMIC_STACKALLOC32 :
- NVPTXInst<(outs Int32Regs:$ptr),
- (ins Int32Regs:$size, i32imm:$align),
- "alloca.u32 \t$ptr, $size, $align;\n\t"
- "cvta.local.u32 \t$ptr, $ptr;",
- [(set i32:$ptr, (dyn_alloca i32:$size, (i32 timm:$align)))]>,
- Requires<[hasPTX<73>, hasSM<52>]>;
-
-def DYNAMIC_STACKALLOC64 :
- NVPTXInst<(outs Int64Regs:$ptr),
- (ins Int64Regs:$size, i32imm:$align),
- "alloca.u64 \t$ptr, $size, $align;\n\t"
- "cvta.local.u64 \t$ptr, $ptr;",
- [(set i64:$ptr, (dyn_alloca i64:$size, (i32 timm:$align)))]>,
- Requires<[hasPTX<73>, hasSM<52>]>;
-
+foreach t = [I32RT, I64RT] in {
+ def DYNAMIC_STACKALLOC # t.Size :
+ NVPTXInst<(outs t.RC:$ptr),
+ (ins t.RC:$size, i32imm:$align),
+ "alloca.u" # t.Size # " \t$ptr, $size, $align;",
+ [(set t.Ty:$ptr, (dyn_alloca t.Ty:$size, timm:$align))]>,
+ Requires<[hasPTX<73>, hasSM<52>]>;
+}
//
// BRX
diff --git a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
index f70831cc97ae1..0474d82556c1e 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll
@@ -6,20 +6,20 @@ target triple = "nvptx64-nvidia-cuda"
define void @foo(i64 %a, ptr %p0, ptr %p1) {
; CHECK-LABEL: foo(
; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<8>;
+; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
; CHECK-NEXT: add.s64 %rd2, %rd1, 7;
; CHECK-NEXT: and.b64 %rd3, %rd2, -8;
; CHECK-NEXT: alloca.u64 %rd4, %rd3, 16;
-; CHECK-NEXT: cvta.local.u64 %rd4, %rd4;
-; CHECK-NEXT: ld.param.b64 %rd5, [foo_param_1];
-; CHECK-NEXT: alloca.u64 %rd6, %rd3, 16;
-; CHECK-NEXT: cvta.local.u64 %rd6, %rd6;
-; CHECK-NEXT: ld.param.b64 %rd7, [foo_param_2];
-; CHECK-NEXT: st.b64 [%rd5], %rd4;
-; CHECK-NEXT: st.b64 [%rd7], %rd6;
+; CHECK-NEXT: cvta.local.u64 %rd5, %rd4;
+; CHECK-NEXT: ld.param.b64 %rd6, [foo_param_1];
+; CHECK-NEXT: alloca.u64 %rd7, %rd3, 16;
+; CHECK-NEXT: cvta.local.u64 %rd8, %rd7;
+; CHECK-NEXT: ld.param.b64 %rd9, [foo_param_2];
+; CHECK-NEXT: st.b64 [%rd6], %rd5;
+; CHECK-NEXT: st.b64 [%rd9], %rd8;
; CHECK-NEXT: ret;
%b = alloca i8, i64 %a, align 16
%c = alloca i8, i64 %a, align 16
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 664569e3c525c..28bef0de48166 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -1,42 +1,103 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
-; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
-; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
+; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-32
+; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK-64
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
-; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
-; CHECK-NOT: __local_depot
-
-; CHECK-32: ld.param.b32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
-; CHECK-32-NEXT: add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7;
-; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
-; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
-; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]];
-; CHECK-32-NEXT: { // callseq 0, 0
-; CHECK-32-NEXT: .param .b32 param0;
-; CHECK-32-NEXT: st.param.b32 [param0], %r[[ALLOCA]];
-
-; CHECK-64: ld.param.b64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
-; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
-; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
-; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
-; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]];
-; CHECK-64-NEXT: { // callseq 0, 0
-; CHECK-64-NEXT: .param .b64 param0;
-; CHECK-64-NEXT: st.param.b64 [param0], %rd[[ALLOCA]];
-
-; CHECK-NEXT: .param .b32 retval0;
-; CHECK-NEXT: call.uni (retval0),
-; CHECK-NEXT: bar,
-
define i32 @test_dynamic_stackalloc(i64 %n) {
+; CHECK-32-LABEL: test_dynamic_stackalloc(
+; CHECK-32: {
+; CHECK-32-NEXT: .reg .b32 %r<8>;
+; CHECK-32-EMPTY:
+; CHECK-32-NEXT: // %bb.0:
+; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_param_0];
+; CHECK-32-NEXT: add.s32 %r2, %r1, 7;
+; CHECK-32-NEXT: and.b32 %r3, %r2, -8;
+; CHECK-32-NEXT: alloca.u32 %r4, %r3, 16;
+; CHECK-32-NEXT: cvta.local.u32 %r5, %r4;
+; CHECK-32-NEXT: { // callseq 0, 0
+; CHECK-32-NEXT: .param .b32 param0;
+; CHECK-32-NEXT: st.param.b32 [param0], %r5;
+; CHECK-32-NEXT: .param .b32 retval0;
+; CHECK-32-NEXT: call.uni (retval0),
+; CHECK-32-NEXT: bar,
+; CHECK-32-NEXT: (
+; CHECK-32-NEXT: param0
+; CHECK-32-NEXT: );
+; CHECK-32-NEXT: ld.param.b32 %r6, [retval0];
+; CHECK-32-NEXT: } // callseq 0
+; CHECK-32-NEXT: st.param.b32 [func_retval0], %r6;
+; CHECK-32-NEXT: ret;
+;
+; CHECK-64-LABEL: test_dynamic_stackalloc(
+; CHECK-64: {
+; CHECK-64-NEXT: .reg .b32 %r<3>;
+; CHECK-64-NEXT: .reg .b64 %rd<6>;
+; CHECK-64-EMPTY:
+; CHECK-64-NEXT: // %bb.0:
+; CHECK-64-NEXT: ld.param.b64 %rd1, [test_dynamic_stackalloc_param_0];
+; CHECK-64-NEXT: add.s64 %rd2, %rd1, 7;
+; CHECK-64-NEXT: and.b64 %rd3, %rd2, -8;
+; CHECK-64-NEXT: alloca.u64 %rd4, %rd3, 16;
+; CHECK-64-NEXT: cvta.local.u64 %rd5, %rd4;
+; CHECK-64-NEXT: { // callseq 0, 0
+; CHECK-64-NEXT: .param .b64 param0;
+; CHECK-64-NEXT: st.param.b64 [param0], %rd5;
+; CHECK-64-NEXT: .param .b32 retval0;
+; CHECK-64-NEXT: call.uni (retval0),
+; CHECK-64-NEXT: bar,
+; CHECK-64-NEXT: (
+; CHECK-64-NEXT: param0
+; CHECK-64-NEXT: );
+; CHECK-64-NEXT: ld.param.b32 %r1, [retval0];
+; CHECK-64-NEXT: } // callseq 0
+; CHECK-64-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-64-NEXT: ret;
%alloca = alloca i8, i64 %n, align 16
%call = call i32 @bar(ptr %alloca)
ret i32 %call
}
+define float @test_dynamic_stackalloc_unaligned(i64 %0) {
+; CHECK-32-LABEL: test_dynamic_stackalloc_unaligned(
+; CHECK-32: {
+; CHECK-32-NEXT: .reg .b32 %r<6>;
+; CHECK-32-NEXT: .reg .b32 %f<2>;
+; CHECK-32-EMPTY:
+; CHECK-32-NEXT: // %bb.0:
+; CHECK-32-NEXT: ld.param.b32 %r1, [test_dynamic_stackalloc_unaligned_param_0];
+; CHECK-32-NEXT: shl.b32 %r2, %r1, 2;
+; CHECK-32-NEXT: add.s32 %r3, %r2, 7;
+; CHECK-32-NEXT: and.b32 %r4, %r3, -8;
+; CHECK-32-NEXT: alloca.u32 %r5, %r4, 8;
+; CHECK-32-NEXT: ld.local.b32 %f1, [%r5];
+; CHECK-32-NEXT: st.param.b32 [func_retval0], %f1;
+; CHECK-32-NEXT: ret;
+;
+; CHECK-64-LABEL: test_dynamic_stackalloc_unaligned(
+; CHECK-64: {
+; CHECK-64-NEXT: .reg .b32 %f<2>;
+; CHECK-64-NEXT: .reg .b64 %rd<6>;
+; CHECK-64-EMPTY:
+; CHECK-64-NEXT: // %bb.0:
+; CHECK-64-NEXT: ld.param.b64 %rd1, [test_dynamic_stackalloc_unaligned_param_0];
+; CHECK-64-NEXT: shl.b64 %rd2, %rd1, 2;
+; CHECK-64-NEXT: add.s64 %rd3, %rd2, 7;
+; CHECK-64-NEXT: and.b64 %rd4, %rd3, -8;
+; CHECK-64-NEXT: alloca.u64 %rd5, %rd4, 8;
+; CHECK-64-NEXT: ld.local.b32 %f1, [%rd5];
+; CHECK-64-NEXT: st.param.b32 [func_retval0], %f1;
+; CHECK-64-NEXT: ret;
+ %4 = alloca float, i64 %0, align 4
+ %5 = getelementptr float, ptr %4, i64 0
+ %6 = load float, ptr %5, align 4
+ ret float %6
+}
+
declare i32 @bar(ptr)
+
More information about the llvm-commits
mailing list