[llvm] [NVPTX] support dynamic allocas with PTX alloca instruction (PR #84585)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 8 16:04:29 PST 2024
https://github.com/AlexMaclean created https://github.com/llvm/llvm-project/pull/84585
Add support for dynamically sized alloca instructions by lowering them to the PTX `alloca` instruction introduced in PTX 7.3 (see [9.7.15.3. Stack Manipulation Instructions: alloca](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca)).
From 15660e33872fbabe86dca6ae29ec33146cada10f Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Fri, 8 Mar 2024 06:45:59 +0000
Subject: [PATCH] [NVPTX] support dynamic allocas with PTX alloca instruction
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 29 ++++++++-----
llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 1 +
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 22 ++++++++++
llvm/test/CodeGen/Generic/ForceStackAlign.ll | 3 --
llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll | 43 ++++++++++++++++---
5 files changed, 78 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c979c03dc1b835..ac7d03895eba63 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -645,8 +645,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
- // Lowering of DYNAMIC_STACKALLOC is unsupported.
- // Custom lower to produce an error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
@@ -937,6 +935,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFE)
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
+ MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::SETP_F16X2)
MAKE_CASE(NVPTXISD::SETP_BF16X2)
MAKE_CASE(NVPTXISD::Dummy)
@@ -2211,14 +2210,24 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- const Function &Fn = DAG.getMachineFunction().getFunction();
-
- DiagnosticInfoUnsupported NoDynamicAlloca(
- Fn, "dynamic alloca unsupported by NVPTX backend",
- SDLoc(Op).getDebugLoc());
- DAG.getContext()->diagnose(NoDynamicAlloca);
- auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
- return DAG.getMergeValues(Ops, SDLoc());
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ SDLoc DL(Op.getNode());
+
+ // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
+ if (nvTM->is64Bit())
+ Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
+ else
+ Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
+
+ SDValue AllocOps[] = {Chain, Size,
+ DAG.getTargetConstant(Align, DL, MVT::i32)};
+ SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
+ nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
+
+ SDValue MergeOps[] = {Alloca, Chain};
+ return DAG.getMergeValues(MergeOps, DL);
}
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cf1d4580766918..c9db10e555cefe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -61,6 +61,7 @@ enum NodeType : unsigned {
BFE,
BFI,
PRMT,
+ DYNAMIC_STACKALLOC,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3dc5b450cbf5cf..25d3eb5bed8fc1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3805,6 +3805,28 @@ def CALL_PROTOTYPE :
NVPTXInst<(outs), (ins ProtoIdent:$ident),
"$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+def SDTDynAllocaOp :
+ SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
+
+def dyn_alloca :
+ SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def DYNAMIC_STACKALLOC32 :
+ NVPTXInst<(outs Int32Regs:$ptr),
+ (ins Int32Regs:$size, i32imm:$align),
+ "alloca.u32 \t$ptr, $size, $align;\n\t"
+ "cvta.local.u32 \t$ptr, $ptr;",
+ [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
+ Requires<[hasPTX<73>]>;
+
+def DYNAMIC_STACKALLOC64 :
+ NVPTXInst<(outs Int64Regs:$ptr),
+ (ins Int64Regs:$size, i32imm:$align),
+ "alloca.u64 \t$ptr, $size, $align;\n\t"
+ "cvta.local.u64 \t$ptr, $ptr;",
+ [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
+ Requires<[hasPTX<73>]>;
include "NVPTXIntrinsics.td"
diff --git a/llvm/test/CodeGen/Generic/ForceStackAlign.ll b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
index 2c35ad350f0473..0cec2895adc8a7 100644
--- a/llvm/test/CodeGen/Generic/ForceStackAlign.ll
+++ b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
@@ -8,9 +8,6 @@
; Stack realignment not supported.
; XFAIL: target=sparc{{.*}}
-; NVPTX cannot select dynamic_stackalloc
-; XFAIL: target=nvptx{{.*}}
-
define i32 @f(ptr %p) nounwind {
entry:
%0 = load i8, ptr %p
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 3ef55ca5309f88..3d5ec7c90630a1 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -1,10 +1,39 @@
-; RUN: not llc -march=nvptx < %s 2>&1 | FileCheck %s
-; RUN: not llc -march=nvptx64 < %s 2>&1 | FileCheck %s
-; CHECK: in function test_dynamic_stackalloc{{.*}}: dynamic alloca unsupported by NVPTX backend
+; RUN: llc < %s -march=nvptx -mattr=+ptx73 | FileCheck %s --check-prefixes=CHECK,CHECK-32
+; RUN: llc < %s -march=nvptx64 -mattr=+ptx73 | FileCheck %s --check-prefixes=CHECK,CHECK-64
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mattr=+ptx73 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mattr=+ptx73 | %ptxas-verify %}
-define void @test_dynamic_stackalloc(i64 %n) {
- %alloca = alloca i32, i64 %n
- store volatile i32 0, ptr %alloca
- ret void
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
+
+; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-32-NEXT: mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7;
+; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
+; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
+; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]];
+; CHECK-32-NEXT: { // callseq 0, 0
+; CHECK-32-NEXT: .reg .b32 temp_param_reg;
+; CHECK-32-NEXT: .param .b32 param0;
+; CHECK-32-NEXT: st.param.b32 [param0+0], %r[[ALLOCA]];
+
+; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
+; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
+; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
+; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]];
+; CHECK-64-NEXT: { // callseq 0, 0
+; CHECK-64-NEXT: .reg .b32 temp_param_reg;
+; CHECK-64-NEXT: .param .b64 param0;
+; CHECK-64-NEXT: st.param.b64 [param0+0], %rd[[ALLOCA]];
+
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: bar,
+
+define i32 @test_dynamic_stackalloc(i64 %n) {
+ %alloca = alloca i8, i64 %n, align 16
+ %call = call i32 @bar(ptr %alloca)
+ ret i32 %call
}
+
+declare i32 @bar(ptr)
\ No newline at end of file
More information about the llvm-commits mailing list