[llvm] 89b7b3b - [NVPTX] support dynamic allocas with PTX alloca instruction (#84585)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 15 11:51:50 PDT 2024
Author: Alex MacLean
Date: 2024-03-15T11:51:46-07:00
New Revision: 89b7b3b9952210fbd9bd0db95385bfed69ffc7a3
URL: https://github.com/llvm/llvm-project/commit/89b7b3b9952210fbd9bd0db95385bfed69ffc7a3
DIFF: https://github.com/llvm/llvm-project/commit/89b7b3b9952210fbd9bd0db95385bfed69ffc7a3.diff
LOG: [NVPTX] support dynamic allocas with PTX alloca instruction (#84585)
Add support for dynamically sized alloca instructions with the PTX
alloca instruction introduced in PTX 7.3
([9.7.15.3. Stack Manipulation Instructions: alloca](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca)).
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
llvm/lib/Target/NVPTX/NVPTXISelLowering.h
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
llvm/test/CodeGen/Generic/ForceStackAlign.ll
llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 86fb367780dc1a..c34472c21ccbe9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -60,10 +60,12 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
NRI->getFrameRegister(MF))
.addReg(NRI->getFrameLocalRegister(MF));
}
- BuildMI(MBB, MBBI, dl,
- MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
- NRI->getFrameLocalRegister(MF))
- .addImm(MF.getFunctionNumber());
+ if (!MR.use_empty(NRI->getFrameLocalRegister(MF))) {
+ BuildMI(MBB, MBBI, dl,
+ MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
+ NRI->getFrameLocalRegister(MF))
+ .addImm(MF.getFunctionNumber());
+ }
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index c411c8ef9528d7..7ad3d99ab71f20 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -645,8 +645,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);
- // Lowering of DYNAMIC_STACKALLOC is unsupported.
- // Custom lower to produce an error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
@@ -937,6 +935,7 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::BFE)
MAKE_CASE(NVPTXISD::BFI)
MAKE_CASE(NVPTXISD::PRMT)
+ MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
MAKE_CASE(NVPTXISD::SETP_F16X2)
MAKE_CASE(NVPTXISD::SETP_BF16X2)
MAKE_CASE(NVPTXISD::Dummy)
@@ -2211,14 +2210,39 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
- const Function &Fn = DAG.getMachineFunction().getFunction();
-
- DiagnosticInfoUnsupported NoDynamicAlloca(
- Fn, "dynamic alloca unsupported by NVPTX backend",
- SDLoc(Op).getDebugLoc());
- DAG.getContext()->diagnose(NoDynamicAlloca);
- auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
- return DAG.getMergeValues(Ops, SDLoc());
+
+ if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+
+ DiagnosticInfoUnsupported NoDynamicAlloca(
+ Fn,
+ "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
+ "requires target sm_52.",
+ SDLoc(Op).getDebugLoc());
+ DAG.getContext()->diagnose(NoDynamicAlloca);
+ auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
+ Op.getOperand(0)};
+ return DAG.getMergeValues(Ops, SDLoc());
+ }
+
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ uint64_t Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ SDLoc DL(Op.getNode());
+
+ // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
+ if (nvTM->is64Bit())
+ Size = DAG.getZExtOrTrunc(Size, DL, MVT::i64);
+ else
+ Size = DAG.getZExtOrTrunc(Size, DL, MVT::i32);
+
+ SDValue AllocOps[] = {Chain, Size,
+ DAG.getTargetConstant(Align, DL, MVT::i32)};
+ SDValue Alloca = DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL,
+ nvTM->is64Bit() ? MVT::i64 : MVT::i32, AllocOps);
+
+ SDValue MergeOps[] = {Alloca, Chain};
+ return DAG.getMergeValues(MergeOps, DL);
}
// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index cf1d4580766918..c9db10e555cefe 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -61,6 +61,7 @@ enum NodeType : unsigned {
BFE,
BFI,
PRMT,
+ DYNAMIC_STACKALLOC,
Dummy,
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3dc5b450cbf5cf..3d387ed574fabc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3805,6 +3805,28 @@ def CALL_PROTOTYPE :
NVPTXInst<(outs), (ins ProtoIdent:$ident),
"$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+def SDTDynAllocaOp :
+ SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisInt<1>, SDTCisInt<2>]>;
+
+def dyn_alloca :
+ SDNode<"NVPTXISD::DYNAMIC_STACKALLOC", SDTDynAllocaOp,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+def DYNAMIC_STACKALLOC32 :
+ NVPTXInst<(outs Int32Regs:$ptr),
+ (ins Int32Regs:$size, i32imm:$align),
+ "alloca.u32 \t$ptr, $size, $align;\n\t"
+ "cvta.local.u32 \t$ptr, $ptr;",
+ [(set (i32 Int32Regs:$ptr), (dyn_alloca Int32Regs:$size, (i32 timm:$align)))]>,
+ Requires<[hasPTX<73>, hasSM<52>]>;
+
+def DYNAMIC_STACKALLOC64 :
+ NVPTXInst<(outs Int64Regs:$ptr),
+ (ins Int64Regs:$size, i32imm:$align),
+ "alloca.u64 \t$ptr, $size, $align;\n\t"
+ "cvta.local.u64 \t$ptr, $ptr;",
+ [(set Int64Regs:$ptr, (dyn_alloca Int64Regs:$size, (i32 timm:$align)))]>,
+ Requires<[hasPTX<73>, hasSM<52>]>;
include "NVPTXIntrinsics.td"
diff --git a/llvm/test/CodeGen/Generic/ForceStackAlign.ll b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
index 2c35ad350f0473..7993b3eff65b68 100644
--- a/llvm/test/CodeGen/Generic/ForceStackAlign.ll
+++ b/llvm/test/CodeGen/Generic/ForceStackAlign.ll
@@ -8,7 +8,7 @@
; Stack realignment not supported.
; XFAIL: target=sparc{{.*}}
-; NVPTX cannot select dynamic_stackalloc
+; NVPTX can only select dynamic_stackalloc on sm_52+ and with ptx73+
; XFAIL: target=nvptx{{.*}}
define i32 @f(ptr %p) nounwind {
diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
index 3ef55ca5309f88..2db0c67800a259 100644
--- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll
@@ -1,10 +1,44 @@
-; RUN: not llc -march=nvptx < %s 2>&1 | FileCheck %s
-; RUN: not llc -march=nvptx64 < %s 2>&1 | FileCheck %s
+; RUN: not llc < %s -march=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
+; RUN: not llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
-; CHECK: in function test_dynamic_stackalloc{{.*}}: dynamic alloca unsupported by NVPTX backend
+; RUN: llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
+; RUN: llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
-define void @test_dynamic_stackalloc(i64 %n) {
- %alloca = alloca i32, i64 %n
- store volatile i32 0, ptr %alloca
- ret void
+; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
+
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
+; CHECK-NOT: __local_depot
+
+; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-32-NEXT: mad.lo.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 1, 7;
+; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
+; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
+; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]];
+; CHECK-32-NEXT: { // callseq 0, 0
+; CHECK-32-NEXT: .reg .b32 temp_param_reg;
+; CHECK-32-NEXT: .param .b32 param0;
+; CHECK-32-NEXT: st.param.b32 [param0+0], %r[[ALLOCA]];
+
+; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
+; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
+; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
+; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
+; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]];
+; CHECK-64-NEXT: { // callseq 0, 0
+; CHECK-64-NEXT: .reg .b32 temp_param_reg;
+; CHECK-64-NEXT: .param .b64 param0;
+; CHECK-64-NEXT: st.param.b64 [param0+0], %rd[[ALLOCA]];
+
+; CHECK-NEXT: .param .b32 retval0;
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: bar,
+
+define i32 @test_dynamic_stackalloc(i64 %n) {
+ %alloca = alloca i8, i64 %n, align 16
+ %call = call i32 @bar(ptr %alloca)
+ ret i32 %call
}
+
+declare i32 @bar(ptr)
\ No newline at end of file
More information about the llvm-commits
mailing list