[llvm] a773356 - [NVPTX][NFC] Move more TMA lowering to tablegen (#140914)
via llvm-commits
llvm-commits at lists.llvm.org
Thu May 29 00:16:04 PDT 2025
Author: Durgadoss R
Date: 2025-05-29T12:46:01+05:30
New Revision: a773356eaa0905dbd0d14f52a9e38ad3ed447539
URL: https://github.com/llvm/llvm-project/commit/a773356eaa0905dbd0d14f52a9e38ad3ed447539
DIFF: https://github.com/llvm/llvm-project/commit/a773356eaa0905dbd0d14f52a9e38ad3ed447539.diff
LOG: [NVPTX][NFC] Move more TMA lowering to tablegen (#140914)
This patch migrates the lowering of the
non-tensor TMA intrinsics to TableGen-based
patterns. It also uses ADDR nodes for the
pointer operands wherever applicable.
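To illustrate the shape of the migration, here is a minimal TableGen
sketch (hypothetical intrinsic and mnemonic, not part of the patch),
modeled on the CP_ASYNC_BULK_PREFETCH_INTR multiclass below: the
trailing i1 flag of the intrinsic is matched as an immediate in the
selection pattern, and the pointer operand uses ADDR/addr, so no
hand-written C++ selection code is needed.

  // Sketch only: int_example and "example.op" are placeholders.
  multiclass EXAMPLE_INTR<bit has_ch> {
    def NAME : NVPTXInst<(outs),
      (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
      !if(has_ch, "example.op.L2::cache_hint [$src], $size, $ch;",
                  "example.op [$src], $size;"),
      // has_ch fixes the immediate the flag operand must match
      // (-1 = true, 0 = false), so ISel picks the right variant
      // by pattern matching alone.
      [(int_example addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
      Requires<[hasPTX<80>, hasSM<90>]>;
  }
  defm EXAMPLE    : EXAMPLE_INTR<has_ch = 0>;
  defm EXAMPLE_CH : EXAMPLE_INTR<has_ch = 1>;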
Signed-off-by: Durgadoss R <durgadossr at nvidia.com>
Added:
Modified:
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index b05a4713e6340..5705f1d8e2aaa 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2686,69 +2686,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
- // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
- // {dst, mbar, src, size, multicast, cache_hint,
- // multicast_flag, cache_hint_flag}
- // NumOperands = {Chain, IID} + {Actual intrinsic args}
- // = {2} + {8}
- size_t NumOps = N->getNumOperands();
- bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
- bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
- size_t NumBaseArgs = 4; // dst, mbar, src, size
- size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
-
- SDLoc DL(N);
- SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
-
- // Push MultiCast operand, if available
- if (IsMultiCast)
- Ops.push_back(N->getOperand(MultiCastIdx));
-
- // Push CacheHint operand, if available
- if (IsCacheHint)
- Ops.push_back(N->getOperand(MultiCastIdx + 1));
-
- // Finally, the chain operand
- Ops.push_back(N->getOperand(0));
-
- bool IsShared32 =
- CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
- unsigned Opcode = [&]() {
- if (IsMultiCast && IsCacheHint)
- return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH
- : NVPTX::CP_ASYNC_BULK_G2S_MC_CH;
- if (IsMultiCast)
- return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC
- : NVPTX::CP_ASYNC_BULK_G2S_MC;
- if (IsCacheHint)
- return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH
- : NVPTX::CP_ASYNC_BULK_G2S_CH;
- return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32
- : NVPTX::CP_ASYNC_BULK_G2S;
- }();
- ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
- // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
- // src, size, cache_hint, cache_hint_flag
- // NumOperands = {Chain, IID} + {Actual intrinsic args}
- // = {2} + {4}
- size_t NumOps = N->getNumOperands();
- bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
- size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint
-
- SDLoc DL(N);
- SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
- Ops.push_back(N->getOperand(0)); // Chain operand
-
- unsigned Opcode = IsCacheHint
- ? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
- : NVPTX::CP_ASYNC_BULK_PREFETCH;
- ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
#define TCGEN05_ST_OPCODE(SHAPE, NUM) \
(enableUnpack ? NVPTX::TCGEN05_ST_##SHAPE##_##NUM##_UNPACK \
: NVPTX::TCGEN05_ST_##SHAPE##_##NUM)
@@ -2865,12 +2802,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
switch (IID) {
default:
return false;
- case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
- SelectCpAsyncBulkG2S(N);
- return true;
- case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
- SelectCpAsyncBulkPrefetchL2(N);
- return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 648e8e239cf78..71a5b7ff8cd30 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -92,8 +92,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
- void SelectCpAsyncBulkG2S(SDNode *N);
- void SelectCpAsyncBulkPrefetchL2(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8fb5884fa2a20..1b7eb1643634b 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -544,52 +544,50 @@ multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
[(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>,
Requires<[hasPTX<86>, hasSM<100>]>;
}
-defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>;
-defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>;
+defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<has_ch = 0>;
+defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<has_ch = 1>;
-multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
- def NAME: NVPTXInst<(outs),
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size),
- !strconcat(CpAsyncBulkStr<0, 0>.G2S, " [$dst], [$src], $size, [$mbar];"), []>,
- Requires<[hasPTX<80>, hasSM<90>]>;
- def NAME # _MC: NVPTXInst<(outs),
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc),
- !strconcat(CpAsyncBulkStr<1, 0>.G2S, " [$dst], [$src], $size, [$mbar], $mc;"), []>,
- Requires<[hasPTX<80>, hasSM<90>]>;
- def NAME # _CH: NVPTXInst<(outs),
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
- !strconcat(CpAsyncBulkStr<0, 1>.G2S, " [$dst], [$src], $size, [$mbar], $ch;"), []>,
- Requires<[hasPTX<80>, hasSM<90>]>;
- def NAME # _MC_CH: NVPTXInst<(outs),
- (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc, Int64Regs:$ch),
- !strconcat(CpAsyncBulkStr<1, 1>.G2S, " [$dst], [$src], $size, [$mbar], $mc, $ch;"), []>,
- Requires<[hasPTX<80>, hasSM<90>]>;
+multiclass CP_ASYNC_BULK_G2S_INTR<bit has_ch> {
+ defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster;
+
+ def NAME : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
+ Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
+ !if(has_ch,
+ CpAsyncBulkStr<0, 1>.G2S # " [$dst], [$src], $size, [$mbar], $ch;",
+ CpAsyncBulkStr<0, 0>.G2S # " [$dst], [$src], $size, [$mbar];"),
+ [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+ def NAME # _MC : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src,
+ Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
+ !if(has_ch,
+ CpAsyncBulkStr<1, 1>.G2S # " [$dst], [$src], $size, [$mbar], $mask, $ch;",
+ CpAsyncBulkStr<1, 0>.G2S # " [$dst], [$src], $size, [$mbar], $mask;"),
+ [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, -1, !if(has_ch, -1, 0))]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
}
-defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S<Int64Regs>;
-defm CP_ASYNC_BULK_G2S_SHARED32 : CP_ASYNC_BULK_G2S<Int32Regs>;
+defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S_INTR<has_ch = 0>;
+defm CP_ASYNC_BULK_G2S_CH : CP_ASYNC_BULK_G2S_INTR<has_ch = 1>;
-multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
- def NAME: NVPTXInst<(outs),
- (ins rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size),
- !strconcat(CpAsyncBulkStr<0, 0>.C2C, " [$dst], [$src], $size, [$mbar];"),
- [(int_nvvm_cp_async_bulk_shared_cta_to_cluster rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size)]>,
- Requires<[hasPTX<80>, hasSM<90>]>;
+def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs),
+ (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size),
+ CpAsyncBulkStr<0, 0>.C2C # " [$dst], [$src], $size, [$mbar];",
+ [(int_nvvm_cp_async_bulk_shared_cta_to_cluster addr:$dst, addr:$mbar, addr:$src, i32:$size)]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
+
+multiclass CP_ASYNC_BULK_PREFETCH_INTR<bit has_ch> {
+ def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
+ !if(has_ch,
+ "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;",
+ "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"),
+ [(int_nvvm_cp_async_bulk_prefetch_L2 addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
+ Requires<[hasPTX<80>, hasSM<90>]>;
}
-defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
-defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;
+defm CP_ASYNC_BULK_PREFETCH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 0>;
+defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>;
-//------------------------------
-// Bulk Copy Prefetch Functions
-//------------------------------
-def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
- (ins Int64Regs:$src, Int32Regs:$size),
- "cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
- Requires<[hasPTX<80>, hasSM<90>]>;
-
-def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
- (ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
- "cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
- Requires<[hasPTX<80>, hasSM<90>]>;
//-------------------------------------
// TMA Async Bulk Tensor Copy Functions
//-------------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
index d7f2a5df5547e..46a026313d971 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
@@ -23,10 +23,10 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3];
-; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
+; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_5];
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
-; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4;
; CHECK-PTX64-NEXT: ret;
@@ -42,48 +42,101 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_5];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
-; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
- tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 0, i1 0)
- tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 %ch, i1 0, i1 1)
- tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 0, i1 1, i1 0)
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 0, i1 0)
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 0, i1 1)
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 1, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 1, i1 1)
ret void
}
+; Tests to verify that the immediate values for "mc and ch" land correctly in PTX.
+; The values of 16 and 64 are arbitrary and do not have any significance.
+define void @cp_async_bulk_g2s_imm_mc_ch(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(7) %dst, i32 %size, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_g2s_imm_mc_ch(
+; CHECK-PTX64: {
+; CHECK-PTX64-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<6>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT: // %bb.0:
+; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_imm_mc_ch_param_0];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_imm_mc_ch_param_1];
+; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_imm_mc_ch_param_2];
+; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_imm_mc_ch_param_3];
+; CHECK-PTX64-NEXT: mov.b64 %rd4, 64;
+; CHECK-PTX64-NEXT: mov.b16 %rs1, 16;
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT: ld.param.b64 %rd5, [cp_async_bulk_g2s_imm_mc_ch_param_5];
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1;
+; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_g2s_imm_mc_ch_param_4];
+; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
+; CHECK-PTX64-NEXT: ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s_imm_mc_ch(
+; CHECK-PTX-SHARED32: {
+; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT: // %bb.0:
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_imm_mc_ch_param_0];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_imm_mc_ch_param_1];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_imm_mc_ch_param_2];
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_imm_mc_ch_param_3];
+; CHECK-PTX-SHARED32-NEXT: mov.b64 %rd2, 64;
+; CHECK-PTX-SHARED32-NEXT: mov.b16 %rs1, 16;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_imm_mc_ch_param_5];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1;
+; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_g2s_imm_mc_ch_param_4];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 16, i64 64, i1 1, i1 1)
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 16, i64 %ch, i1 1, i1 0)
+ tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 64, i1 0, i1 1)
+ ret void
+}
+
define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 %size, i64 %ch) {
; CHECK-PTX64-LABEL: cp_async_bulk_s2g(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
-; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
+; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2];
-; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3];
-; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
+; CHECK-PTX64-NEXT: mov.b64 %rd3, 64;
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3;
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_s2g_param_3];
+; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
+; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd4;
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>;
-; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2];
-; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3];
-; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
+; CHECK-PTX-SHARED32-NEXT: mov.b64 %rd2, 64;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2;
+; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3];
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
+; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd3;
; CHECK-PTX-SHARED32-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 64, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1)
ret void
@@ -122,16 +175,19 @@ define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
; CHECK-LABEL: cp_async_bulk_prefetch(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
-; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [cp_async_bulk_prefetch_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [cp_async_bulk_prefetch_param_1];
-; CHECK-NEXT: ld.param.b64 %rd2, [cp_async_bulk_prefetch_param_2];
+; CHECK-NEXT: mov.b64 %rd2, 64;
; CHECK-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
+; CHECK-NEXT: ld.param.b64 %rd3, [cp_async_bulk_prefetch_param_2];
+; CHECK-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd3;
; CHECK-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1;
; CHECK-NEXT: ret;
+ tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 64, i1 1)
tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
- tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
+ tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 0)
ret void
}
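A note on the flag convention the G2S tests above exercise: the last two
i1 arguments of llvm.nvvm.cp.async.bulk.global.to.shared.cluster are the
multicast and cache-hint flags. In the CP_ASYNC_BULK_G2S_INTR patterns,
each flag is matched as an immediate (-1 for true, 0 for false), so the
four variants are selected purely by pattern matching:

  (..., i1 0, i1 0) -> cp.async.bulk.shared::cluster.global...bytes
  (..., i1 0, i1 1) -> ...bytes.L2::cache_hint
  (..., i1 1, i1 0) -> ...bytes.multicast::cluster
  (..., i1 1, i1 1) -> ...bytes.multicast::cluster.L2::cache_hint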