[llvm] [NVPTX] Add intrinsics and codegen for tensormap.replace (PR #172458)
Srinivasa Ravi via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 19 04:50:46 PST 2025
https://github.com/Wolfram70 updated https://github.com/llvm/llvm-project/pull/172458
>From c7ba6360103ccec95a6533fd154bcb103b08eb21 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Thu, 11 Dec 2025 04:33:20 +0000
Subject: [PATCH 1/8] [NVPTX] Add tensormap.replace intrinsics
This change adds NVVM intrinsics and NVPTX codegen for the
tensormap.replace instructions.
Tests are added in `tensormap_replace.ll`, `tensormap_replace_sm_100a.ll`,
and `tensormap_replace_sm_103a.ll` and tested through `ptxas-13.0`.
PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-tensormap-replace
---
llvm/include/llvm/IR/IntrinsicsNVVM.td | 52 ++++
llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 6 +
llvm/lib/IR/NVVMIntrinsicUtils.cpp | 145 ++++++++++
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 91 ++++++
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 64 +++++
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 28 ++
llvm/test/CodeGen/NVPTX/tensormap_replace.ll | 263 ++++++++++++++++++
.../NVPTX/tensormap_replace_sm_100a.ll | 60 ++++
.../NVPTX/tensormap_replace_sm_103a.ll | 19 ++
9 files changed, 728 insertions(+)
create mode 100644 llvm/test/CodeGen/NVPTX/tensormap_replace.ll
create mode 100644 llvm/test/CodeGen/NVPTX/tensormap_replace_sm_100a.ll
create mode 100644 llvm/test/CodeGen/NVPTX/tensormap_replace_sm_103a.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index aab85c2a86373..a82af450b35b3 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -3312,4 +3312,56 @@ foreach sp = [0, 1] in {
}
}
+//
+// tensormap.replace intrinsics
+//
+
+let IntrProperties = [IntrArgMemOnly, IntrWriteMem, NoCapture<ArgIndex<0>>] in {
+ def int_nvvm_tensormap_replace_global_address :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i64_ty], []>;
+ def int_nvvm_tensormap_replace_rank :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty], []>;
+}
+
+let IntrProperties = [IntrArgMemOnly, ImmArg<ArgIndex<1>>, IntrWriteMem,
+ NoCapture<ArgIndex<0>>] in {
+ def int_nvvm_tensormap_replace_global_stride :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty, llvm_i64_ty], []>;
+ foreach tmap_field = ["box_dim", "global_dim", "element_stride"] in {
+ def int_nvvm_tensormap_replace_ # tmap_field :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty, llvm_i32_ty], []>;
+ }
+}
+
+def int_nvvm_tensormap_replace_elemtype :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWriteMem, ImmArg<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
+ Range<ArgIndex<1>, 0, 16>,
+ ArgInfo<ArgIndex<1>, [ArgName<"elemtype">,
+ ImmArgPrinter<"printTensormapElemType">]>]>;
+def int_nvvm_tensormap_replace_interleave_layout :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWriteMem, ImmArg<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
+ Range<ArgIndex<1>, 0, 3>,
+ ArgInfo<ArgIndex<1>, [ArgName<"interleave_layout">,
+ ImmArgPrinter<"printTensormapInterleaveLayout">]>]>;
+def int_nvvm_tensormap_replace_swizzle_mode :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWriteMem, ImmArg<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
+ Range<ArgIndex<1>, 0, 5>,
+ ArgInfo<ArgIndex<1>, [ArgName<"swizzle_mode">,
+ ImmArgPrinter<"printTensormapSwizzleMode">]>]>;
+def int_nvvm_tensormap_replace_swizzle_atomicity :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWriteMem, ImmArg<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
+ Range<ArgIndex<1>, 0, 4>,
+ ArgInfo<ArgIndex<1>, [ArgName<"swizzle_atomicity">,
+ ImmArgPrinter<"printTensormapSwizzleAtomicity">]>]>;
+def int_nvvm_tensormap_replace_fill_mode :
+ DefaultAttrsIntrinsic<[], [llvm_anyptr_ty, llvm_i32_ty],
+ [IntrArgMemOnly, IntrWriteMem, ImmArg<ArgIndex<1>>, NoCapture<ArgIndex<0>>,
+ Range<ArgIndex<1>, 0, 2>,
+ ArgInfo<ArgIndex<1>, [ArgName<"fill_mode">,
+ ImmArgPrinter<"printTensormapFillMode">]>]>;
+
} // let TargetPrefix = "nvvm"
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index 62f2a249b1357..a16051b8cdb01 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -63,6 +63,12 @@ void printTcgen05MMAKind(raw_ostream &OS, const Constant *ImmArgVal);
void printTcgen05CollectorUsageOp(raw_ostream &OS, const Constant *ImmArgVal);
+void printTensormapElemType(raw_ostream &OS, const Constant *ImmArgVal);
+void printTensormapInterleaveLayout(raw_ostream &OS, const Constant *ImmArgVal);
+void printTensormapSwizzleMode(raw_ostream &OS, const Constant *ImmArgVal);
+void printTensormapSwizzleAtomicity(raw_ostream &OS, const Constant *ImmArgVal);
+void printTensormapFillMode(raw_ostream &OS, const Constant *ImmArgVal);
+
inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) {
switch (IntrinsicID) {
case Intrinsic::nvvm_f2i_rm_ftz:
diff --git a/llvm/lib/IR/NVVMIntrinsicUtils.cpp b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
index 4389fa38ad3af..4446792b89988 100644
--- a/llvm/lib/IR/NVVMIntrinsicUtils.cpp
+++ b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
@@ -59,3 +59,148 @@ void nvvm::printTcgen05CollectorUsageOp(raw_ostream &OS,
llvm_unreachable("printTcgen05CollectorUsageOp called with invalid value for "
"immediate argument");
}
+
+void nvvm::printTensormapElemType(raw_ostream &OS, const Constant *ImmArgVal) {
+ if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
+ uint64_t Val = CI->getZExtValue();
+ switch (Val) {
+ case 0:
+ OS << "u8";
+ return;
+ case 1:
+ OS << "u16";
+ return;
+ case 2:
+ OS << "u32";
+ return;
+ case 3:
+ OS << "s32";
+ return;
+ case 4:
+ OS << "u64";
+ return;
+ case 5:
+ OS << "s64";
+ return;
+ case 6:
+ OS << "f16";
+ return;
+ case 7:
+ OS << "f32";
+ return;
+ case 8:
+ OS << "f32.ftz";
+ return;
+ case 9:
+ OS << "f64";
+ return;
+ case 10:
+ OS << "bf16";
+ return;
+ case 11:
+ OS << "tf32";
+ return;
+ case 12:
+ OS << "tf32.ftz";
+ return;
+ case 13:
+ OS << "b4x16";
+ return;
+ case 14:
+ OS << "b4x16_p64";
+ return;
+ case 15:
+ OS << "b6x16_p32";
+ return;
+ }
+ }
+ llvm_unreachable("printTensormapElemType called with invalid value for "
+ "immediate argument");
+}
+
+void nvvm::printTensormapInterleaveLayout(raw_ostream &OS,
+ const Constant *ImmArgVal) {
+ if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
+ uint64_t Val = CI->getZExtValue();
+ switch (Val) {
+ case 0:
+ OS << "No interleave";
+ return;
+ case 1:
+ OS << "16B interleave";
+ return;
+ case 2:
+ OS << "32B interleave";
+ return;
+ }
+ }
+ llvm_unreachable(
+ "printTensormapInterleaveLayout called with invalid value for "
+ "immediate argument");
+}
+
+void nvvm::printTensormapSwizzleMode(raw_ostream &OS,
+ const Constant *ImmArgVal) {
+ if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
+ uint64_t Val = CI->getZExtValue();
+ switch (Val) {
+ case 0:
+ OS << "No swizzling";
+ return;
+ case 1:
+ OS << "32B swizzling";
+ return;
+ case 2:
+ OS << "64B swizzling";
+ return;
+ case 3:
+ OS << "128B swizzling";
+ return;
+ case 4:
+ OS << "96B swizzling";
+ return;
+ }
+ }
+ llvm_unreachable("printTensormapSwizzleMode called with invalid value for "
+ "immediate argument");
+}
+
+void nvvm::printTensormapSwizzleAtomicity(raw_ostream &OS,
+ const Constant *ImmArgVal) {
+ if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
+ uint64_t Val = CI->getZExtValue();
+ switch (Val) {
+ case 0:
+ OS << "16B";
+ return;
+ case 1:
+ OS << "32B";
+ return;
+ case 2:
+ OS << "32B + 8B flip";
+ return;
+ case 3:
+ OS << "64B";
+ return;
+ }
+ }
+ llvm_unreachable(
+ "printTensormapSwizzleAtomicity called with invalid value for "
+ "immediate argument");
+}
+
+void nvvm::printTensormapFillMode(raw_ostream &OS, const Constant *ImmArgVal) {
+ if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
+ uint64_t Val = CI->getZExtValue();
+ switch (Val) {
+ case 0:
+ OS << "Zero fill";
+ return;
+ case 1:
+ OS << "OOB-NaN fill";
+ return;
+ }
+ }
+ llvm_unreachable("printTensormapFillMode called with invalid value for "
+ "immediate argument");
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 92f3865818530..a8d5da3407a67 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2746,6 +2746,64 @@ lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
return {{BuildVector, Chain}};
}
+static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG) {
+ SDNode *N = Op.getNode();
+ SDLoc DL(N);
+ unsigned Val = N->getConstantOperandVal(3);
+
+ if (!DAG.getSubtarget<NVPTXSubtarget>().hasTensormapReplaceElemtypeSupport(
+ Val)) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+
+ unsigned AS = 0;
+ if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N)) {
+ AS = MemN->getAddressSpace();
+ }
+ Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
+ Module *M = DAG.getMachineFunction().getFunction().getParent();
+
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ Fn,
+ "Intrinsic " +
+ Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
+ " with elemtype " + Twine(Val) +
+ " is not supported on the given target.",
+ DL.getDebugLoc()));
+ return Op.getOperand(0);
+ }
+
+ return Op;
+}
+
+static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG) {
+ SDNode *N = Op.getNode();
+ SDLoc DL(N);
+ unsigned Val = N->getConstantOperandVal(3);
+
+ if (!DAG.getSubtarget<NVPTXSubtarget>().hasTensormapReplaceSwizzleModeSupport(
+ Val)) {
+ const Function &Fn = DAG.getMachineFunction().getFunction();
+
+ unsigned AS = 0;
+ if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N)) {
+ AS = MemN->getAddressSpace();
+ }
+ Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
+ Module *M = DAG.getMachineFunction().getFunction().getParent();
+
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ Fn,
+ "Intrinsic " +
+ Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
+ " with swizzle mode " + Twine(Val) +
+ " is not supported on the given target.",
+ DL.getDebugLoc()));
+ return Op.getOperand(0);
+ }
+
+ return Op;
+}
+
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
SDValue Intrin = N->getOperand(1);
@@ -2822,6 +2880,10 @@ static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG) {
case Intrinsic::
nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
return LowerTcgen05MMADisableOutputLane(Op, DAG);
+ case Intrinsic::nvvm_tensormap_replace_elemtype:
+ return lowerTensormapReplaceElemtype(Op, DAG);
+ case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
+ return lowerTensormapReplaceSwizzleMode(Op, DAG);
}
return Op;
}
@@ -4526,6 +4588,35 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::nvvm_tensormap_replace_global_address:
+  case Intrinsic::nvvm_tensormap_replace_global_stride: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::i64;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align.reset();
+ return true;
+ }
+
+ case Intrinsic::nvvm_tensormap_replace_rank:
+ case Intrinsic::nvvm_tensormap_replace_box_dim:
+ case Intrinsic::nvvm_tensormap_replace_global_dim:
+ case Intrinsic::nvvm_tensormap_replace_element_stride:
+ case Intrinsic::nvvm_tensormap_replace_elemtype:
+ case Intrinsic::nvvm_tensormap_replace_interleave_layout:
+ case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
+ case Intrinsic::nvvm_tensormap_replace_swizzle_atomicity:
+ case Intrinsic::nvvm_tensormap_replace_fill_mode: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ Info.memVT = MVT::i32;
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.flags = MachineMemOperand::MOStore;
+ Info.align.reset();
+ return true;
+ }
+
case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 817006c367379..b15d1210ded32 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6233,3 +6233,67 @@ foreach sp = [0, 1] in {
}
}
+//
+// tensormap.replace Instructions
+//
+
+class TensormapReplaceInst_2<string state_space, string field_name,
+ string regclass_name, NVPTXRegClass val_RC, ValueType ValTy, Intrinsic Intrin,
+ code predicate> :
+ BasicNVPTXInst<(outs),
+ (ins ADDR:$addr, val_RC:$val),
+ "tensormap.replace.tile." # field_name # "." # state_space # ".b1024." # regclass_name,
+ [(PatFrag<(ops node:$addr, node:$val),
+ (Intrin node:$addr, node:$val), predicate>
+ addr:$addr, ValTy:$val)]>;
+
+class TensormapReplaceInst_3<string state_space, string field_name,
+ string regclass_name, NVPTXRegClass val_RC, ValueType ValTy, Intrinsic Intrin,
+ code predicate> :
+ BasicNVPTXInst<(outs),
+ (ins ADDR:$addr, B32:$ord, val_RC:$val),
+ "tensormap.replace.tile." # field_name # "." # state_space # ".b1024." # regclass_name,
+ [(PatFrag<(ops node:$addr, node:$ord, node:$val),
+ (Intrin node:$addr, node:$ord, node:$val), predicate>
+ addr:$addr, i32:$ord, ValTy:$val)]>;
+
+foreach state_space = ["GLOBAL", "SHARED_CTA"] in {
+ defvar pred = !if(!eq(state_space, "GLOBAL"), AS_match.global, AS_match.shared);
+ defvar ss_ptx = !tolower(!subst("_", "::", state_space));
+ let Predicates = [callSubtarget<"hasTensormapReplaceSupport">] in {
+ def TMAP_REPLACE_TILE_GLOBAL_ADDRESS_ # state_space :
+ TensormapReplaceInst_2<ss_ptx, "global_address", "b64", B64, i64,
+ int_nvvm_tensormap_replace_global_address, pred>;
+
+ foreach field_name = ["INTERLEAVE_LAYOUT", "FILL_MODE", "RANK"] in {
+ defvar intrin = !cast<Intrinsic>("int_nvvm_tensormap_replace_" # !tolower(field_name));
+ def TMAP_REPLACE_TILE_ # field_name # _ # state_space :
+ TensormapReplaceInst_2<ss_ptx, !tolower(field_name), "b32", B32, i32,
+ intrin, pred>;
+ } // field_name
+
+ def TMAP_REPLACE_TILE_GLOBAL_STRIDE_ # state_space :
+ TensormapReplaceInst_3<ss_ptx, "global_stride", "b64", B64, i64,
+ int_nvvm_tensormap_replace_global_stride, pred>;
+
+ foreach field_name = ["BOX_DIM", "GLOBAL_DIM", "ELEMENT_STRIDE"] in {
+ defvar intrin = !cast<Intrinsic>("int_nvvm_tensormap_replace_" # !tolower(field_name));
+ def TMAP_REPLACE_TILE_ # field_name # _ # state_space :
+ TensormapReplaceInst_3<ss_ptx, !tolower(field_name), "b32", B32, i32,
+ intrin, pred>;
+ } // field_name
+ } // hasTensormapReplaceSupport
+
+ def TMAP_REPLACE_TILE_ELEMTYPE_ # state_space :
+ TensormapReplaceInst_2<ss_ptx, "elemtype", "b32", B32, i32,
+ int_nvvm_tensormap_replace_elemtype, pred>;
+
+ def TMAP_REPLACE_SWIZZLE_ATOMICITY_ # state_space :
+ TensormapReplaceInst_2<ss_ptx, "swizzle_atomicity", "b32", B32, i32,
+ int_nvvm_tensormap_replace_swizzle_atomicity, pred>,
+ Requires<[callSubtarget<"hasTensormapReplaceSwizzleAtomicitySupport">]>;
+
+ def TMAP_REPLACE_SWIZZLE_MODE_ # state_space :
+ TensormapReplaceInst_2<ss_ptx, "swizzle_mode", "b32", B32, i32,
+ int_nvvm_tensormap_replace_swizzle_mode, pred>;
+} // state_space
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 6f6057b3689e6..46cdaadeae6b9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -202,6 +202,34 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
hasPTXWithAccelSMs(86, {100, 101, 120});
}
+ bool hasTensormapReplaceSupport() const {
+ return hasPTXWithFamilySMs(90, {90, 100, 110, 120}) ||
+ hasPTXWithFamilySMs(88, {90, 100, 101, 120}) ||
+ hasPTXWithAccelSMs(83, {90, 100, 101, 120});
+ }
+
+ bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
+ if (value >= 13)
+ return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
+ hasPTXWithFamilySMs(88, {100, 101, 120}) ||
+ hasPTXWithAccelSMs(87, {100, 101, 120});
+
+ return hasTensormapReplaceSupport();
+ }
+
+ bool hasTensormapReplaceSwizzleAtomicitySupport() const {
+ return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
+ hasPTXWithFamilySMs(88, {100, 101, 120}) ||
+ hasPTXWithAccelSMs(87, {100, 101, 120});
+ }
+
+ bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
+ if (value == 4)
+ return hasPTXWithAccelSMs(88, {103});
+
+ return hasTensormapReplaceSupport();
+ }
+
// Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
// terminates a basic block. Instead, it would assume that control flow
// continued to the next instruction. The next instruction could be in the
diff --git a/llvm/test/CodeGen/NVPTX/tensormap_replace.ll b/llvm/test/CodeGen/NVPTX/tensormap_replace.ll
new file mode 100644
index 0000000000000..e1be5f9adbce7
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tensormap_replace.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx83 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90a -mattr=+ptx83 | %ptxas-verify -arch=sm_90a %}
+
+define void @tensormap_replace_global_address(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i64 %value) {
+; CHECK-LABEL: tensormap_replace_global_address(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_address_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_address_param_2];
+; CHECK-NEXT: tensormap.replace.tile.global_address.global.b1024.b64 [%rd1], %rd2;
+; CHECK-NEXT: ld.param.b64 %rd3, [tensormap_replace_global_address_param_1];
+; CHECK-NEXT: tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%rd3], %rd2;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.global.address.p1(ptr addrspace(1) %global_addr, i64 %value)
+ call void @llvm.nvvm.tensormap.replace.global.address.p3(ptr addrspace(3) %shared_addr, i64 %value)
+ ret void
+}
+
+define void @tensormap_replace_rank(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
+; CHECK-LABEL: tensormap_replace_rank(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_rank_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_rank_param_2];
+; CHECK-NEXT: tensormap.replace.tile.rank.global.b1024.b32 [%rd1], %r1;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_rank_param_1];
+; CHECK-NEXT: tensormap.replace.tile.rank.shared::cta.b1024.b32 [%rd2], %r1;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.rank.p1(ptr addrspace(1) %global_addr, i32 %value)
+ call void @llvm.nvvm.tensormap.replace.rank.p3(ptr addrspace(3) %shared_addr, i32 %value)
+ ret void
+}
+
+define void @tensormap_replace_box_dim(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
+; CHECK-LABEL: tensormap_replace_box_dim(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_box_dim_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_box_dim_param_2];
+; CHECK-NEXT: tensormap.replace.tile.box_dim.global.b1024.b32 [%rd1], 0, %r1;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_box_dim_param_1];
+; CHECK-NEXT: tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%rd2], 0, %r1;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.box.dim.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value)
+ call void @llvm.nvvm.tensormap.replace.box.dim.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value)
+ ret void
+}
+
+define void @tensormap_replace_global_dim(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
+; CHECK-LABEL: tensormap_replace_global_dim(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_dim_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_global_dim_param_2];
+; CHECK-NEXT: tensormap.replace.tile.global_dim.global.b1024.b32 [%rd1], 0, %r1;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_dim_param_1];
+; CHECK-NEXT: tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%rd2], 0, %r1;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.global.dim.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value)
+ call void @llvm.nvvm.tensormap.replace.global.dim.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value)
+ ret void
+}
+
+define void @tensormap_replace_global_stride(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i64 %value) {
+; CHECK-LABEL: tensormap_replace_global_stride(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_global_stride_param_0];
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_global_stride_param_2];
+; CHECK-NEXT: tensormap.replace.tile.global_stride.global.b1024.b64 [%rd1], 0, %rd2;
+; CHECK-NEXT: ld.param.b64 %rd3, [tensormap_replace_global_stride_param_1];
+; CHECK-NEXT: tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%rd3], 0, %rd2;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.global.stride.p1(ptr addrspace(1) %global_addr, i32 0, i64 %value)
+ call void @llvm.nvvm.tensormap.replace.global.stride.p3(ptr addrspace(3) %shared_addr, i32 0, i64 %value)
+ ret void
+}
+
+define void @tensormap_replace_element_stride(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr, i32 %value) {
+; CHECK-LABEL: tensormap_replace_element_stride(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_element_stride_param_0];
+; CHECK-NEXT: ld.param.b32 %r1, [tensormap_replace_element_stride_param_2];
+; CHECK-NEXT: tensormap.replace.tile.element_stride.global.b1024.b32 [%rd1], 0, %r1;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_element_stride_param_1];
+; CHECK-NEXT: tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%rd2], 0, %r1;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.element.stride.p1(ptr addrspace(1) %global_addr, i32 0, i32 %value)
+ call void @llvm.nvvm.tensormap.replace.element.stride.p3(ptr addrspace(3) %shared_addr, i32 0, i32 %value)
+ ret void
+}
+
+define void @tensormap_replace_elemtype(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
+; CHECK-LABEL: tensormap_replace_elemtype(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_elemtype_param_0];
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 0;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_elemtype_param_1];
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 0;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 1;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 1;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 2;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 2;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 3;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 3;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 4;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 4;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 5;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 5;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 6;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 6;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 7;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 7;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 8;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 8;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 9;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 9;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 10;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 10;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 11;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 11;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 12;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 12;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u8 */ i32 0)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u8 */ i32 0)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u16 */ i32 1)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u16 */ i32 1)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u32 */ i32 2)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u32 */ i32 2)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=s32 */ i32 3)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=s32 */ i32 3)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=u64 */ i32 4)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=u64 */ i32 4)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=s64 */ i32 5)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=s64 */ i32 5)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f16 */ i32 6)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f16 */ i32 6)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f32 */ i32 7)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f32 */ i32 7)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f32.ftz */ i32 8)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f32.ftz */ i32 8)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=f64 */ i32 9)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=f64 */ i32 9)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=bf16 */ i32 10)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=bf16 */ i32 10)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=tf32 */ i32 11)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=tf32 */ i32 11)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=tf32.ftz */ i32 12)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=tf32.ftz */ i32 12)
+ ret void
+}
+
+define void @tensormap_replace_interleave_layout(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
+; CHECK-LABEL: tensormap_replace_interleave_layout(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_interleave_layout_param_0];
+; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 0;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_interleave_layout_param_1];
+; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 0;
+; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 1;
+; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 1;
+; CHECK-NEXT: tensormap.replace.tile.interleave_layout.global.b1024.b32 [%rd1], 2;
+; CHECK-NEXT: tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%rd2], 2;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=No interleave */ i32 0)
+ call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=No interleave */ i32 0)
+
+ call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=16B interleave */ i32 1)
+ call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=16B interleave */ i32 1)
+
+ call void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %global_addr, /* interleave_layout=32B interleave */ i32 2)
+ call void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %shared_addr, /* interleave_layout=32B interleave */ i32 2)
+ ret void
+}
+
+define void @tensormap_replace_swizzle_mode(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
+; CHECK-LABEL: tensormap_replace_swizzle_mode(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_swizzle_mode_param_0];
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 0;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_swizzle_mode_param_1];
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 0;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 1;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 1;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 2;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 2;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 3;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 3;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=No swizzling */ i32 0)
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=No swizzling */ i32 0)
+
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=32B swizzling */ i32 1)
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=32B swizzling */ i32 1)
+
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=64B swizzling */ i32 2)
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=64B swizzling */ i32 2)
+
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=128B swizzling */ i32 3)
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=128B swizzling */ i32 3)
+ ret void
+}
+
+define void @tensormap_replace_fill_mode(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
+; CHECK-LABEL: tensormap_replace_fill_mode(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_fill_mode_param_0];
+; CHECK-NEXT: tensormap.replace.tile.fill_mode.global.b1024.b32 [%rd1], 0;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_fill_mode_param_1];
+; CHECK-NEXT: tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%rd2], 0;
+; CHECK-NEXT: tensormap.replace.tile.fill_mode.global.b1024.b32 [%rd1], 1;
+; CHECK-NEXT: tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%rd2], 1;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.fill.mode.p1(ptr addrspace(1) %global_addr, /* fill_mode=Zero fill */ i32 0)
+ call void @llvm.nvvm.tensormap.replace.fill.mode.p3(ptr addrspace(3) %shared_addr, /* fill_mode=Zero fill */ i32 0)
+
+ call void @llvm.nvvm.tensormap.replace.fill.mode.p1(ptr addrspace(1) %global_addr, /* fill_mode=OOB-NaN fill */ i32 1)
+ call void @llvm.nvvm.tensormap.replace.fill.mode.p3(ptr addrspace(3) %shared_addr, /* fill_mode=OOB-NaN fill */ i32 1)
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/tensormap_replace_sm_100a.ll b/llvm/test/CodeGen/NVPTX/tensormap_replace_sm_100a.ll
new file mode 100644
index 0000000000000..de1ef9f04f509
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tensormap_replace_sm_100a.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx87 | %ptxas-verify -arch=sm_100a %}
+
+define void @tensormap_replace_swizzle_atomicity(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
+; CHECK-LABEL: tensormap_replace_swizzle_atomicity(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_swizzle_atomicity_param_0];
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [%rd1], 0;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_swizzle_atomicity_param_1];
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [%rd2], 0;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [%rd1], 1;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [%rd2], 1;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [%rd1], 2;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [%rd2], 2;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.global.b1024.b32 [%rd1], 3;
+; CHECK-NEXT: tensormap.replace.tile.swizzle_atomicity.shared::cta.b1024.b32 [%rd2], 3;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p1(ptr addrspace(1) %global_addr, /* swizzle_atomicity=16B */ i32 0)
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p3(ptr addrspace(3) %shared_addr, /* swizzle_atomicity=16B */ i32 0)
+
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p1(ptr addrspace(1) %global_addr, /* swizzle_atomicity=32B */ i32 1)
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p3(ptr addrspace(3) %shared_addr, /* swizzle_atomicity=32B */ i32 1)
+
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p1(ptr addrspace(1) %global_addr, /* swizzle_atomicity=32B + 8B flip */ i32 2)
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p3(ptr addrspace(3) %shared_addr, /* swizzle_atomicity=32B + 8B flip */ i32 2)
+
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p1(ptr addrspace(1) %global_addr, /* swizzle_atomicity=64B */ i32 3)
+ call void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p3(ptr addrspace(3) %shared_addr, /* swizzle_atomicity=64B */ i32 3)
+ ret void
+}
+
+define void @tensormap_replace_elemtype(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
+; CHECK-LABEL: tensormap_replace_elemtype(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_elemtype_param_0];
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 13;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_elemtype_param_1];
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 13;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 14;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 14;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.global.b1024.b32 [%rd1], 15;
+; CHECK-NEXT: tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%rd2], 15;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=b4x16 */ i32 13)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=b4x16 */ i32 13)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=b4x16_p64 */ i32 14)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=b4x16_p64 */ i32 14)
+
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %global_addr, /* elemtype=b6x16_p32 */ i32 15)
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %shared_addr, /* elemtype=b6x16_p32 */ i32 15)
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/tensormap_replace_sm_103a.ll b/llvm/test/CodeGen/NVPTX/tensormap_replace_sm_103a.ll
new file mode 100644
index 0000000000000..197bbdee692c2
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tensormap_replace_sm_103a.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+
+define void @tensormap_replace_swizzle_mode(ptr addrspace(1) %global_addr, ptr addrspace(3) %shared_addr) {
+; CHECK-LABEL: tensormap_replace_swizzle_mode(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [tensormap_replace_swizzle_mode_param_0];
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%rd1], 4;
+; CHECK-NEXT: ld.param.b64 %rd2, [tensormap_replace_swizzle_mode_param_1];
+; CHECK-NEXT: tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%rd2], 4;
+; CHECK-NEXT: ret;
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %global_addr, /* swizzle_mode=96B swizzling */ i32 4)
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %shared_addr, /* swizzle_mode=96B swizzling */ i32 4)
+ ret void
+}
>From 2081fc2b2aa32cf953ab22ec87aff2c6e77d693a Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Tue, 16 Dec 2025 09:28:36 +0000
Subject: [PATCH 2/8] use enums for immediate field values
---
llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 45 ++++++++++++
llvm/lib/IR/NVVMIntrinsicUtils.cpp | 90 ++++++-----------------
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 5 +-
3 files changed, 72 insertions(+), 68 deletions(-)
diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
index a16051b8cdb01..067290e57245a 100644
--- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
+++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h
@@ -59,6 +59,51 @@ enum class Tcgen05CollectorUsageOp : uint8_t {
USE = 3,
};
+enum class TensormapElemType : uint8_t {
+ U8 = 0,
+ U16 = 1,
+ U32 = 2,
+ S32 = 3,
+ U64 = 4,
+ S64 = 5,
+ F16 = 6,
+ F32 = 7,
+ F32_FTZ = 8,
+ F64 = 9,
+ BF16 = 10,
+ TF32 = 11,
+ TF32_FTZ = 12,
+ B4x16 = 13,
+ B4x16_p64 = 14,
+ B6x16_p32 = 15,
+};
+
+enum class TensormapInterleaveLayout : uint8_t {
+ NO_INTERLEAVE = 0,
+ INTERLEAVE_16B = 1,
+ INTERLEAVE_32B = 2,
+};
+
+enum class TensormapSwizzleMode : uint8_t {
+ NO_SWIZZLE = 0,
+ SWIZZLE_32B = 1,
+ SWIZZLE_64B = 2,
+ SWIZZLE_128B = 3,
+ SWIZZLE_96B = 4,
+};
+
+enum class TensormapSwizzleAtomicity : uint8_t {
+ SWIZZLE_ATOMICITY_16B = 0,
+ SWIZZLE_ATOMICITY_32B = 1,
+ SWIZZLE_ATOMICITY_32B_FLIP_8B = 2,
+ SWIZZLE_ATOMICITY_64B = 3,
+};
+
+enum class TensormapFillMode : uint8_t {
+ ZERO_FILL = 0,
+ OOB_NAN_FILL = 1,
+};
+
void printTcgen05MMAKind(raw_ostream &OS, const Constant *ImmArgVal);
void printTcgen05CollectorUsageOp(raw_ostream &OS, const Constant *ImmArgVal);
diff --git a/llvm/lib/IR/NVVMIntrinsicUtils.cpp b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
index 4446792b89988..2c939ff0ca08a 100644
--- a/llvm/lib/IR/NVVMIntrinsicUtils.cpp
+++ b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
#include "llvm/IR/NVVMIntrinsicUtils.h"
using namespace llvm;
@@ -61,56 +62,14 @@ void nvvm::printTcgen05CollectorUsageOp(raw_ostream &OS,
}
void nvvm::printTensormapElemType(raw_ostream &OS, const Constant *ImmArgVal) {
+ static constexpr StringRef TensormapElemTypes[] = {
+ "u8", "u16", "u32", "s32", "u64", "s64",
+ "f16", "f32", "f32.ftz", "f64", "bf16", "tf32",
+ "tf32.ftz", "b4x16", "b4x16_p64", "b6x16_p32"};
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- switch (Val) {
- case 0:
- OS << "u8";
- return;
- case 1:
- OS << "u16";
- return;
- case 2:
- OS << "u32";
- return;
- case 3:
- OS << "s32";
- return;
- case 4:
- OS << "u64";
- return;
- case 5:
- OS << "s64";
- return;
- case 6:
- OS << "f16";
- return;
- case 7:
- OS << "f32";
- return;
- case 8:
- OS << "f32.ftz";
- return;
- case 9:
- OS << "f64";
- return;
- case 10:
- OS << "bf16";
- return;
- case 11:
- OS << "tf32";
- return;
- case 12:
- OS << "tf32.ftz";
- return;
- case 13:
- OS << "b4x16";
- return;
- case 14:
- OS << "b4x16_p64";
- return;
- case 15:
- OS << "b6x16_p32";
+ if (Val <= static_cast<uint64_t>(nvvm::TensormapElemType::B6x16_p32)) {
+ OS << TensormapElemTypes[Val];
return;
}
}
@@ -122,14 +81,14 @@ void nvvm::printTensormapInterleaveLayout(raw_ostream &OS,
const Constant *ImmArgVal) {
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- switch (Val) {
- case 0:
+ switch (static_cast<TensormapInterleaveLayout>(Val)) {
+ case TensormapInterleaveLayout::NO_INTERLEAVE:
OS << "No interleave";
return;
- case 1:
+ case TensormapInterleaveLayout::INTERLEAVE_16B:
OS << "16B interleave";
return;
- case 2:
+ case TensormapInterleaveLayout::INTERLEAVE_32B:
OS << "32B interleave";
return;
}
@@ -143,20 +102,20 @@ void nvvm::printTensormapSwizzleMode(raw_ostream &OS,
const Constant *ImmArgVal) {
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- switch (Val) {
- case 0:
+ switch (static_cast<TensormapSwizzleMode>(Val)) {
+ case TensormapSwizzleMode::NO_SWIZZLE:
OS << "No swizzling";
return;
- case 1:
+ case TensormapSwizzleMode::SWIZZLE_32B:
OS << "32B swizzling";
return;
- case 2:
+ case TensormapSwizzleMode::SWIZZLE_64B:
OS << "64B swizzling";
return;
- case 3:
+ case TensormapSwizzleMode::SWIZZLE_128B:
OS << "128B swizzling";
return;
- case 4:
+ case TensormapSwizzleMode::SWIZZLE_96B:
OS << "96B swizzling";
return;
}
@@ -169,17 +128,17 @@ void nvvm::printTensormapSwizzleAtomicity(raw_ostream &OS,
const Constant *ImmArgVal) {
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- switch (Val) {
- case 0:
+ switch (static_cast<TensormapSwizzleAtomicity>(Val)) {
+ case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_16B:
OS << "16B";
return;
- case 1:
+ case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_32B:
OS << "32B";
return;
- case 2:
+ case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_32B_FLIP_8B:
OS << "32B + 8B flip";
return;
- case 3:
+ case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_64B:
OS << "64B";
return;
}
@@ -192,11 +151,10 @@ void nvvm::printTensormapSwizzleAtomicity(raw_ostream &OS,
void nvvm::printTensormapFillMode(raw_ostream &OS, const Constant *ImmArgVal) {
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- switch (Val) {
- case 0:
+ if (Val == static_cast<uint64_t>(TensormapFillMode::ZERO_FILL)) {
OS << "Zero fill";
return;
- case 1:
+ } else if (Val == static_cast<uint64_t>(TensormapFillMode::OOB_NAN_FILL)) {
OS << "OOB-NaN fill";
return;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 46cdaadeae6b9..ccf2be1835722 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -20,6 +20,7 @@
#include "NVPTXRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/NVVMIntrinsicUtils.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include <string>
@@ -209,7 +210,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
}
bool hasTensormapReplaceElemtypeSupport(unsigned value) const {
- if (value >= 13)
+ if (value >= static_cast<unsigned>(nvvm::TensormapElemType::B4x16))
return hasPTXWithFamilySMs(90, {100, 110, 120}) ||
hasPTXWithFamilySMs(88, {100, 101, 120}) ||
hasPTXWithAccelSMs(87, {100, 101, 120});
@@ -224,7 +225,7 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
}
bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const {
- if (value == 4)
+ if (value == static_cast<unsigned>(nvvm::TensormapSwizzleMode::SWIZZLE_96B))
return hasPTXWithAccelSMs(88, {103});
return hasTensormapReplaceSupport();
>From 04a0f2838092ee927ba23b9d4dcc4150d2220f41 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 17 Dec 2025 08:10:43 +0000
Subject: [PATCH 3/8] add docs
---
llvm/docs/NVPTXUsage.rst | 289 +++++++++++++++++++++++++++++++++++++++
1 file changed, 289 insertions(+)
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 5f7fb00889655..08308771de7e1 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -2192,6 +2192,295 @@ The ``griddepcontrol`` intrinsics allows the dependent grids and prerequisite gr
For more information, refer
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-griddepcontrol>`__.
+Tensormap Replace Intrinsics
+----------------------------
+
+These intrinsics modify the fields of the tensor-map object in ``tile`` mode at
+the location specified by the address operand ``%addr``.
+
+For more information, refer to the
+`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-tensormap-replace>`__.
+
+'``llvm.nvvm.tensormap.replace.global.address``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.global.address.p1(ptr addrspace(1) %addr, i64 %new_value)
+ declare void @llvm.nvvm.tensormap.replace.global.address.p3(ptr addrspace(3) %addr, i64 %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.global.address.*``' intrinsics replace the
+``global_address`` field of the tensor-map object with ``%new_value``.
+
+'``llvm.nvvm.tensormap.replace.rank``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.rank.p1(ptr addrspace(1) %addr, i32 %new_value)
+ declare void @llvm.nvvm.tensormap.replace.rank.p3(ptr addrspace(3) %addr, i32 %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.rank.*``' intrinsics replace the ``rank``
+field of the tensor-map object with ``%new_value`` which must be one less than
+the desired tensor rank as this field uses zero-based numbering.
+
+'``llvm.nvvm.tensormap.replace.global.stride``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.global.stride.p1(ptr addrspace(1) %addr, i32 immarg %ord, i64 %new_value)
+ declare void @llvm.nvvm.tensormap.replace.global.stride.p3(ptr addrspace(3) %addr, i32 immarg %ord, i64 %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.global.stride.*``' intrinsics replace the
+``%ord``-th element of the ``global_stride`` field of the tensor-map object
+with ``%new_value``.
+
+'``llvm.nvvm.tensormap.replace.element.stride``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.element.stride.p1(ptr addrspace(1) %addr, i32 immarg %ord, i32 %new_value)
+ declare void @llvm.nvvm.tensormap.replace.element.stride.p3(ptr addrspace(3) %addr, i32 immarg %ord, i32 %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.element.stride.*``' intrinsics replace the
+``%ord``-th element of the ``element_stride`` field of the tensor-map object
+with ``%new_value``.
+
+'``llvm.nvvm.tensormap.replace.global.dim``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.global.dim.p1(ptr addrspace(1) %addr, i32 immarg %ord, i32 %new_value)
+ declare void @llvm.nvvm.tensormap.replace.global.dim.p3(ptr addrspace(3) %addr, i32 immarg %ord, i32 %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.global.dim.*``' intrinsics replace the
+``%ord``-th element of the ``global_dim`` field of the tensor-map object
+with ``%new_value``.
+
+'``llvm.nvvm.tensormap.replace.box.dim``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.box.dim.p1(ptr addrspace(1) %addr, i32 immarg %ord, i32 %new_value)
+ declare void @llvm.nvvm.tensormap.replace.box.dim.p3(ptr addrspace(3) %addr, i32 immarg %ord, i32 %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.box.dim.*``' intrinsics replace the
+``%ord``-th element of the ``box_dim`` field of the tensor-map object with
+``%new_value``.
+
+'``llvm.nvvm.tensormap.replace.elemtype``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %addr, i32 immarg %new_value)
+ declare void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %addr, i32 immarg %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.elemtype.*``' intrinsics replace the
+``elemtype`` field of the tensor-map object with the type specified by
+``%new_value``.
+
+Semantics:
+""""""""""
+
+The following table shows the mapping of ``%new_value`` to the actual element
+type:
+
+ ============================ =====
+ Element Type Value
+ ============================ =====
+ ``u8`` 0
+ ``u16`` 1
+ ``u32`` 2
+ ``s32`` 3
+ ``u64`` 4
+ ``s64`` 5
+ ``f16`` 6
+ ``f32`` 7
+ ``f32.ftz`` 8
+ ``f64`` 9
+ ``bf16`` 10
+ ``tf32`` 11
+ ``tf32.ftz`` 12
+ ``b4x16`` 13
+ ``b4x16_p64`` 14
+ ``b6x16_p32`` or ``b6p2x16`` 15
+ ============================ =====
+
+'``llvm.nvvm.tensormap.replace.interleave.layout``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.interleave.layout.p1(ptr addrspace(1) %addr, i32 immarg %new_value)
+ declare void @llvm.nvvm.tensormap.replace.interleave.layout.p3(ptr addrspace(3) %addr, i32 immarg %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.interleave.layout.*``' intrinsics replace
+the ``interleave_layout`` field of the tensor-map object with the layout
+specified by ``%new_value``.
+
+Semantics:
+""""""""""
+
+The following table shows the mapping of ``%new_value`` to the actual layout:
+
+ ================== =====
+ Interleave Layout Value
+ ================== =====
+ ``No interleave`` 0
+ ``16B interleave`` 1
+ ``32B interleave`` 2
+ ================== =====
+
+'``llvm.nvvm.tensormap.replace.swizzle.mode``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %addr, i32 immarg %new_value)
+ declare void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %addr, i32 immarg %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.swizzle.mode.*``' intrinsics replace the
+``swizzle_mode`` field of the tensor-map object with the swizzle mode specified
+by ``%new_value``.
+
+Semantics:
+""""""""""
+
+The following table shows the mapping of ``%new_value`` to the actual swizzle
+mode:
+
+ ================ =====
+ Swizzle Mode Value
+ ================ =====
+ ``No swizzle`` 0
+ ``32B swizzle`` 1
+ ``64B swizzle`` 2
+ ``128B swizzle`` 3
+ ``96B swizzle`` 4
+ ================ =====
+
+'``llvm.nvvm.tensormap.replace.swizzle.atomicity``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p1(ptr addrspace(1) %addr, i32 immarg %new_value)
+ declare void @llvm.nvvm.tensormap.replace.swizzle.atomicity.p3(ptr addrspace(3) %addr, i32 immarg %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.swizzle.atomicity.*``' intrinsics replace
+the ``swizzle_atomicity`` field of the tensor-map object with the swizzle
+atomicity specified by ``%new_value``.
+
+Semantics:
+""""""""""
+
+The following table shows the mapping of ``%new_value`` to the actual swizzle
+atomicity:
+
+ ================= =====
+ Swizzle Atomicity Value
+ ================= =====
+ ``16B`` 0
+ ``32B`` 1
+ ``32B + 8B flip`` 2
+ ``64B`` 3
+ ================= =====
+
+'``llvm.nvvm.tensormap.replace.fill.mode``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+ declare void @llvm.nvvm.tensormap.replace.fill.mode.p1(ptr addrspace(1) %addr, i32 immarg %new_value)
+ declare void @llvm.nvvm.tensormap.replace.fill.mode.p3(ptr addrspace(3) %addr, i32 immarg %new_value)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.tensormap.replace.fill.mode.*``' intrinsics replace the
+``fill_mode`` field of the tensor-map object with the fill mode specified by
+``%new_value``.
+
+Semantics:
+""""""""""
+
+The following table shows the mapping of ``%new_value`` to the actual fill mode:
+
+ ================ =====
+ Fill Mode Value
+ ================ =====
+ ``Zero fill`` 0
+ ``OOB-NaN fill`` 1
+ ================ =====
+
TCGEN05 family of Intrinsics
----------------------------
>From ec5f84cc2ee8de0731a5666bc0e5888eb398748f Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 17 Dec 2025 08:11:03 +0000
Subject: [PATCH 4/8] address comments
---
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index b15d1210ded32..2bd2e6fbbe63f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6257,43 +6257,43 @@ class TensormapReplaceInst_3<string state_space, string field_name,
(Intrin node:$addr, node:$ord, node:$val), predicate>
addr:$addr, i32:$ord, ValTy:$val)]>;
-foreach state_space = ["GLOBAL", "SHARED_CTA"] in {
- defvar pred = !if(!eq(state_space, "GLOBAL"), AS_match.global, AS_match.shared);
- defvar ss_ptx = !tolower(!subst("_", "::", state_space));
+foreach ss = ["GLOBAL", "SHARED_CTA"] in {
+ defvar pred = !if(!eq(ss, "GLOBAL"), AS_match.global, AS_match.shared);
+ defvar ss_ptx = !tolower(!subst("_", "::", ss));
let Predicates = [callSubtarget<"hasTensormapReplaceSupport">] in {
- def TMAP_REPLACE_TILE_GLOBAL_ADDRESS_ # state_space :
+ def TENSORMAP_REPLACE_TILE_GLOBAL_ADDRESS_ # ss :
TensormapReplaceInst_2<ss_ptx, "global_address", "b64", B64, i64,
int_nvvm_tensormap_replace_global_address, pred>;
foreach field_name = ["INTERLEAVE_LAYOUT", "FILL_MODE", "RANK"] in {
defvar intrin = !cast<Intrinsic>("int_nvvm_tensormap_replace_" # !tolower(field_name));
- def TMAP_REPLACE_TILE_ # field_name # _ # state_space :
+ def TENSORMAP_REPLACE_TILE_ # field_name # _ # ss :
TensormapReplaceInst_2<ss_ptx, !tolower(field_name), "b32", B32, i32,
intrin, pred>;
} // field_name
- def TMAP_REPLACE_TILE_GLOBAL_STRIDE_ # state_space :
+ def TENSORMAP_REPLACE_TILE_GLOBAL_STRIDE_ # ss :
TensormapReplaceInst_3<ss_ptx, "global_stride", "b64", B64, i64,
int_nvvm_tensormap_replace_global_stride, pred>;
foreach field_name = ["BOX_DIM", "GLOBAL_DIM", "ELEMENT_STRIDE"] in {
defvar intrin = !cast<Intrinsic>("int_nvvm_tensormap_replace_" # !tolower(field_name));
- def TMAP_REPLACE_TILE_ # field_name # _ # state_space :
+ def TENSORMAP_REPLACE_TILE_ # field_name # _ # ss :
TensormapReplaceInst_3<ss_ptx, !tolower(field_name), "b32", B32, i32,
intrin, pred>;
} // field_name
} // hasTensormapReplaceSupport
- def TMAP_REPLACE_TILE_ELEMTYPE_ # state_space :
+ def TENSORMAP_REPLACE_TILE_ELEMTYPE_ # ss :
TensormapReplaceInst_2<ss_ptx, "elemtype", "b32", B32, i32,
int_nvvm_tensormap_replace_elemtype, pred>;
- def TMAP_REPLACE_SWIZZLE_ATOMICITY_ # state_space :
+ def TENSORMAP_REPLACE_SWIZZLE_ATOMICITY_ # ss :
TensormapReplaceInst_2<ss_ptx, "swizzle_atomicity", "b32", B32, i32,
int_nvvm_tensormap_replace_swizzle_atomicity, pred>,
Requires<[callSubtarget<"hasTensormapReplaceSwizzleAtomicitySupport">]>;
- def TMAP_REPLACE_SWIZZLE_MODE_ # state_space :
+ def TENSORMAP_REPLACE_SWIZZLE_MODE_ # ss :
TensormapReplaceInst_2<ss_ptx, "swizzle_mode", "b32", B32, i32,
int_nvvm_tensormap_replace_swizzle_mode, pred>;
} // state_space
>From 045312bff4128c752fe641a9d880c82c73ef883b Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 17 Dec 2025 08:21:19 +0000
Subject: [PATCH 5/8] fix formatting
---
llvm/lib/IR/NVVMIntrinsicUtils.cpp | 2 +-
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/IR/NVVMIntrinsicUtils.cpp b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
index 2c939ff0ca08a..3d392a592cacc 100644
--- a/llvm/lib/IR/NVVMIntrinsicUtils.cpp
+++ b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
@@ -10,8 +10,8 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/StringRef.h"
#include "llvm/IR/NVVMIntrinsicUtils.h"
+#include "llvm/ADT/StringRef.h"
using namespace llvm;
using namespace nvvm;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a8d5da3407a67..ccd7ce90e0851 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4589,7 +4589,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
case Intrinsic::nvvm_tensormap_replace_global_address:
- case Intrinsic::nvvm_tensormap_replace_global_stride:{
+ case Intrinsic::nvvm_tensormap_replace_global_stride: {
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = MVT::i64;
Info.ptrVal = I.getArgOperand(0);
>From d64e13721a524ebcf526c40f1637ca1e362b139f Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Wed, 17 Dec 2025 12:13:30 +0000
Subject: [PATCH 6/8] remove llvm_unreachable
---
llvm/lib/IR/NVVMIntrinsicUtils.cpp | 12 ------------
1 file changed, 12 deletions(-)
diff --git a/llvm/lib/IR/NVVMIntrinsicUtils.cpp b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
index 3d392a592cacc..76816ba76e2b9 100644
--- a/llvm/lib/IR/NVVMIntrinsicUtils.cpp
+++ b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
@@ -73,8 +73,6 @@ void nvvm::printTensormapElemType(raw_ostream &OS, const Constant *ImmArgVal) {
return;
}
}
- llvm_unreachable("printTensormapElemType called with invalid value for "
- "immediate argument");
}
void nvvm::printTensormapInterleaveLayout(raw_ostream &OS,
@@ -93,9 +91,6 @@ void nvvm::printTensormapInterleaveLayout(raw_ostream &OS,
return;
}
}
- llvm_unreachable(
- "printTensormapInterleaveLayout called with invalid value for "
- "immediate argument");
}
void nvvm::printTensormapSwizzleMode(raw_ostream &OS,
@@ -120,8 +115,6 @@ void nvvm::printTensormapSwizzleMode(raw_ostream &OS,
return;
}
}
- llvm_unreachable("printTensormapSwizzleMode called with invalid value for "
- "immediate argument");
}
void nvvm::printTensormapSwizzleAtomicity(raw_ostream &OS,
@@ -143,9 +136,6 @@ void nvvm::printTensormapSwizzleAtomicity(raw_ostream &OS,
return;
}
}
- llvm_unreachable(
- "printTensormapSwizzleAtomicity called with invalid value for "
- "immediate argument");
}
void nvvm::printTensormapFillMode(raw_ostream &OS, const Constant *ImmArgVal) {
@@ -159,6 +149,4 @@ void nvvm::printTensormapFillMode(raw_ostream &OS, const Constant *ImmArgVal) {
return;
}
}
- llvm_unreachable("printTensormapFillMode called with invalid value for "
- "immediate argument");
}
>From c2f070825ce7e533030545fe2e5888265b110b1e Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Fri, 19 Dec 2025 10:43:17 +0000
Subject: [PATCH 7/8] address comments
---
llvm/docs/NVPTXUsage.rst | 4 +-
llvm/lib/IR/NVVMIntrinsicUtils.cpp | 48 ++++---------
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 70 ++++++++-----------
.../CodeGen/NVPTX/tensormap_replace.err.ll | 14 ++++
4 files changed, 61 insertions(+), 75 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 08308771de7e1..765ad8be89a29 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -2195,8 +2195,8 @@ For more information, refer
Tensormap Replace Intrinsics
----------------------------
-These intrinsics modify the fields of the tensor-map object in ``tile`` mode at
-the location specified by the address operand ``%addr``.
+These intrinsics modify the fields of the tensor-map object at ``%addr`` in
+``tile`` mode.
For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-tensormap-replace>`__.
diff --git a/llvm/lib/IR/NVVMIntrinsicUtils.cpp b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
index 76816ba76e2b9..d745a1cfb72cd 100644
--- a/llvm/lib/IR/NVVMIntrinsicUtils.cpp
+++ b/llvm/lib/IR/NVVMIntrinsicUtils.cpp
@@ -95,23 +95,13 @@ void nvvm::printTensormapInterleaveLayout(raw_ostream &OS,
void nvvm::printTensormapSwizzleMode(raw_ostream &OS,
const Constant *ImmArgVal) {
+ static constexpr StringRef TensormapSwizzleModes[] = {
+ "No swizzling", "32B swizzling", "64B swizzling", "128B swizzling",
+ "96B swizzling"};
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- switch (static_cast<TensormapSwizzleMode>(Val)) {
- case TensormapSwizzleMode::NO_SWIZZLE:
- OS << "No swizzling";
- return;
- case TensormapSwizzleMode::SWIZZLE_32B:
- OS << "32B swizzling";
- return;
- case TensormapSwizzleMode::SWIZZLE_64B:
- OS << "64B swizzling";
- return;
- case TensormapSwizzleMode::SWIZZLE_128B:
- OS << "128B swizzling";
- return;
- case TensormapSwizzleMode::SWIZZLE_96B:
- OS << "96B swizzling";
+ if (Val <= static_cast<uint64_t>(nvvm::TensormapSwizzleMode::SWIZZLE_96B)) {
+ OS << TensormapSwizzleModes[Val];
return;
}
}
@@ -119,20 +109,13 @@ void nvvm::printTensormapSwizzleMode(raw_ostream &OS,
void nvvm::printTensormapSwizzleAtomicity(raw_ostream &OS,
const Constant *ImmArgVal) {
+ static constexpr StringRef TensormapSwizzleAtomicities[] = {
+ "16B", "32B", "32B + 8B flip", "64B"};
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- switch (static_cast<TensormapSwizzleAtomicity>(Val)) {
- case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_16B:
- OS << "16B";
- return;
- case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_32B:
- OS << "32B";
- return;
- case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_32B_FLIP_8B:
- OS << "32B + 8B flip";
- return;
- case TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_64B:
- OS << "64B";
+ if (Val <= static_cast<uint64_t>(
+ nvvm::TensormapSwizzleAtomicity::SWIZZLE_ATOMICITY_64B)) {
+ OS << TensormapSwizzleAtomicities[Val];
return;
}
}
@@ -141,12 +124,9 @@ void nvvm::printTensormapSwizzleAtomicity(raw_ostream &OS,
void nvvm::printTensormapFillMode(raw_ostream &OS, const Constant *ImmArgVal) {
if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) {
uint64_t Val = CI->getZExtValue();
- if (Val == static_cast<uint64_t>(TensormapFillMode::ZERO_FILL)) {
- OS << "Zero fill";
- return;
- } else if (Val == static_cast<uint64_t>(TensormapFillMode::OOB_NAN_FILL)) {
- OS << "OOB-NaN fill";
- return;
- }
+ OS << (Val == static_cast<uint64_t>(TensormapFillMode::ZERO_FILL)
+ ? "Zero fill"
+ : "OOB-NaN fill");
+ return;
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ccd7ce90e0851..2e85d602c6264 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2746,60 +2746,52 @@ lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
return {{BuildVector, Chain}};
}
-static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG) {
+static SDValue invalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG,
+ unsigned Val) {
SDNode *N = Op.getNode();
SDLoc DL(N);
- unsigned Val = N->getConstantOperandVal(3);
- if (!DAG.getSubtarget<NVPTXSubtarget>().hasTensormapReplaceElemtypeSupport(
- Val)) {
- const Function &Fn = DAG.getMachineFunction().getFunction();
+ const Function &Fn = DAG.getMachineFunction().getFunction();
- unsigned AS = 0;
- if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N)) {
- AS = MemN->getAddressSpace();
- }
- Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
- Module *M = DAG.getMachineFunction().getFunction().getParent();
+ unsigned AS = 0;
+ if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N))
+ AS = MemN->getAddressSpace();
+ Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
+ Module *M = DAG.getMachineFunction().getFunction().getParent();
- DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
- Fn,
- "Intrinsic " +
- Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
- " with elemtype " + Twine(Val) +
- " is not supported on the given target.",
- DL.getDebugLoc()));
- return Op.getOperand(0);
- }
+ DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
+ Fn,
+ "Intrinsic " +
+ Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
+ " with value " + Twine(Val) +
+ " is not supported on the given target.",
+ DL.getDebugLoc()));
+ return Op.getOperand(0);
+}
+
+static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG) {
+ SDNode *N = Op.getNode();
+ SDLoc DL(N);
+
+ // immediate argument representing elemtype
+ unsigned Val = N->getConstantOperandVal(3);
+ if (!DAG.getSubtarget<NVPTXSubtarget>().hasTensormapReplaceElemtypeSupport(
+ Val))
+ return invalidTensormapReplaceUsage(Op, DAG, Val);
return Op;
}
static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG) {
SDNode *N = Op.getNode();
SDLoc DL(N);
+
+ // immediate argument representing swizzle mode
unsigned Val = N->getConstantOperandVal(3);
if (!DAG.getSubtarget<NVPTXSubtarget>().hasTensormapReplaceSwizzleModeSupport(
- Val)) {
- const Function &Fn = DAG.getMachineFunction().getFunction();
-
- unsigned AS = 0;
- if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N)) {
- AS = MemN->getAddressSpace();
- }
- Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
- Module *M = DAG.getMachineFunction().getFunction().getParent();
-
- DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
- Fn,
- "Intrinsic " +
- Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
- " with swizzle mode " + Twine(Val) +
- " is not supported on the given target.",
- DL.getDebugLoc()));
- return Op.getOperand(0);
- }
+ Val))
+ return invalidTensormapReplaceUsage(Op, DAG, Val);
return Op;
}
diff --git a/llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll b/llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll
new file mode 100644
index 0000000000000..3b266af4e0011
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: not llc -mtriple=nvptx64 -mcpu=sm_90a -filetype=null %s 2>&1 | FileCheck %s
+
+define void @tensormap_replace_elemtype_error(ptr addrspace(1) %addr) {
+ ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.elemtype.p1 with value 13 is not supported on the given target.
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %addr, i32 13)
+ ret void
+}
+
+define void @tensormap_replace_swizzle_mode_error(ptr addrspace(1) %addr) {
+ ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.swizzle.mode.p1 with value 4 is not supported on the given target.
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %addr, i32 4)
+ ret void
+}
>From c8a3a429045e66fb3c066c98a6424ac59e1274bd Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi <srinivasar at nvidia.com>
Date: Fri, 19 Dec 2025 12:49:31 +0000
Subject: [PATCH 8/8] address comments
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 9 ++++---
.../CodeGen/NVPTX/tensormap_replace.err.ll | 14 ----------
.../NVPTX/tensormap_replace_invalid.ll | 26 +++++++++++++++++++
3 files changed, 31 insertions(+), 18 deletions(-)
delete mode 100644 llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll
create mode 100644 llvm/test/CodeGen/NVPTX/tensormap_replace_invalid.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2e85d602c6264..442a592187818 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2746,8 +2746,8 @@ lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
return {{BuildVector, Chain}};
}
-static SDValue invalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG,
- unsigned Val) {
+static SDValue reportInvalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG,
+ unsigned Val) {
SDNode *N = Op.getNode();
SDLoc DL(N);
@@ -2778,7 +2778,8 @@ static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG) {
if (!DAG.getSubtarget<NVPTXSubtarget>().hasTensormapReplaceElemtypeSupport(
Val))
- return invalidTensormapReplaceUsage(Op, DAG, Val);
+ return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
+
return Op;
}
@@ -2791,7 +2792,7 @@ static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG) {
if (!DAG.getSubtarget<NVPTXSubtarget>().hasTensormapReplaceSwizzleModeSupport(
Val))
- return invalidTensormapReplaceUsage(Op, DAG, Val);
+ return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
return Op;
}
diff --git a/llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll b/llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll
deleted file mode 100644
index 3b266af4e0011..0000000000000
--- a/llvm/test/CodeGen/NVPTX/tensormap_replace.err.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: not llc -mtriple=nvptx64 -mcpu=sm_90a -filetype=null %s 2>&1 | FileCheck %s
-
-define void @tensormap_replace_elemtype_error(ptr addrspace(1) %addr) {
- ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.elemtype.p1 with value 13 is not supported on the given target.
- call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %addr, i32 13)
- ret void
-}
-
-define void @tensormap_replace_swizzle_mode_error(ptr addrspace(1) %addr) {
- ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.swizzle.mode.p1 with value 4 is not supported on the given target.
- call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %addr, i32 4)
- ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/tensormap_replace_invalid.ll b/llvm/test/CodeGen/NVPTX/tensormap_replace_invalid.ll
new file mode 100644
index 0000000000000..c3106172c9098
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/tensormap_replace_invalid.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: not llc -mtriple=nvptx64 -mcpu=sm_90a -filetype=null %s 2>&1 | FileCheck %s
+
+define void @tensormap_replace_elemtype_error_global(ptr addrspace(1) %addr) {
+ ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.elemtype.p1 with value 13 is not supported on the given target.
+ call void @llvm.nvvm.tensormap.replace.elemtype.p1(ptr addrspace(1) %addr, i32 13)
+ ret void
+}
+
+define void @tensormap_replace_elemtype_error_shared(ptr addrspace(3) %addr) {
+ ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.elemtype.p3 with value 13 is not supported on the given target.
+ call void @llvm.nvvm.tensormap.replace.elemtype.p3(ptr addrspace(3) %addr, i32 13)
+ ret void
+}
+
+define void @tensormap_replace_swizzle_mode_error_global(ptr addrspace(1) %addr) {
+ ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.swizzle.mode.p1 with value 4 is not supported on the given target.
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p1(ptr addrspace(1) %addr, i32 4)
+ ret void
+}
+
+define void @tensormap_replace_swizzle_mode_error_shared(ptr addrspace(3) %addr) {
+ ; CHECK: Intrinsic llvm.nvvm.tensormap.replace.swizzle.mode.p3 with value 4 is not supported on the given target.
+ call void @llvm.nvvm.tensormap.replace.swizzle.mode.p3(ptr addrspace(3) %addr, i32 4)
+ ret void
+}
More information about the llvm-commits
mailing list