[llvm] [NVPTX] Fix lit test issue from used_bytes_mask (PR #171220)
Drew Kersnar via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 8 16:46:40 PST 2025
https://github.com/dakersnar updated https://github.com/llvm/llvm-project/pull/171220
>From d01d16c4d7ee768fd6e4a7e0439e3b4e5c68e8fc Mon Sep 17 00:00:00 2001
From: Drew Kersnar <dkersnar at nvidia.com>
Date: Tue, 9 Dec 2025 00:46:11 +0000
Subject: [PATCH] New fix with test updates
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 15 ++++++--
llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 3 ++
.../test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 1 -
.../CodeGen/NVPTX/param-vectorize-device.ll | 2 -
llvm/test/CodeGen/NVPTX/used-bytes-mask.ll | 38 +++++++++++++++++++
5 files changed, 52 insertions(+), 7 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/used-bytes-mask.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 5081a093d4c34..b88978a50ac16 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -3453,7 +3453,8 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
}
static std::pair<MemSDNode *, uint32_t>
-convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) {
+convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG,
+ const NVPTXSubtarget &STI) {
SDValue Chain = N->getOperand(0);
SDValue BasePtr = N->getOperand(1);
SDValue Mask = N->getOperand(3);
@@ -3495,6 +3496,11 @@ convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) {
MemSDNode *NewLD = cast<MemSDNode>(
DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode());
+ // If our subtarget does not support the used bytes mask pragma, "drop" the
+ // mask by setting it to UINT32_MAX (all bytes marked as used).
+ if (!STI.hasUsedBytesMaskPragma())
+ UsedBytesMask = UINT32_MAX;
+
return {NewLD, UsedBytesMask};
}
@@ -3531,7 +3537,8 @@ replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
// If we have a masked load, convert it to a normal load now
std::optional<uint32_t> UsedBytesMask = std::nullopt;
if (LD->getOpcode() == ISD::MLOAD)
- std::tie(LD, UsedBytesMask) = convertMLOADToLoadWithUsedBytesMask(LD, DAG);
+ std::tie(LD, UsedBytesMask) =
+ convertMLOADToLoadWithUsedBytesMask(LD, DAG, STI);
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
@@ -3667,8 +3674,8 @@ SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
// them here.
EVT VT = Op.getValueType();
if (NVPTX::isPackedVectorTy(VT)) {
- auto Result =
- convertMLOADToLoadWithUsedBytesMask(cast<MemSDNode>(Op.getNode()), DAG);
+ auto Result = convertMLOADToLoadWithUsedBytesMask(
+ cast<MemSDNode>(Op.getNode()), DAG, STI);
MemSDNode *LD = std::get<0>(Result);
uint32_t UsedBytesMask = std::get<1>(Result);
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index f11d331862081..9b9f871549047 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -89,6 +89,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
}
+ bool hasUsedBytesMaskPragma() const {
+ return SmVersion >= 50 && PTXVersion >= 83;
+ }
bool hasAtomAddF64() const { return SmVersion >= 60; }
bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index a75ddd032d4c0..19ec2574e32b4 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -50,7 +50,6 @@ define half @fh(ptr %p) {
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
-; ENABLED-NEXT: .pragma "used_bytes_mask 0x3ff";
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
index 643de006f14c4..4870050dd2d43 100644
--- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
+++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
@@ -171,7 +171,6 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
- ; CHECK: .pragma "used_bytes_mask 0xfff";
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
@@ -394,7 +393,6 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
- ; CHECK: .pragma "used_bytes_mask 0xfff";
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
diff --git a/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll
new file mode 100644
index 0000000000000..a888d9996a500
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/used-bytes-mask.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | FileCheck %s -check-prefixes=NOMASK
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | %ptxas-verify -arch=sm_90 %}
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck %s -check-prefixes=MASK
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}
+
+; On older architectures and PTX versions, the used_bytes_mask pragma must not be emitted.
+; Specifically, the pragma is only supported on SM_50 or later with PTX 8.3 or later.
+; The architecture is fixed at SM_90 for stability; only the PTX version varies (8.2 vs. 8.3) to test the pragma.
+
+define i32 @global_8xi32(ptr %a, ptr %b) {
+; NOMASK-LABEL: global_8xi32(
+; NOMASK: {
+; NOMASK-NEXT: .reg .b32 %r<5>;
+; NOMASK-NEXT: .reg .b64 %rd<2>;
+; NOMASK-EMPTY:
+; NOMASK-NEXT: // %bb.0:
+; NOMASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; NOMASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; NOMASK-NEXT: st.param.b32 [func_retval0], %r1;
+; NOMASK-NEXT: ret;
+;
+; MASK-LABEL: global_8xi32(
+; MASK: {
+; MASK-NEXT: .reg .b32 %r<5>;
+; MASK-NEXT: .reg .b64 %rd<2>;
+; MASK-EMPTY:
+; MASK-NEXT: // %bb.0:
+; MASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
+; MASK-NEXT: .pragma "used_bytes_mask 0xfff";
+; MASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; MASK-NEXT: st.param.b32 [func_retval0], %r1;
+; MASK-NEXT: ret;
+ %a.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %a, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison)
+ %first = extractelement <4 x i32> %a.load, i32 0
+ ret i32 %first
+}
+declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr , <4 x i1>, <4 x i32>)
More information about the llvm-commits
mailing list