[llvm] [NVPTX] Use PRMT instruction to lower i16 bswap (PR #168968)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 21 16:32:27 PST 2025
https://github.com/Chengjunp updated https://github.com/llvm/llvm-project/pull/168968
>From c396152627656c1d53fcf115a0d64e3b1c5da87f Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Thu, 20 Nov 2025 23:21:59 +0000
Subject: [PATCH 1/5] Use PRMT instruction to lower i16 bswap
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 --
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 ++++
llvm/test/CodeGen/NVPTX/bswap.ll | 14 +++++++-------
3 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 2f1a7ad2d401f..9de643497ecb4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -711,8 +711,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
Custom);
}
- setOperationAction(ISD::BSWAP, MVT::i16, Expand);
-
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index dfde0cca0f00c..b69aa359cb725 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2455,6 +2455,10 @@ include "NVPTXIntrinsics.td"
// unpack). sm_20 supports native 32-bit register, but not native 16-bit
// register.
+def : Pat <
+ (i16 (bswap i16:$a)),
+ (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)), (i32 0), (i32 0x0001), PrmtNONE), CvtNONE))>;
+
def : Pat <
(i32 (bswap i32:$a)),
(PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>;
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index e3d1c80922609..a12deed544642 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -10,16 +10,16 @@ target triple = "nvptx64-nvidia-cuda"
define i16 @bswap16(i16 %a) {
; CHECK-LABEL: bswap16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<5>;
-; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-NEXT: .reg .b16 %rs<3>;
+; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0];
-; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
-; CHECK-NEXT: shl.b16 %rs3, %rs1, 8;
-; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2;
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x1U;
+; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
; CHECK-NEXT: ret;
%b = tail call i16 @llvm.bswap.i16(i16 %a)
ret i16 %b
>From 2de19db36471c140404a06234426d101af9ffaed Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Fri, 21 Nov 2025 00:30:53 +0000
Subject: [PATCH 2/5] Format
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 2654d3fbef6f6..2bcf4120c7ad2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2477,9 +2477,10 @@ include "NVPTXIntrinsics.td"
// unpack). sm_20 supports native 32-bit register, but not native 16-bit
// register.
-def : Pat <
- (i16 (bswap i16:$a)),
- (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)), (i32 0), (i32 0x0001), PrmtNONE), CvtNONE))>;
+def : Pat<(i16 (bswap i16:$a)),
+ (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)),
+ (i32 0), (i32 0x0001), PrmtNONE),
+ CvtNONE))>;
def : Pat <
(i32 (bswap i32:$a)),
>From adec39963b75953e24cc6a4317fe51356198291a Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Fri, 21 Nov 2025 22:44:22 +0000
Subject: [PATCH 3/5] Lowering bswap in operation legalization
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 48 ++++++++++++++++++++-
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 35 ++-------------
llvm/test/CodeGen/NVPTX/bswap.ll | 16 +++----
3 files changed, 56 insertions(+), 43 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index cc675ff6ff7c7..67d5f99f958d8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1104,6 +1104,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// * MVT::Other - internal.addrspace.wrap
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
+
+ // Custom lowering for bswap
+ setOperationAction(ISD::BSWAP, MVT::i16, Custom);
+ setOperationAction(ISD::BSWAP, MVT::i32, Custom);
+ setOperationAction(ISD::BSWAP, MVT::i64, Custom);
+ setOperationAction(ISD::BSWAP, MVT::v2i16, Custom);
}
TargetLoweringBase::LegalizeTypeAction
@@ -2568,6 +2574,45 @@ static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
return Tcgen05StNode;
}
+static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue Src = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+
+ if (VT == MVT::i16) {
+ SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+ SDValue Swapped =
+ getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
+ }
+
+ if (VT == MVT::i32) {
+ return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
+ }
+
+ if (VT == MVT::v2i16) {
+ SDValue Converted = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
+ SDValue Swapped =
+ getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
+ }
+
+ if (VT == MVT::i64) {
+ SDValue Low = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue High = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src,
+ DAG.getIntPtrConstant(1, DL));
+ SDValue SwappedLow =
+ getPRMT(Low, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
+ SDValue SwappedHigh =
+ getPRMT(High, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
+ return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
+ {SwappedHigh, SwappedLow});
+ }
+
+ llvm_unreachable("unsupported type for bswap");
+}
+
static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
switch (IID) {
case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
@@ -3191,7 +3236,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerCTLZCTPOP(Op, DAG);
case ISD::FREM:
return lowerFREM(Op, DAG);
-
+ case ISD::BSWAP:
+ return lowerBSWAP(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 2bcf4120c7ad2..68c6e318a8dd7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2471,38 +2471,9 @@ include "NVPTXIntrinsics.td"
//-----------------------------------
// Notes
//-----------------------------------
-// BSWAP is currently expanded. The following is a more efficient
-// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
-// - for sm_20, use pmpt (use vector scalar mov to get the pack and
-// unpack). sm_20 supports native 32-bit register, but not native 16-bit
-// register.
-
-def : Pat<(i16 (bswap i16:$a)),
- (i16 (CVT_u16_u32 (PRMT_B32rii (i32 (CVT_u32_u16 $a, CvtNONE)),
- (i32 0), (i32 0x0001), PrmtNONE),
- CvtNONE))>;
-
-def : Pat <
- (i32 (bswap i32:$a)),
- (PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>;
-
-def : Pat <
- (v2i16 (bswap v2i16:$a)),
- (PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>;
-
-def : Pat <
- (i64 (bswap i64:$a)),
- (V2I32toI64
- (PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE),
- (PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>,
- Requires<[hasPTX<71>]>;
-
-// Fall back to the old way if we don't have PTX 7.1.
-def : Pat <
- (i64 (bswap i64:$a)),
- (V2I32toI64
- (PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE),
- (PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>;
+// BSWAP is currently custom-lowered during operation legalization in
+// NVPTXISelLowering.cpp.
+// See the lowerBSWAP function in NVPTXISelLowering.cpp for details.
////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index a12deed544642..a0bcf0056651c 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -10,16 +10,12 @@ target triple = "nvptx64-nvidia-cuda"
define i16 @bswap16(i16 %a) {
; CHECK-LABEL: bswap16(
; CHECK: {
-; CHECK-NEXT: .reg .b16 %rs<3>;
-; CHECK-NEXT: .reg .b32 %r<4>;
+; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0];
-; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
-; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x1U;
-; CHECK-NEXT: cvt.u16.u32 %rs2, %r2;
-; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT: ld.param.b16 %r1, [bswap16_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7701U;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
; CHECK-NEXT: ret;
%b = tail call i16 @llvm.bswap.i16(i16 %a)
ret i16 %b
@@ -63,7 +59,7 @@ define i64 @bswap64(i64 %a) {
; PTX70-EMPTY:
; PTX70-NEXT: // %bb.0:
; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
-; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; PTX70-NEXT: cvt.u32.u64 %r1, %rd1;
; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
@@ -78,7 +74,7 @@ define i64 @bswap64(i64 %a) {
; PTX71-EMPTY:
; PTX71-NEXT: // %bb.0:
; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
-; PTX71-NEXT: mov.b64 {%r1, _}, %rd1;
+; PTX71-NEXT: cvt.u32.u64 %r1, %rd1;
; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
>From 7af6fa42404f9cc398aa12bd33cfb4a6ddd3ebc1 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Fri, 21 Nov 2025 23:48:28 +0000
Subject: [PATCH 4/5] Update code & test
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 39 ++++++++---------
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 8 ----
llvm/test/CodeGen/NVPTX/bswap.ll | 47 ++++++---------------
3 files changed, 32 insertions(+), 62 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 67d5f99f958d8..454a237b1be78 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1106,10 +1106,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
{MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
// Custom lowering for bswap
- setOperationAction(ISD::BSWAP, MVT::i16, Custom);
- setOperationAction(ISD::BSWAP, MVT::i32, Custom);
- setOperationAction(ISD::BSWAP, MVT::i64, Custom);
- setOperationAction(ISD::BSWAP, MVT::v2i16, Custom);
+ setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
+ Custom);
}
TargetLoweringBase::LegalizeTypeAction
@@ -2579,38 +2577,37 @@ static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) {
SDValue Src = Op.getOperand(0);
EVT VT = Op.getValueType();
- if (VT == MVT::i16) {
- SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i16: {
+ SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
SDValue Swapped =
getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
}
-
- if (VT == MVT::i32) {
+ case MVT::i32: {
return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
}
-
- if (VT == MVT::v2i16) {
- SDValue Converted = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
+ case MVT::v2i16: {
+ SDValue Converted = DAG.getBitcast(MVT::i32, Src);
SDValue Swapped =
getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
}
-
- if (VT == MVT::i64) {
- SDValue Low = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src,
- DAG.getIntPtrConstant(0, DL));
- SDValue High = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Src,
- DAG.getIntPtrConstant(1, DL));
+ case MVT::i64: {
+ SDValue UnpackSrc =
+ DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
SDValue SwappedLow =
- getPRMT(Low, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
+ getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
+ DL, DAG);
SDValue SwappedHigh =
- getPRMT(High, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
+ getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
+ DL, DAG);
return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
{SwappedHigh, SwappedLow});
}
-
- llvm_unreachable("unsupported type for bswap");
+ default:
+ llvm_unreachable("unsupported type for bswap");
+ }
}
static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 68c6e318a8dd7..04e2dd435cdf0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2468,14 +2468,6 @@ let Predicates = [hasPTX<73>, hasSM<52>] in {
include "NVPTXIntrinsics.td"
-//-----------------------------------
-// Notes
-//-----------------------------------
-// BSWAP is currently custom-lowered during operation legalization in
-// NVPTXISelLowering.cpp.
-// See the lowerBSWAP function in NVPTXISelLowering.cpp for details.
-
-
////////////////////////////////////////////////////////////////////////////////
// PTX Fence instructions
////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index a0bcf0056651c..1e6f95a6201d2 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -1,9 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
-; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
target triple = "nvptx64-nvidia-cuda"
@@ -52,35 +49,19 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
}
define i64 @bswap64(i64 %a) {
-; PTX70-LABEL: bswap64(
-; PTX70: {
-; PTX70-NEXT: .reg .b32 %r<5>;
-; PTX70-NEXT: .reg .b64 %rd<3>;
-; PTX70-EMPTY:
-; PTX70-NEXT: // %bb.0:
-; PTX70-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
-; PTX70-NEXT: cvt.u32.u64 %r1, %rd1;
-; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
-; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; PTX70-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
-; PTX70-NEXT: mov.b64 %rd2, {%r4, %r2};
-; PTX70-NEXT: st.param.b64 [func_retval0], %rd2;
-; PTX70-NEXT: ret;
-;
-; PTX71-LABEL: bswap64(
-; PTX71: {
-; PTX71-NEXT: .reg .b32 %r<5>;
-; PTX71-NEXT: .reg .b64 %rd<3>;
-; PTX71-EMPTY:
-; PTX71-NEXT: // %bb.0:
-; PTX71-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
-; PTX71-NEXT: cvt.u32.u64 %r1, %rd1;
-; PTX71-NEXT: prmt.b32 %r2, %r1, 0, 0x123U;
-; PTX71-NEXT: mov.b64 {_, %r3}, %rd1;
-; PTX71-NEXT: prmt.b32 %r4, %r3, 0, 0x123U;
-; PTX71-NEXT: mov.b64 %rd2, {%r4, %r2};
-; PTX71-NEXT: st.param.b64 [func_retval0], %rd2;
-; PTX71-NEXT: ret;
+; CHECK-LABEL: bswap64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
+; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x123U;
+; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x123U;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT: ret;
%b = tail call i64 @llvm.bswap.i64(i64 %a)
ret i64 %b
}
>From cabb52cd46ee833760d57d26785d4803e8a58ba3 Mon Sep 17 00:00:00 2001
From: chengjunp <chengjunp at nvidia.com>
Date: Sat, 22 Nov 2025 00:32:13 +0000
Subject: [PATCH 5/5] Add v2i32 test
---
llvm/test/CodeGen/NVPTX/bswap.ll | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index 1e6f95a6201d2..8050c6f1c7031 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -66,7 +66,22 @@ define i64 @bswap64(i64 %a) {
ret i64 %b
}
+define <2 x i32> @bswapv2i32(<2 x i32> %a) {
+; CHECK-LABEL: bswapv2i32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0];
+; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x123U;
+; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x123U;
+; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NEXT: ret;
+ %b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+ ret <2 x i32> %b
+}
declare i16 @llvm.bswap.i16(i16)
declare i32 @llvm.bswap.i32(i32)
declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
declare i64 @llvm.bswap.i64(i64)
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
More information about the llvm-commits
mailing list