[llvm] [NVPTX] Use PTX prmt for llvm.bswap (PR #85545)
Alex MacLean via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 18 13:13:57 PDT 2024
https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/85545
>From 33517787e1aad0bb3bc0917d2d2069db64bd9de6 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Mon, 12 Feb 2024 23:43:46 +0000
Subject: [PATCH 1/2] [NVPTX] Use PTX prmt for llvm.bswap
---
llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 3 --
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 19 ++++++++
llvm/test/CodeGen/NVPTX/bswap.ll | 54 +++++++++++++++++++++
3 files changed, 73 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/NVPTX/bswap.ll
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7ad3d99ab71f20..e1cdbd6a4b4dc6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -580,9 +580,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::i8, Expand);
setOperationAction(ISD::ROTR, MVT::i8, Expand);
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
- setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
- setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- setOperationAction(ISD::BSWAP, MVT::i64, Expand);
// Indirect branch is not supported.
// This also disables Jump Table creation.
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3d387ed574fabc..4daf47cfd4df8a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3549,6 +3549,11 @@ let hasSideEffects = false in {
(ins Int64Regs:$s),
"{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
[]>;
+ def I64toI32L : NVPTXInst<(outs Int32Regs:$low), // counterpart of the $high extractor defined just above: produces one 32-bit piece of $s
+ (ins Int64Regs:$s),
+ "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}", // unpacks $s into {$low, tmp}; tmp is a scratch .b32 scoped to the braces and never returned
+ []>; // empty pattern list: this instruction is only referenced explicitly (see the i64 bswap Pat in this patch)
+
}
// Using partial vectorized move produces better SASS code for extraction of
@@ -3838,3 +3843,17 @@ include "NVPTXIntrinsics.td"
// - for sm_20, use pmpt (use vector scalar mov to get the pack and
// unpack). sm_20 supports native 32-bit register, but not native 16-bit
// register.
+
+def : Pat < // Select a 32-bit bswap as a single INT_NVVM_PRMT.
+ (i32 (bswap i32:$a)),
+ (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>; // operands: $a, constant 0, selector 0x0123 (emitted as decimal 291 in bswap.ll). NOTE(review): nibbles 3,2,1,0 should reverse the four bytes of $a - confirm against the PTX prmt spec
+
+def : Pat < // Select a v2i16 bswap as a single INT_NVVM_PRMT on the 32-bit register holding both lanes.
+ (v2i16 (bswap v2i16:$a)),
+ (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>; // operands: $a, constant 0, selector 0x2301 (emitted as decimal 8961 in bswap.ll). NOTE(review): presumably swaps the two bytes inside each 16-bit lane without moving the lanes - confirm against the PTX prmt spec
+
+def : Pat < // Select a 64-bit bswap as: split into two 32-bit pieces, permute each with INT_NVVM_PRMT, then repack in exchanged order.
+ (i64 (bswap i64:$a)),
+ (V2I32toI64 // NOTE(review): assumes the first V2I32toI64 operand becomes the low word of the result - confirm against its definition
+ (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)), // permuted piece taken via I64toI32H is passed first
+ (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>; // permuted piece taken via I64toI32L is passed second, so the two pieces trade places
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
new file mode 100644
index 00000000000000..7f65568c1d517b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+; CHECK-LABEL: @bswap16
+; CHECK: ld.param.u16 %rs1, [bswap16_param_0];
+; CHECK: shr.u16 %rs2, %rs1, 8;
+; CHECK: shl.b16 %rs3, %rs1, 8;
+; CHECK: or.b16 %rs4, %rs3, %rs2;
+; CHECK: cvt.u32.u16 %r1, %rs4;
+; CHECK: st.param.b32 [func_retval0+0], %r1;
+
+define i16 @bswap16(i16 %a) {
+ %b = tail call i16 @llvm.bswap.i16(i16 %a)
+ ret i16 %b
+}
+
+; CHECK-LABEL: @bswap32
+; CHECK: ld.param.u32 %r1, [bswap32_param_0];
+; CHECK: prmt.b32 %r2, %r1, 0, 291;
+; CHECK: st.param.b32 [func_retval0+0], %r2;
+
+define i32 @bswap32(i32 %a) {
+ %b = tail call i32 @llvm.bswap.i32(i32 %a)
+ ret i32 %b
+}
+
+; CHECK-LABEL: @bswapv2i16
+; CHECK: ld.param.u32 %r1, [bswapv2i16_param_0];
+; CHECK: prmt.b32 %r2, %r1, 0, 8961;
+; CHECK: st.param.b32 [func_retval0+0], %r2;
+
+define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
+ %b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
+ ret <2 x i16> %b
+}
+
+; CHECK-LABEL: @bswap64
+; CHECK: ld.param.u64 %rd1, [bswap64_param_0];
+; CHECK: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK: prmt.b32 %r2, %r1, 0, 291;
+; CHECK: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
+; CHECK: prmt.b32 %r4, %r3, 0, 291;
+; CHECK: mov.b64 %rd2, {%r4, %r2};
+; CHECK: st.param.b64 [func_retval0+0], %rd2;
+
+define i64 @bswap64(i64 %a) {
+ %b = tail call i64 @llvm.bswap.i64(i64 %a)
+ ret i64 %b
+}
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
+declare i64 @llvm.bswap.i64(i64)
>From 775278434604f87c14c4a5f2c70c6c2946278759 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Mon, 18 Mar 2024 20:13:31 +0000
Subject: [PATCH 2/2] address comments
---
llvm/test/CodeGen/NVPTX/bswap.ll | 71 +++++++++++++++++++++-----------
1 file changed, 47 insertions(+), 24 deletions(-)
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index 7f65568c1d517b..3f929ec6a75d0a 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -1,49 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; CHECK-LABEL: @bswap16
-; CHECK: ld.param.u16 %rs1, [bswap16_param_0];
-; CHECK: shr.u16 %rs2, %rs1, 8;
-; CHECK: shl.b16 %rs3, %rs1, 8;
-; CHECK: or.b16 %rs4, %rs3, %rs2;
-; CHECK: cvt.u32.u16 %r1, %rs4;
-; CHECK: st.param.b32 [func_retval0+0], %r1;
+target triple = "nvptx64-nvidia-cuda"
define i16 @bswap16(i16 %a) {
+; CHECK-LABEL: bswap16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u16 %rs1, [bswap16_param_0];
+; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
+; CHECK-NEXT: shl.b16 %rs3, %rs1, 8;
+; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2;
+; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT: ret;
%b = tail call i16 @llvm.bswap.i16(i16 %a)
ret i16 %b
}
-; CHECK-LABEL: @bswap32
-; CHECK: ld.param.u32 %r1, [bswap32_param_0];
-; CHECK: prmt.b32 %r2, %r1, 0, 291;
-; CHECK: st.param.b32 [func_retval0+0], %r2;
define i32 @bswap32(i32 %a) {
+; CHECK-LABEL: bswap32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bswap32_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
%b = tail call i32 @llvm.bswap.i32(i32 %a)
ret i32 %b
}
-; CHECK-LABEL: @bswapv2i16
-; CHECK: ld.param.u32 %r1, [bswapv2i16_param_0];
-; CHECK: prmt.b32 %r2, %r1, 0, 8961;
-; CHECK: st.param.b32 [func_retval0+0], %r2;
define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
+; CHECK-LABEL: bswapv2i16(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u32 %r1, [bswapv2i16_param_0];
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 8961;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT: ret;
%b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
ret <2 x i16> %b
}
-; CHECK-LABEL: @bswap64
-; CHECK: ld.param.u64 %rd1, [bswap64_param_0];
-; CHECK: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
-; CHECK: prmt.b32 %r2, %r1, 0, 291;
-; CHECK: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; CHECK: prmt.b32 %r4, %r3, 0, 291;
-; CHECK: mov.b64 %rd2, {%r4, %r2};
-; CHECK: st.param.b64 [func_retval0+0], %rd2;
-
define i64 @bswap64(i64 %a) {
+; CHECK-LABEL: bswap64(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [bswap64_param_0];
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
+; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 291;
+; CHECK-NEXT: mov.b64 %rd2, {%r4, %r2};
+; CHECK-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; CHECK-NEXT: ret;
%b = tail call i64 @llvm.bswap.i64(i64 %a)
ret i64 %b
}
More information about the llvm-commits
mailing list