[llvm] [NVPTX] Use PTX prmt for llvm.bswap (PR #85545)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 18 13:13:57 PDT 2024


https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/85545

>From 33517787e1aad0bb3bc0917d2d2069db64bd9de6 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Mon, 12 Feb 2024 23:43:46 +0000
Subject: [PATCH 1/2] [NVPTX] Use PTX prmt for llvm.bswap

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp |  3 --
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     | 19 ++++++++
 llvm/test/CodeGen/NVPTX/bswap.ll            | 54 +++++++++++++++++++++
 3 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/bswap.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7ad3d99ab71f20..e1cdbd6a4b4dc6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -580,9 +580,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::ROTL, MVT::i8, Expand);
   setOperationAction(ISD::ROTR, MVT::i8, Expand);
   setOperationAction(ISD::BSWAP, MVT::i16, Expand);
-  setOperationAction(ISD::BSWAP, MVT::v2i16, Expand);
-  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
-  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
 
   // Indirect branch is not supported.
   // This also disables Jump Table creation.
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 3d387ed574fabc..4daf47cfd4df8a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3549,6 +3549,11 @@ let hasSideEffects = false in {
                              (ins Int64Regs:$s),
                              "{{ .reg .b32 tmp; mov.b64 {tmp, $high}, $s; }}",
                              []>;
+  def I64toI32L  : NVPTXInst<(outs Int32Regs:$low),
+                             (ins Int64Regs:$s),
+                             "{{ .reg .b32 tmp; mov.b64 {$low, tmp}, $s; }}",
+                             []>;
+
 }
 
 // Using partial vectorized move produces better SASS code for extraction of
@@ -3838,3 +3843,17 @@ include "NVPTXIntrinsics.td"
 // - for sm_20, use pmpt (use vector scalar mov to get the pack and
 //   unpack). sm_20 supports native 32-bit register, but not native 16-bit
 // register.
+
+def : Pat <
+  (i32 (bswap i32:$a)),
+  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x0123))>;
+
+def : Pat <
+  (v2i16 (bswap v2i16:$a)),
+  (INT_NVVM_PRMT Int32Regs:$a, (i32 0), (i32 0x2301))>;
+
+def : Pat <
+  (i64 (bswap i64:$a)),
+  (V2I32toI64
+    (INT_NVVM_PRMT (I64toI32H Int64Regs:$a), (i32 0), (i32 0x0123)),
+    (INT_NVVM_PRMT (I64toI32L Int64Regs:$a), (i32 0), (i32 0x0123)))>;
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
new file mode 100644
index 00000000000000..7f65568c1d517b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+; CHECK-LABEL: @bswap16
+; CHECK:   ld.param.u16    %rs1, [bswap16_param_0];
+; CHECK:   shr.u16         %rs2, %rs1, 8;
+; CHECK:   shl.b16         %rs3, %rs1, 8;
+; CHECK:   or.b16          %rs4, %rs3, %rs2;
+; CHECK:   cvt.u32.u16     %r1, %rs4;
+; CHECK:   st.param.b32    [func_retval0+0], %r1;
+
+define i16 @bswap16(i16 %a) {
+  %b = tail call i16 @llvm.bswap.i16(i16 %a)
+  ret i16 %b
+}
+
+; CHECK-LABEL: @bswap32
+; CHECK:    ld.param.u32    %r1, [bswap32_param_0];
+; CHECK:    prmt.b32        %r2, %r1, 0, 291;
+; CHECK:    st.param.b32    [func_retval0+0], %r2;
+
+define i32 @bswap32(i32 %a) {
+  %b = tail call i32 @llvm.bswap.i32(i32 %a)
+  ret i32 %b
+}
+
+; CHECK-LABEL: @bswapv2i16
+; CHECK:    ld.param.u32    %r1, [bswapv2i16_param_0];
+; CHECK:    prmt.b32        %r2, %r1, 0, 8961;
+; CHECK:    st.param.b32    [func_retval0+0], %r2;
+
+define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
+  %b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
+  ret <2 x i16> %b
+}
+
+; CHECK-LABEL: @bswap64
+; CHECK:    ld.param.u64    %rd1, [bswap64_param_0];
+; CHECK:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK:    prmt.b32        %r2, %r1, 0, 291;
+; CHECK:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
+; CHECK:    prmt.b32        %r4, %r3, 0, 291;
+; CHECK:    mov.b64         %rd2, {%r4, %r2};
+; CHECK:    st.param.b64    [func_retval0+0], %rd2;
+
+define i64 @bswap64(i64 %a) {
+  %b = tail call i64 @llvm.bswap.i64(i64 %a)
+  ret i64 %b
+}
+
+declare i16 @llvm.bswap.i16(i16)
+declare i32 @llvm.bswap.i32(i32)
+declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
+declare i64 @llvm.bswap.i64(i64)

>From 775278434604f87c14c4a5f2c70c6c2946278759 Mon Sep 17 00:00:00 2001
From: Alex MacLean <amaclean at nvidia.com>
Date: Mon, 18 Mar 2024 20:13:31 +0000
Subject: [PATCH 2/2] address comments

---
 llvm/test/CodeGen/NVPTX/bswap.ll | 71 +++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index 7f65568c1d517b..3f929ec6a75d0a 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -1,49 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
-; CHECK-LABEL: @bswap16
-; CHECK:   ld.param.u16    %rs1, [bswap16_param_0];
-; CHECK:   shr.u16         %rs2, %rs1, 8;
-; CHECK:   shl.b16         %rs3, %rs1, 8;
-; CHECK:   or.b16          %rs4, %rs3, %rs2;
-; CHECK:   cvt.u32.u16     %r1, %rs4;
-; CHECK:   st.param.b32    [func_retval0+0], %r1;
+target triple = "nvptx64-nvidia-cuda"
 
 define i16 @bswap16(i16 %a) {
+; CHECK-LABEL: bswap16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %rs1, [bswap16_param_0];
+; CHECK-NEXT:    shr.u16 %rs2, %rs1, 8;
+; CHECK-NEXT:    shl.b16 %rs3, %rs1, 8;
+; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %b = tail call i16 @llvm.bswap.i16(i16 %a)
   ret i16 %b
 }
 
-; CHECK-LABEL: @bswap32
-; CHECK:    ld.param.u32    %r1, [bswap32_param_0];
-; CHECK:    prmt.b32        %r2, %r1, 0, 291;
-; CHECK:    st.param.b32    [func_retval0+0], %r2;
 
 define i32 @bswap32(i32 %a) {
+; CHECK-LABEL: bswap32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bswap32_param_0];
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
   %b = tail call i32 @llvm.bswap.i32(i32 %a)
   ret i32 %b
 }
 
-; CHECK-LABEL: @bswapv2i16
-; CHECK:    ld.param.u32    %r1, [bswapv2i16_param_0];
-; CHECK:    prmt.b32        %r2, %r1, 0, 8961;
-; CHECK:    st.param.b32    [func_retval0+0], %r2;
 
 define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
+; CHECK-LABEL: bswapv2i16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u32 %r1, [bswapv2i16_param_0];
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 8961;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
   %b = tail call <2 x i16> @llvm.bswap.v2i16(<2 x i16> %a)
   ret <2 x i16> %b
 }
 
-; CHECK-LABEL: @bswap64
-; CHECK:    ld.param.u64    %rd1, [bswap64_param_0];
-; CHECK:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
-; CHECK:    prmt.b32        %r2, %r1, 0, 291;
-; CHECK:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; CHECK:    prmt.b32        %r4, %r3, 0, 291;
-; CHECK:    mov.b64         %rd2, {%r4, %r2};
-; CHECK:    st.param.b64    [func_retval0+0], %rd2;
-
 define i64 @bswap64(i64 %a) {
+; CHECK-LABEL: bswap64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [bswap64_param_0];
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 291;
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
+; CHECK-NEXT:    prmt.b32 %r4, %r3, 0, 291;
+; CHECK-NEXT:    mov.b64 %rd2, {%r4, %r2};
+; CHECK-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; CHECK-NEXT:    ret;
   %b = tail call i64 @llvm.bswap.i64(i64 %a)
   ret i64 %b
 }



More information about the llvm-commits mailing list