[llvm] [NVPTX] Fix 64-bit rotations with large shift values (PR #89399)

Nicolas Miller via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 25 12:26:38 PDT 2024


https://github.com/npmiller updated https://github.com/llvm/llvm-project/pull/89399

From 38eca33f83ff793736330aeb85732c50ec9388e5 Mon Sep 17 00:00:00 2001
From: Nicolas Miller <nicolas.miller at codeplay.com>
Date: Fri, 19 Apr 2024 10:28:33 +0100
Subject: [PATCH] [NVPTX] Fix 64-bit rotations with large shift values

ROTL and ROTR can take a shift amount larger than the element size, in
which case the effective shift amount should be the shift amount modulo
the element size.

This patch adds the modulo step for the case where the shift amount
isn't known at compile time. Without it, the existing lowering would
shift beyond the type width and produce incorrect results.
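
For illustration, the masked rotate this patch emits behaves like the
following C sketch (the function name rotl64_sw and the main() driver
are made up for this example; they are not part of the patch):

  #include <stdint.h>
  #include <stdio.h>

  /* Mirrors the PTX sequence emitted by ROTL64reg_sw after this patch:
   * mask the amount into [0, 63] first, then combine the two partial
   * shifts. Without the mask, an amount of 64 or more shifts the value
   * entirely out of the register and the result is wrong. */
  static uint64_t rotl64_sw(uint64_t x, uint32_t amt) {
    uint32_t amt2 = amt & 63;         /* and.b32 %amt2, $amt, 63;   */
    if (amt2 == 0)                    /* guard needed only in C, where a */
      return x;                       /* shift by 64 bits is undefined   */
    uint64_t lhs = x << amt2;         /* shl.b64 %lhs, $src, %amt2; */
    uint64_t rhs = x >> (64 - amt2);  /* sub.u32 + shr.b64          */
    return lhs + rhs;                 /* add.u64: the two parts have no
                                         overlapping bits, so + acts as | */
  }

  int main(void) {
    uint64_t a = 0x0123456789abcdefULL;
    /* An amount of 66 must behave like 66 % 64 == 2. */
    printf("%d\n", rotl64_sw(a, 66) == rotl64_sw(a, 2)); /* prints 1 */
    return 0;
  }

The same reasoning applies to ROTR64reg_sw with the shift directions
swapped.
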
---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 10 +++---
 llvm/test/CodeGen/NVPTX/rotate.ll       | 43 +++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 897ee89323f083..142dd64ddea9dc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shl.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shl.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shr.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shr.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shr.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shl.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 9d058662c27174..39a7b652e8afd3 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -58,3 +58,46 @@ define i32 @rotl0(i32 %x) {
   %t2 = or i32 %t0, %t1
   ret i32 %t2
 }
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shl.b64
+; SM35: sub.u32
+; SM35: shr.b64
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 2;
+; SM35: shr.b64 {{.*}}, 62;
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shr.b64
+; SM35: sub.u32
+; SM35: shl.b64
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 62;
+; SM35: shr.b64 {{.*}}, 2;
+; SM35: add.u64
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}


