[llvm] [NVPTX] Fix 64-bit rotations with large shift values (PR #89399)
Nicolas Miller via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 19 08:14:55 PDT 2024
https://github.com/npmiller created https://github.com/llvm/llvm-project/pull/89399
ROTL and ROTR can take a shift amount larger than the element size, in which case the effective shift amount should be the shift amount modulo the element size.
This patch adds the modulo step when the shift amount isn't known at compile time. Without it, the existing lowering would shift beyond the element size and produce incorrect results.
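For reference, a minimal C model of what the fixed sequence computes (this is just the standard masked-rotate idiom, not the exact emitted PTX, which uses sub.u32 and relies on PTX clamping over-wide shifts):

#include <stdint.h>

/* The effective shift amount is amt mod 64; the complementary shift
   is also reduced mod 64 so neither shift reaches the register width
   (a 64-bit shift by 64 or more is undefined in C). */
static uint64_t rotl64(uint64_t x, uint32_t amt) {
  return (x << (amt & 63)) | (x >> ((64 - amt) & 63));
}

ROTR is symmetric, with the shift directions swapped.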
cc @Artem-B
From db43ed43c6305f5673de9c99e0c5d606fc62b9c1 Mon Sep 17 00:00:00 2001
From: Nicolas Miller <nicolas.miller at codeplay.com>
Date: Fri, 19 Apr 2024 10:28:33 +0100
Subject: [PATCH] [NVPTX] Fix 64-bit rotations with large shift values
ROTL and ROTR can take a shift amount larger than the element size, in
which case the effective shift amount should be the shift amount modulo
the element size.
This patch adds the modulo step when the shift amount isn't known at
compile time. Without it, the existing lowering would shift beyond
the element size and produce incorrect results.
---
llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 10 +++---
llvm/test/CodeGen/NVPTX/rotate.ll | 43 +++++++++++++++++++++++++
2 files changed, 49 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index cd8546005c0289..624a5f61abb784 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1747,8 +1747,9 @@ def ROTL64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
- "shl.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "and.b32 \t%amt2, $amt, 63;\n\t"
+ "shl.b64 \t%lhs, $src, %amt2;\n\t"
+ "sub.u32 \t%amt2, 64, %amt2;\n\t"
"shr.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
@@ -1760,8 +1761,9 @@ def ROTR64reg_sw :
".reg .b64 %lhs;\n\t"
".reg .b64 %rhs;\n\t"
".reg .u32 %amt2;\n\t"
- "shr.b64 \t%lhs, $src, $amt;\n\t"
- "sub.u32 \t%amt2, 64, $amt;\n\t"
+ "and.b32 \t%amt2, $amt, 63;\n\t"
+ "shr.b64 \t%lhs, $src, %amt2;\n\t"
+ "sub.u32 \t%amt2, 64, %amt2;\n\t"
"shl.b64 \t%rhs, $src, %amt2;\n\t"
"add.u64 \t$dst, %lhs, %rhs;\n\t"
"}}",
diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 9d058662c27174..39a7b652e8afd3 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -58,3 +58,46 @@ define i32 @rotl0(i32 %x) {
%t2 = or i32 %t0, %t1
ret i32 %t2
}
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shl.b64
+; SM35: sub.u32
+; SM35: shr.b64
+; SM35: add.u64
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+ ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 2;
+; SM35: shr.b64 {{.*}}, 62;
+; SM35: add.u64
+ %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+ ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM35: and.b32 {{.*}}, 63;
+; SM35: shr.b64
+; SM35: sub.u32
+; SM35: shl.b64
+; SM35: add.u64
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+ ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM35: shl.b64 {{.*}}, 62;
+; SM35: shr.b64 {{.*}}, 2;
+; SM35: add.u64
+ %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+ ret i64 %val
+}
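For completeness, a sketch of why the unmasked sequence went wrong, assuming PTX's documented behavior of clamping shift amounts larger than the register width (so an over-wide b64 shift yields 0). Note that the tests use llvm.fshl/llvm.fshr with both value operands equal, which LLVM defines as a rotate. The helper names here (ptx_shl64, rotl64_old, rotl64_new) are hypothetical, purely for illustration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Model PTX shl.b64/shr.b64: shift amounts >= 64 are clamped, so the
   result is 0 (unlike C, where such shifts are undefined). */
static uint64_t ptx_shl64(uint64_t x, uint32_t amt) {
  return amt >= 64 ? 0 : x << amt;
}
static uint64_t ptx_shr64(uint64_t x, uint32_t amt) {
  return amt >= 64 ? 0 : x >> amt;
}

/* Old sequence: no masking of the shift amount. */
static uint64_t rotl64_old(uint64_t x, uint32_t amt) {
  uint64_t lhs = ptx_shl64(x, amt);   /* shl.b64 %lhs, $src, $amt  */
  uint32_t amt2 = 64 - amt;           /* sub.u32 %amt2, 64, $amt   */
  uint64_t rhs = ptx_shr64(x, amt2);  /* shr.b64 %rhs, $src, %amt2 */
  return lhs + rhs;                   /* add.u64 $dst, %lhs, %rhs  */
}

/* Fixed sequence: reduce the amount mod 64 first. */
static uint64_t rotl64_new(uint64_t x, uint32_t amt) {
  uint32_t amt2 = amt & 63;           /* and.b32 %amt2, $amt, 63   */
  uint64_t lhs = ptx_shl64(x, amt2);  /* shl.b64 %lhs, $src, %amt2 */
  amt2 = 64 - amt2;                   /* sub.u32 %amt2, 64, %amt2  */
  uint64_t rhs = ptx_shr64(x, amt2);  /* shr.b64 %rhs, $src, %amt2 */
  return lhs + rhs;                   /* add.u64 $dst, %lhs, %rhs  */
}

int main(void) {
  uint64_t x = 0x0123456789abcdefULL;
  /* amt = 66: the old sequence clamps both shifts (66, and 64 - 66
     wrapped to 4294967294) and returns 0; the fixed one rotates by
     66 mod 64 = 2, matching the folded constants (shl by 2, shr by
     62) checked in rotl64_imm above. */
  printf("old: %016" PRIx64 "\n", rotl64_old(x, 66));
  printf("new: %016" PRIx64 "\n", rotl64_new(x, 66));
  return 0;
}

Note that once the amount is masked, the two partial results occupy disjoint bit ranges, which is why the sequence can combine them with add.u64 rather than or.b64.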