[llvm] 7396ab1 - [NVPTX] Fix 64-bit rotations with large shift values (#89399)

via llvm-commits <llvm-commits at lists.llvm.org>
Wed May 1 10:15:55 PDT 2024


Author: Nicolas Miller
Date: 2024-05-01T10:15:52-07:00
New Revision: 7396ab1210a2aeee6bab5b73ec6d02975ba51b93

URL: https://github.com/llvm/llvm-project/commit/7396ab1210a2aeee6bab5b73ec6d02975ba51b93
DIFF: https://github.com/llvm/llvm-project/commit/7396ab1210a2aeee6bab5b73ec6d02975ba51b93.diff

LOG: [NVPTX] Fix 64-bit rotations with large shift values (#89399)

ROTL and ROTR can take a shift amount larger than the element size, in
which case the effective shift amount should be the shift amount modulo
the element size.

This patch adds the modulo step for the case where the shift amount isn't
known at compile time. Without it, the existing implementation would shift
beyond the type size and produce incorrect results.
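
For illustration only (not part of this commit), here is a minimal C++
sketch of the intended rotate semantics; the helper names are hypothetical.
Portable C++ has to fold the complementary shift amount back into [0, 63]
to avoid undefined behavior, whereas PTX shifts clamp out-of-range amounts,
so the emitted sequence can use "sub.u32 %amt2, 64, %amt2" directly:

    #include <cassert>
    #include <cstdint>

    // The effective shift amount for a 64-bit rotate is the requested
    // amount modulo 64 -- the "and ... 63" step this patch adds.
    static uint64_t rotl64(uint64_t x, uint32_t n) {
      n &= 63;
      // "-n & 63" keeps the complementary shift in [0, 63]; shifting a
      // 64-bit value by 64 is UB in C++ (PTX clamps it instead).
      return (x << n) | (x >> (-n & 63));
    }

    static uint64_t rotr64(uint64_t x, uint32_t n) {
      n &= 63;
      return (x >> n) | (x << (-n & 63));
    }

    int main() {
      uint64_t v = 0x0123456789abcdefULL;
      // An amount of 65 must act like 65 % 64 == 1 -- the case the old
      // lowering got wrong.
      assert(rotl64(v, 65) == rotl64(v, 1));
      assert(rotr64(v, 65) == rotr64(v, 1));
      assert(rotl64(v, 64) == v); // amount 0 mod 64 is the identity
      return 0;
    }

The PTX sequence combines the two halves with add.u64 where this sketch
uses a bitwise or; the partial results occupy disjoint bits, so the two
are equivalent.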

Added: 
    

Modified: 
    llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
    llvm/test/CodeGen/NVPTX/rotate.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 897ee89323f083..142dd64ddea9dc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1752,8 +1752,9 @@ def ROTL64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shl.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shl.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shr.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",
@@ -1765,8 +1766,9 @@ def ROTR64reg_sw :
             ".reg .b64 %lhs;\n\t"
             ".reg .b64 %rhs;\n\t"
             ".reg .u32 %amt2;\n\t"
-            "shr.b64 \t%lhs, $src, $amt;\n\t"
-            "sub.u32 \t%amt2, 64, $amt;\n\t"
+            "and.b32 \t%amt2, $amt, 63;\n\t"
+            "shr.b64 \t%lhs, $src, %amt2;\n\t"
+            "sub.u32 \t%amt2, 64, %amt2;\n\t"
             "shl.b64 \t%rhs, $src, %amt2;\n\t"
             "add.u64 \t$dst, %lhs, %rhs;\n\t"
             "}}",

diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll
index 9d058662c27174..20c7ae5908d29f 100644
--- a/llvm/test/CodeGen/NVPTX/rotate.ll
+++ b/llvm/test/CodeGen/NVPTX/rotate.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_20 | FileCheck --check-prefix=SM20 %s
+; RUN: llc < %s --mtriple=nvptx64 -mcpu=sm_35 | FileCheck --check-prefix=SM35 %s
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+; RUN: %if ptxas %{ llc < %s --mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 
 declare i32 @llvm.nvvm.rotate.b32(i32, i32)
@@ -11,11 +12,35 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
 ; SM20: rotate32
 ; SM35: rotate32
 define i32 @rotate32(i32 %a, i32 %b) {
-; SM20: shl.b32
-; SM20: sub.s32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate32(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<4>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u32 %r1, [rotate32_param_0];
+; SM20-NEXT:    ld.param.u32 %r2, [rotate32_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b32 %lhs;
+; SM20-NEXT:    .reg .b32 %rhs;
+; SM20-NEXT:    .reg .b32 %amt2;
+; SM20-NEXT:    shl.b32 %lhs, %r1, %r2;
+; SM20-NEXT:    sub.s32 %amt2, 32, %r2;
+; SM20-NEXT:    shr.b32 %rhs, %r1, %amt2;
+; SM20-NEXT:    add.u32 %r3, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotate32(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<4>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u32 %r1, [rotate32_param_0];
+; SM35-NEXT:    ld.param.u32 %r2, [rotate32_param_1];
+; SM35-NEXT:    shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; SM35-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; SM35-NEXT:    ret;
   %val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
   ret i32 %val
 }
@@ -23,12 +48,48 @@ define i32 @rotate32(i32 %a, i32 %b) {
 ; SM20: rotate64
 ; SM35: rotate64
 define i64 @rotate64(i64 %a, i32 %b) {
-; SM20: shl.b64
-; SM20: sub.u32
-; SM20: shr.b64
-; SM20: add.u64
-; SM35: shf.l.wrap.b32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotate64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<6>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%dummy,%r1}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%r2,%dummy}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    ld.param.u32 %r3, [rotate64_param_1];
+; SM35-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT:    shf.l.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT:    mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
   %val = tail call i64 @llvm.nvvm.rotate.b64(i64 %a, i32 %b)
   ret i64 %val
 }
@@ -36,12 +97,48 @@ define i64 @rotate64(i64 %a, i32 %b) {
 ; SM20: rotateright64
 ; SM35: rotateright64
 define i64 @rotateright64(i64 %a, i32 %b) {
-; SM20: shr.b64
-; SM20: sub.u32
-; SM20: shl.b64
-; SM20: add.u64
-; SM35: shf.r.wrap.b32
-; SM35: shf.r.wrap.b32
+; SM20-LABEL: rotateright64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotateright64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotateright64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotateright64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<6>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotateright64_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%r1,%dummy}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b32 %dummy;
+; SM35-NEXT:    mov.b64 {%dummy,%r2}, %rd1;
+; SM35-NEXT:    }
+; SM35-NEXT:    ld.param.u32 %r3, [rotateright64_param_1];
+; SM35-NEXT:    shf.r.wrap.b32 %r4, %r2, %r1, %r3;
+; SM35-NEXT:    shf.r.wrap.b32 %r5, %r1, %r2, %r3;
+; SM35-NEXT:    mov.b64 %rd2, {%r5, %r4};
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
   %val = tail call i64 @llvm.nvvm.rotate.right.b64(i64 %a, i32 %b)
   ret i64 %val
 }
@@ -49,12 +146,204 @@ define i64 @rotateright64(i64 %a, i32 %b) {
 ; SM20: rotl0
 ; SM35: rotl0
 define i32 @rotl0(i32 %x) {
-; SM20: shl.b32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotl0(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u32 %r1, [rotl0_param_0];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b32 %lhs;
+; SM20-NEXT:    .reg .b32 %rhs;
+; SM20-NEXT:    shl.b32 %lhs, %r1, 8;
+; SM20-NEXT:    shr.b32 %rhs, %r1, 24;
+; SM20-NEXT:    add.u32 %r2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotl0(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u32 %r1, [rotl0_param_0];
+; SM35-NEXT:    shf.l.wrap.b32 %r2, %r1, %r1, 8;
+; SM35-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; SM35-NEXT:    ret;
   %t0 = shl i32 %x, 8
   %t1 = lshr i32 %x, 24
   %t2 = or i32 %t0, %t1
   ret i32 %t2
 }
+
+declare i64 @llvm.fshl.i64(i64, i64, i64)
+declare i64 @llvm.fshr.i64(i64, i64, i64)
+
+; SM35: rotl64
+define i64 @rotl64(i64 %a, i64 %n) {
+; SM20-LABEL: rotl64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotl64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotl64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<2>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_param_0];
+; SM35-NEXT:    ld.param.u32 %r1, [rotl64_param_1];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    .reg .u32 %amt2;
+; SM35-NEXT:    and.b32 %amt2, %r1, 63;
+; SM35-NEXT:    shl.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT:    shr.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotl64_imm
+define i64 @rotl64_imm(i64 %a) {
+; SM20-LABEL: rotl64_imm(
+; SM20:       {
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, 2;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, 62;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotl64_imm(
+; SM35:       {
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotl64_imm_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    shl.b64 %lhs, %rd1, 2;
+; SM35-NEXT:    shr.b64 %rhs, %rd1, 62;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}
+
+; SM35: rotr64
+define i64 @rotr64(i64 %a, i64 %n) {
+; SM20-LABEL: rotr64(
+; SM20:       {
+; SM20-NEXT:    .reg .b32 %r<2>;
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_param_0];
+; SM20-NEXT:    ld.param.u32 %r1, [rotr64_param_1];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    .reg .u32 %amt2;
+; SM20-NEXT:    and.b32 %amt2, %r1, 63;
+; SM20-NEXT:    shr.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT:    shl.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotr64(
+; SM35:       {
+; SM35-NEXT:    .reg .b32 %r<2>;
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_param_0];
+; SM35-NEXT:    ld.param.u32 %r1, [rotr64_param_1];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    .reg .u32 %amt2;
+; SM35-NEXT:    and.b32 %amt2, %r1, 63;
+; SM35-NEXT:    shr.b64 %lhs, %rd1, %amt2;
+; SM35-NEXT:    sub.u32 %amt2, 64, %amt2;
+; SM35-NEXT:    shl.b64 %rhs, %rd1, %amt2;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %n)
+  ret i64 %val
+}
+
+; SM35: rotr64_imm
+define i64 @rotr64_imm(i64 %a) {
+; SM20-LABEL: rotr64_imm(
+; SM20:       {
+; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT:  // %bb.0:
+; SM20-NEXT:    ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM20-NEXT:    {
+; SM20-NEXT:    .reg .b64 %lhs;
+; SM20-NEXT:    .reg .b64 %rhs;
+; SM20-NEXT:    shl.b64 %lhs, %rd1, 62;
+; SM20-NEXT:    shr.b64 %rhs, %rd1, 2;
+; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT:    }
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    ret;
+;
+; SM35-LABEL: rotr64_imm(
+; SM35:       {
+; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT:  // %bb.0:
+; SM35-NEXT:    ld.param.u64 %rd1, [rotr64_imm_param_0];
+; SM35-NEXT:    {
+; SM35-NEXT:    .reg .b64 %lhs;
+; SM35-NEXT:    .reg .b64 %rhs;
+; SM35-NEXT:    shl.b64 %lhs, %rd1, 62;
+; SM35-NEXT:    shr.b64 %rhs, %rd1, 2;
+; SM35-NEXT:    add.u64 %rd2, %lhs, %rhs;
+; SM35-NEXT:    }
+; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ret;
+  %val = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 66)
+  ret i64 %val
+}