[llvm] [NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling (PR #107655)

Mon Sep 23 14:39:39 PDT 2024

================
@@ -50,45 +53,36 @@ define i32 @rotate32(i32 %a, i32 %b) {
 define i64 @rotate64(i64 %a, i32 %b) {
 ; SM20-LABEL: rotate64(
 ; SM20:       {
-; SM20-NEXT:    .reg .b32 %r<2>;
-; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-NEXT:    .reg .b32 %r<5>;
+; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
 ; SM20-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
 ; SM20-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
-; SM20-NEXT:    {
-; SM20-NEXT:    .reg .b64 %lhs;
-; SM20-NEXT:    .reg .b64 %rhs;
-; SM20-NEXT:    .reg .u32 %amt2;
-; SM20-NEXT:    and.b32 %amt2, %r1, 63;
-; SM20-NEXT:    shl.b64 %lhs, %rd1, %amt2;
-; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
-; SM20-NEXT:    shr.b64 %rhs, %rd1, %amt2;
-; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT:    }
-; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    and.b32 %r2, %r1, 63;
+; SM20-NEXT:    shl.b64 %rd2, %rd1, %r2;
+; SM20-NEXT:    neg.s32 %r3, %r1;
+; SM20-NEXT:    and.b32 %r4, %r3, 63;
+; SM20-NEXT:    shr.u64 %rd3, %rd1, %r4;
+; SM20-NEXT:    or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd4;
 ; SM20-NEXT:    ret;
 ;
 ; SM35-LABEL: rotate64(
 ; SM35:       {
-; SM35-NEXT:    .reg .b32 %r<6>;
-; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-NEXT:    .reg .b32 %r<5>;
+; SM35-NEXT:    .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
 ; SM35-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
-; SM35-NEXT:    {
-; SM35-NEXT:    .reg .b32 %dummy;
-; SM35-NEXT:    mov.b64 {%dummy,%r1}, %rd1;
-; SM35-NEXT:    }
-; SM35-NEXT:    {
-; SM35-NEXT:    .reg .b32 %dummy;
-; SM35-NEXT:    mov.b64 {%r2,%dummy}, %rd1;
-; SM35-NEXT:    }
-; SM35-NEXT:    ld.param.u32 %r3, [rotate64_param_1];
-; SM35-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, %r3;
-; SM35-NEXT:    shf.l.wrap.b32 %r5, %r1, %r2, %r3;
-; SM35-NEXT:    mov.b64 %rd2, {%r5, %r4};
-; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
+; SM35-NEXT:    and.b32 %r2, %r1, 63;
+; SM35-NEXT:    shl.b64 %rd2, %rd1, %r2;
+; SM35-NEXT:    neg.s32 %r3, %r1;
+; SM35-NEXT:    and.b32 %r4, %r3, 63;
+; SM35-NEXT:    shr.u64 %rd3, %rd1, %r4;
+; SM35-NEXT:    or.b64 %rd4, %rd2, %rd3;
----------------
Artem-B wrote:

Ah, I see. I did not connect the IR->IR transformation in the alive example with the `shf.l.wrap` we used to generate.

The original code is indeed wrong. 32-bit rotation is not very useful for implementing 64-bit rotations, as it can only shift up to 32 bits. It should be doable, but is probably not worth it. Given that the bug has been around forever, and nobody stumbled onto it before you, it's probably not widely used.

If we were to custom-lower it, one way would be to swap the order of the destination operands when we split the i64 into two i32 halves, which acts as a partial rotation by 32, and then do the remaining rotation by (count-32) with the funnel shift.


https://github.com/llvm/llvm-project/pull/107655