[llvm] [NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling (PR #107655)

Alex MacLean via llvm-commits llvm-commits at lists.llvm.org
Mon Sep 23 13:38:56 PDT 2024


================
@@ -50,45 +53,36 @@ define i32 @rotate32(i32 %a, i32 %b) {
 define i64 @rotate64(i64 %a, i32 %b) {
 ; SM20-LABEL: rotate64(
 ; SM20:       {
-; SM20-NEXT:    .reg .b32 %r<2>;
-; SM20-NEXT:    .reg .b64 %rd<3>;
+; SM20-NEXT:    .reg .b32 %r<5>;
+; SM20-NEXT:    .reg .b64 %rd<5>;
 ; SM20-EMPTY:
 ; SM20-NEXT:  // %bb.0:
 ; SM20-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
 ; SM20-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
-; SM20-NEXT:    {
-; SM20-NEXT:    .reg .b64 %lhs;
-; SM20-NEXT:    .reg .b64 %rhs;
-; SM20-NEXT:    .reg .u32 %amt2;
-; SM20-NEXT:    and.b32 %amt2, %r1, 63;
-; SM20-NEXT:    shl.b64 %lhs, %rd1, %amt2;
-; SM20-NEXT:    sub.u32 %amt2, 64, %amt2;
-; SM20-NEXT:    shr.b64 %rhs, %rd1, %amt2;
-; SM20-NEXT:    add.u64 %rd2, %lhs, %rhs;
-; SM20-NEXT:    }
-; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT:    and.b32 %r2, %r1, 63;
+; SM20-NEXT:    shl.b64 %rd2, %rd1, %r2;
+; SM20-NEXT:    neg.s32 %r3, %r1;
+; SM20-NEXT:    and.b32 %r4, %r3, 63;
+; SM20-NEXT:    shr.u64 %rd3, %rd1, %r4;
+; SM20-NEXT:    or.b64 %rd4, %rd2, %rd3;
+; SM20-NEXT:    st.param.b64 [func_retval0+0], %rd4;
 ; SM20-NEXT:    ret;
 ;
 ; SM35-LABEL: rotate64(
 ; SM35:       {
-; SM35-NEXT:    .reg .b32 %r<6>;
-; SM35-NEXT:    .reg .b64 %rd<3>;
+; SM35-NEXT:    .reg .b32 %r<5>;
+; SM35-NEXT:    .reg .b64 %rd<5>;
 ; SM35-EMPTY:
 ; SM35-NEXT:  // %bb.0:
 ; SM35-NEXT:    ld.param.u64 %rd1, [rotate64_param_0];
-; SM35-NEXT:    {
-; SM35-NEXT:    .reg .b32 %dummy;
-; SM35-NEXT:    mov.b64 {%dummy,%r1}, %rd1;
-; SM35-NEXT:    }
-; SM35-NEXT:    {
-; SM35-NEXT:    .reg .b32 %dummy;
-; SM35-NEXT:    mov.b64 {%r2,%dummy}, %rd1;
-; SM35-NEXT:    }
-; SM35-NEXT:    ld.param.u32 %r3, [rotate64_param_1];
-; SM35-NEXT:    shf.l.wrap.b32 %r4, %r2, %r1, %r3;
-; SM35-NEXT:    shf.l.wrap.b32 %r5, %r1, %r2, %r3;
-; SM35-NEXT:    mov.b64 %rd2, {%r5, %r4};
-; SM35-NEXT:    st.param.b64 [func_retval0+0], %rd2;
+; SM35-NEXT:    ld.param.u32 %r1, [rotate64_param_1];
+; SM35-NEXT:    and.b32 %r2, %r1, 63;
+; SM35-NEXT:    shl.b64 %rd2, %rd1, %r2;
+; SM35-NEXT:    neg.s32 %r3, %r1;
+; SM35-NEXT:    and.b32 %r4, %r3, 63;
+; SM35-NEXT:    shr.u64 %rd3, %rd1, %r4;
+; SM35-NEXT:    or.b64 %rd4, %rd2, %rd3;
----------------
AlexMaclean wrote:

It is actually not correct to lower these 64-bit rotates with 32-bit funnel shifts. This is the correctness issue I fixed.

https://alive2.llvm.org/ce/z/RQVDDG shows counter-examples for this lowering. 

I think it would be safe to lower a 64-bit rotate with a constant rotation operand to two 32-bit funnel shifts, but I don't know whether it is worth handling this edge case. 

https://github.com/llvm/llvm-project/pull/107655


More information about the llvm-commits mailing list