[llvm] [NVPTX] Fix 64-bit rotations with large shift values (PR #89399)
Artem Belevich via llvm-commits
llvm-commits at lists.llvm.org
Wed May 1 10:15:09 PDT 2024
================
@@ -11,50 +12,338 @@ declare i64 @llvm.nvvm.rotate.right.b64(i64, i32)
; SM20: rotate32
; SM35: rotate32
define i32 @rotate32(i32 %a, i32 %b) {
-; SM20: shl.b32
-; SM20: sub.s32
-; SM20: shr.b32
-; SM20: add.u32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate32(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<4>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b32 %lhs;
+; SM20-NEXT: .reg .b32 %rhs;
+; SM20-NEXT: .reg .b32 %amt2;
+; SM20-NEXT: shl.b32 %lhs, %r1, %r2;
+; SM20-NEXT: sub.s32 %amt2, 32, %r2;
+; SM20-NEXT: shr.b32 %rhs, %r1, %amt2;
+; SM20-NEXT: add.u32 %r3, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate32(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<4>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0];
+; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1];
+; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2;
+; SM35-NEXT: st.param.b32 [func_retval0+0], %r3;
+; SM35-NEXT: ret;
%val = tail call i32 @llvm.nvvm.rotate.b32(i32 %a, i32 %b)
ret i32 %val
}
; SM20: rotate64
; SM35: rotate64
define i64 @rotate64(i64 %a, i32 %b) {
-; SM20: shl.b64
-; SM20: sub.u32
-; SM20: shr.b64
-; SM20: add.u64
-; SM35: shf.l.wrap.b32
-; SM35: shf.l.wrap.b32
+; SM20-LABEL: rotate64(
+; SM20: {
+; SM20-NEXT: .reg .b32 %r<2>;
+; SM20-NEXT: .reg .b64 %rd<3>;
+; SM20-EMPTY:
+; SM20-NEXT: // %bb.0:
+; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1];
+; SM20-NEXT: {
+; SM20-NEXT: .reg .b64 %lhs;
+; SM20-NEXT: .reg .b64 %rhs;
+; SM20-NEXT: .reg .u32 %amt2;
+; SM20-NEXT: and.b32 %amt2, %r1, 63;
+; SM20-NEXT: shl.b64 %lhs, %rd1, %amt2;
+; SM20-NEXT: sub.u32 %amt2, 64, %amt2;
+; SM20-NEXT: shr.b64 %rhs, %rd1, %amt2;
+; SM20-NEXT: add.u64 %rd2, %lhs, %rhs;
+; SM20-NEXT: }
+; SM20-NEXT: st.param.b64 [func_retval0+0], %rd2;
+; SM20-NEXT: ret;
+;
+; SM35-LABEL: rotate64(
+; SM35: {
+; SM35-NEXT: .reg .b32 %r<6>;
+; SM35-NEXT: .reg .b64 %rd<3>;
+; SM35-EMPTY:
+; SM35-NEXT: // %bb.0:
+; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0];
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%dummy,%r1}, %rd1;
+; SM35-NEXT: }
+; SM35-NEXT: {
+; SM35-NEXT: .reg .b32 %dummy;
+; SM35-NEXT: mov.b64 {%r2,%dummy}, %rd1;
+; SM35-NEXT: }
----------------
Artem-B wrote:
Looks like a minor optimization opportunity for the future -- the two unpacks above, each of which discards one half of `%rd1` into `%dummy`, could be done with a single `mov.b64 {%r2, %r1}, %rd1`.
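For reference, a minimal sketch of the suggested form next to the currently emitted sequence, reusing the register names from the quoted output above (the per-line comments are illustrative, not part of the generated code):

```
// current: two separate unpacks, each discarding one half
{
.reg .b32 %dummy;
mov.b64 {%dummy,%r1}, %rd1;  // %r1 = high 32 bits of %rd1
}
{
.reg .b32 %dummy;
mov.b64 {%r2,%dummy}, %rd1;  // %r2 = low 32 bits of %rd1
}

// suggested: one unpack yields both halves at once
mov.b64 {%r2, %r1}, %rd1;    // %r2 = low bits, %r1 = high bits
```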
https://github.com/llvm/llvm-project/pull/89399