[llvm] [LoongArch] Switch to the Machine Scheduler (PR #83759)

via llvm-commits llvm-commits at lists.llvm.org
Sun Mar 3 19:49:34 PST 2024


llvmbot wrote:



@llvm/pr-subscribers-backend-loongarch

Author: wanglei (wangleiat)

Changes:

The SelectionDAG scheduling preference now becomes source-order scheduling (the machine scheduler generates better code, even though no machine model has been defined for LoongArch yet).
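
For readers less familiar with the scheduling hooks, the sketch below is a small self-contained model, not the LLVM sources, of the effect described above: once the subtarget's `enableMachineScheduler()` hook returns true, the default pre-RA SelectionDAG scheduler choice becomes plain source-order list scheduling, and the latency/pressure-aware ordering is left to the MachineScheduler running later on MachineInstrs. All type and function names in the sketch are placeholders.

```cpp
// Self-contained model (placeholder names, not the LLVM sources) of the
// scheduler-selection effect described above.
#include <iostream>

enum class SchedPreference { Source, RegPressure, Hybrid };

struct SubtargetModel {
  bool EnableMachineScheduler;   // the hook this patch flips to true
  SchedPreference DAGPreference; // the TargetLowering scheduling preference
};

// Approximates how the default pre-RA SelectionDAG scheduler is chosen.
const char *pickPreRASchedulerFor(const SubtargetModel &ST, bool OptNone) {
  if (OptNone || ST.EnableMachineScheduler ||
      ST.DAGPreference == SchedPreference::Source)
    return "source-order list scheduler"; // keep instructions in IR order
  if (ST.DAGPreference == SchedPreference::RegPressure)
    return "register-pressure reducing list scheduler";
  return "hybrid latency/pressure list scheduler";
}

int main() {
  SubtargetModel Before{/*EnableMachineScheduler=*/false,
                        SchedPreference::RegPressure};
  SubtargetModel After{/*EnableMachineScheduler=*/true,
                       SchedPreference::RegPressure};
  std::cout << "before this patch: " << pickPreRASchedulerFor(Before, false) << '\n';
  std::cout << "after this patch:  " << pickPreRASchedulerFor(After, false) << '\n';
}
```

In the actual tree the opt-in is just the one-line `enableMachineScheduler()` override shown in the diff below; a LoongArch `SchedMachineModel` could presumably be added later so the MachineScheduler schedules against real latencies instead of generic heuristics.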

Most of the test changes are trivial instruction reorderings and differing register allocations, without any obvious performance impact.

This is similar to commit 3d0fbafd0bce43bb9106230a45d1130f7a40e5ec.

---

Patch is 785.20 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/83759.diff


95 Files Affected:

- (modified) llvm/lib/Target/LoongArch/LoongArchSubtarget.h (+1) 
- (modified) llvm/test/CodeGen/LoongArch/alloca.ll (+4-4) 
- (modified) llvm/test/CodeGen/LoongArch/alsl.ll (+8-8) 
- (modified) llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll (+108-108) 
- (modified) llvm/test/CodeGen/LoongArch/bitreverse.ll (+23-23) 
- (modified) llvm/test/CodeGen/LoongArch/branch-relaxation.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/bswap-bitreverse.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/bswap.ll (+20-20) 
- (modified) llvm/test/CodeGen/LoongArch/bytepick.ll (+6-9) 
- (modified) llvm/test/CodeGen/LoongArch/calling-conv-common.ll (+68-68) 
- (modified) llvm/test/CodeGen/LoongArch/calling-conv-lp64d.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/calling-conv-lp64s.ll (+2-2) 
- (modified) llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/cfr-pseudo-copy.mir (+2-2) 
- (modified) llvm/test/CodeGen/LoongArch/ctlz-cttz-ctpop.ll (+49-49) 
- (modified) llvm/test/CodeGen/LoongArch/fcopysign.ll (+4-4) 
- (modified) llvm/test/CodeGen/LoongArch/get-setcc-result-type.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/ghc-cc.ll (+48-48) 
- (modified) llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll (+8-8) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/and.ll (+20-20) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/ashr.ll (+2-2) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll (+105-105) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-fp.ll (+398-338) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw-minmax.ll (+260-260) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/atomicrmw.ll (+735-735) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/double-convert.ll (+16-16) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/float-convert.ll (+23-23) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/load-store.ll (+754-370) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/lshr.ll (+2-2) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/mul.ll (+52-51) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/shl.ll (+2-2) 
- (modified) llvm/test/CodeGen/LoongArch/ir-instruction/sub.ll (+2-2) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/build-vector.ll (+44-44) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/fma-v4f64.ll (+224-224) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/fma-v8f32.ll (+224-224) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/add.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/and.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/ashr.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fadd.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fcmp.ll (+84-84) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fdiv.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fmul.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fsub.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/icmp.ll (+72-72) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll (+8-8) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/lshr.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/mul.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/or.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sdiv.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/shl.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sub.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/udiv.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/xor.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/mulh.ll (+24-24) 
- (modified) llvm/test/CodeGen/LoongArch/lasx/vselect.ll (+11-11) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/build-vector.ll (+20-20) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/fma-v2f64.ll (+224-224) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/fma-v4f32.ll (+224-224) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/add.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/and.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/ashr.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/extractelement.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fadd.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fcmp.ll (+84-84) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fdiv.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fmul.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fsub.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/icmp.ll (+72-72) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insertelement.ll (+8-8) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/lshr.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/mul.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/or.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sdiv.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/shl.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sub.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/udiv.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/xor.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/mulh.ll (+24-24) 
- (modified) llvm/test/CodeGen/LoongArch/lsx/vselect.ll (+11-11) 
- (modified) llvm/test/CodeGen/LoongArch/preferred-alignments.ll (+8-8) 
- (modified) llvm/test/CodeGen/LoongArch/rotl-rotr.ll (+192-196) 
- (modified) llvm/test/CodeGen/LoongArch/select-to-shiftand.ll (+2-2) 
- (modified) llvm/test/CodeGen/LoongArch/shift-masked-shamt.ll (+18-18) 
- (modified) llvm/test/CodeGen/LoongArch/shrinkwrap.ll (+1-3) 
- (modified) llvm/test/CodeGen/LoongArch/smul-with-overflow.ll (+359-347) 
- (modified) llvm/test/CodeGen/LoongArch/soft-fp-to-int.ll (+16-16) 
- (modified) llvm/test/CodeGen/LoongArch/spill-ra-without-kill.ll (+6-6) 
- (modified) llvm/test/CodeGen/LoongArch/spill-reload-cfr.ll (+24-38) 
- (modified) llvm/test/CodeGen/LoongArch/tail-calls.ll (+10-8) 
- (modified) llvm/test/CodeGen/LoongArch/unaligned-access.ll (+12-12) 
- (modified) llvm/test/CodeGen/LoongArch/vararg.ll (+10-10) 
- (modified) llvm/test/CodeGen/LoongArch/vector-fp-imm.ll (+726-599) 
- (modified) llvm/test/CodeGen/LoongArch/zext-with-load-is-free.ll (+4-4) 
- (modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.generated.expected (+16-17) 
- (modified) llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/loongarch_generated_funcs.ll.nogenerated.expected (+16-17) 


``````````diff
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index 11c0b39e176e61..cecb4a50aa7633 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -113,6 +113,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   Align getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
   Align getPrefLoopAlignment() const { return PrefLoopAlignment; }
   unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; }
+  bool enableMachineScheduler() const override { return true; }
 };
 } // end namespace llvm
 
diff --git a/llvm/test/CodeGen/LoongArch/alloca.ll b/llvm/test/CodeGen/LoongArch/alloca.ll
index d766be6aac9509..75a05689e4178d 100644
--- a/llvm/test/CodeGen/LoongArch/alloca.ll
+++ b/llvm/test/CodeGen/LoongArch/alloca.ll
@@ -126,8 +126,7 @@ define void @alloca_callframe(i32 %n) nounwind {
 ; LA32-NEXT:    st.w $a1, $sp, 8
 ; LA32-NEXT:    ori $a1, $zero, 10
 ; LA32-NEXT:    st.w $a1, $sp, 4
-; LA32-NEXT:    ori $a1, $zero, 9
-; LA32-NEXT:    st.w $a1, $sp, 0
+; LA32-NEXT:    ori $t0, $zero, 9
 ; LA32-NEXT:    ori $a1, $zero, 2
 ; LA32-NEXT:    ori $a2, $zero, 3
 ; LA32-NEXT:    ori $a3, $zero, 4
@@ -135,6 +134,7 @@ define void @alloca_callframe(i32 %n) nounwind {
 ; LA32-NEXT:    ori $a5, $zero, 6
 ; LA32-NEXT:    ori $a6, $zero, 7
 ; LA32-NEXT:    ori $a7, $zero, 8
+; LA32-NEXT:    st.w $t0, $sp, 0
 ; LA32-NEXT:    bl %plt(func)
 ; LA32-NEXT:    addi.w $sp, $sp, 16
 ; LA32-NEXT:    addi.w $sp, $fp, -16
@@ -162,8 +162,7 @@ define void @alloca_callframe(i32 %n) nounwind {
 ; LA64-NEXT:    st.d $a1, $sp, 16
 ; LA64-NEXT:    ori $a1, $zero, 10
 ; LA64-NEXT:    st.d $a1, $sp, 8
-; LA64-NEXT:    ori $a1, $zero, 9
-; LA64-NEXT:    st.d $a1, $sp, 0
+; LA64-NEXT:    ori $t0, $zero, 9
 ; LA64-NEXT:    ori $a1, $zero, 2
 ; LA64-NEXT:    ori $a2, $zero, 3
 ; LA64-NEXT:    ori $a3, $zero, 4
@@ -171,6 +170,7 @@ define void @alloca_callframe(i32 %n) nounwind {
 ; LA64-NEXT:    ori $a5, $zero, 6
 ; LA64-NEXT:    ori $a6, $zero, 7
 ; LA64-NEXT:    ori $a7, $zero, 8
+; LA64-NEXT:    st.d $t0, $sp, 0
 ; LA64-NEXT:    bl %plt(func)
 ; LA64-NEXT:    addi.d $sp, $sp, 32
 ; LA64-NEXT:    addi.d $sp, $fp, -16
diff --git a/llvm/test/CodeGen/LoongArch/alsl.ll b/llvm/test/CodeGen/LoongArch/alsl.ll
index 650f504dcaf83a..177e37de0952d7 100644
--- a/llvm/test/CodeGen/LoongArch/alsl.ll
+++ b/llvm/test/CodeGen/LoongArch/alsl.ll
@@ -53,12 +53,12 @@ entry:
 define i64 @alsl_i64(i64 signext %a, i64 signext %b) nounwind {
 ; LA32-LABEL: alsl_i64:
 ; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    slli.w $a1, $a1, 4
 ; LA32-NEXT:    srli.w $a4, $a0, 28
+; LA32-NEXT:    slli.w $a1, $a1, 4
 ; LA32-NEXT:    or $a1, $a1, $a4
-; LA32-NEXT:    add.w $a1, $a3, $a1
 ; LA32-NEXT:    alsl.w $a0, $a0, $a2, 4
 ; LA32-NEXT:    sltu $a2, $a0, $a2
+; LA32-NEXT:    add.w $a1, $a3, $a1
 ; LA32-NEXT:    add.w $a1, $a1, $a2
 ; LA32-NEXT:    ret
 ;
@@ -189,14 +189,14 @@ entry:
 define i64 @mul_add_i64(i64 signext %a, i64 signext %b) nounwind {
 ; LA32-LABEL: mul_add_i64:
 ; LA32:       # %bb.0: # %entry
-; LA32-NEXT:    slli.w $a4, $a1, 4
-; LA32-NEXT:    sub.w $a1, $a4, $a1
 ; LA32-NEXT:    ori $a4, $zero, 15
 ; LA32-NEXT:    mulh.wu $a4, $a0, $a4
+; LA32-NEXT:    slli.w $a5, $a1, 4
+; LA32-NEXT:    sub.w $a1, $a5, $a1
 ; LA32-NEXT:    add.w $a1, $a4, $a1
+; LA32-NEXT:    slli.w $a4, $a0, 4
+; LA32-NEXT:    sub.w $a0, $a4, $a0
 ; LA32-NEXT:    add.w $a1, $a3, $a1
-; LA32-NEXT:    slli.w $a3, $a0, 4
-; LA32-NEXT:    sub.w $a0, $a3, $a0
 ; LA32-NEXT:    add.w $a0, $a2, $a0
 ; LA32-NEXT:    sltu $a2, $a0, $a2
 ; LA32-NEXT:    add.w $a1, $a1, $a2
@@ -342,9 +342,9 @@ define i64 @mul_add_neg_i64(i64 signext %a, i64 signext %b) nounwind {
 ; LA32-NEXT:    mulh.wu $a4, $a0, $a4
 ; LA32-NEXT:    sub.w $a4, $a4, $a0
 ; LA32-NEXT:    add.w $a1, $a4, $a1
+; LA32-NEXT:    slli.w $a4, $a0, 4
+; LA32-NEXT:    sub.w $a0, $a0, $a4
 ; LA32-NEXT:    add.w $a1, $a3, $a1
-; LA32-NEXT:    slli.w $a3, $a0, 4
-; LA32-NEXT:    sub.w $a0, $a0, $a3
 ; LA32-NEXT:    add.w $a0, $a2, $a0
 ; LA32-NEXT:    sltu $a2, $a0, $a2
 ; LA32-NEXT:    add.w $a1, $a1, $a2
diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
index b84c1093eb75f2..bf48c0df3e4961 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
@@ -4,34 +4,34 @@
 define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-LABEL: atomicrmw_uinc_wrap_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a3, $a0, 3
+; LA64-NEXT:    slli.d $a2, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    ld.w $a2, $a0, 0
-; LA64-NEXT:    ori $a4, $zero, 255
-; LA64-NEXT:    sll.w $a4, $a4, $a3
-; LA64-NEXT:    andi $a3, $a3, 24
+; LA64-NEXT:    ori $a3, $zero, 255
+; LA64-NEXT:    sll.w $a4, $a3, $a2
+; LA64-NEXT:    ld.w $a3, $a0, 0
+; LA64-NEXT:    andi $a2, $a2, 24
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    andi $a1, $a1, 255
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB0_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB0_3 Depth 2
-; LA64-NEXT:    srl.w $a5, $a2, $a3
-; LA64-NEXT:    andi $a6, $a5, 255
-; LA64-NEXT:    sltu $a6, $a6, $a1
+; LA64-NEXT:    srl.w $a5, $a3, $a2
+; LA64-NEXT:    addi.w $a6, $a3, 0
+; LA64-NEXT:    andi $a7, $a5, 255
 ; LA64-NEXT:    addi.d $a5, $a5, 1
-; LA64-NEXT:    xori $a6, $a6, 1
-; LA64-NEXT:    masknez $a5, $a5, $a6
+; LA64-NEXT:    sltu $a7, $a7, $a1
+; LA64-NEXT:    xori $a7, $a7, 1
+; LA64-NEXT:    masknez $a5, $a5, $a7
 ; LA64-NEXT:    andi $a5, $a5, 255
-; LA64-NEXT:    sll.w $a5, $a5, $a3
-; LA64-NEXT:    and $a6, $a2, $a4
-; LA64-NEXT:    or $a5, $a6, $a5
-; LA64-NEXT:    addi.w $a6, $a2, 0
+; LA64-NEXT:    sll.w $a5, $a5, $a2
+; LA64-NEXT:    and $a3, $a3, $a4
+; LA64-NEXT:    or $a5, $a3, $a5
 ; LA64-NEXT:  .LBB0_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB0_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a6, .LBB0_5
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    bne $a3, $a6, .LBB0_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB0_3 Depth=2
 ; LA64-NEXT:    move $a7, $a5
@@ -43,9 +43,9 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB0_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; LA64-NEXT:    bne $a2, $a6, .LBB0_1
+; LA64-NEXT:    bne $a3, $a6, .LBB0_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a2, $a3
+; LA64-NEXT:    srl.w $a0, $a3, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -54,35 +54,35 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
 define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-LABEL: atomicrmw_uinc_wrap_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a3, $a0, 3
+; LA64-NEXT:    slli.d $a2, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    ld.w $a2, $a0, 0
-; LA64-NEXT:    lu12i.w $a4, 15
-; LA64-NEXT:    ori $a4, $a4, 4095
-; LA64-NEXT:    sll.w $a4, $a4, $a3
-; LA64-NEXT:    andi $a3, $a3, 24
+; LA64-NEXT:    lu12i.w $a3, 15
+; LA64-NEXT:    ori $a3, $a3, 4095
+; LA64-NEXT:    sll.w $a4, $a3, $a2
+; LA64-NEXT:    ld.w $a3, $a0, 0
+; LA64-NEXT:    andi $a2, $a2, 24
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    bstrpick.d $a1, $a1, 15, 0
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB1_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB1_3 Depth 2
-; LA64-NEXT:    srl.w $a5, $a2, $a3
-; LA64-NEXT:    bstrpick.d $a6, $a5, 15, 0
-; LA64-NEXT:    sltu $a6, $a6, $a1
+; LA64-NEXT:    srl.w $a5, $a3, $a2
+; LA64-NEXT:    addi.w $a6, $a3, 0
+; LA64-NEXT:    bstrpick.d $a7, $a5, 15, 0
 ; LA64-NEXT:    addi.d $a5, $a5, 1
-; LA64-NEXT:    xori $a6, $a6, 1
-; LA64-NEXT:    masknez $a5, $a5, $a6
+; LA64-NEXT:    sltu $a7, $a7, $a1
+; LA64-NEXT:    xori $a7, $a7, 1
+; LA64-NEXT:    masknez $a5, $a5, $a7
 ; LA64-NEXT:    bstrpick.d $a5, $a5, 15, 0
-; LA64-NEXT:    sll.w $a5, $a5, $a3
-; LA64-NEXT:    and $a6, $a2, $a4
-; LA64-NEXT:    or $a5, $a6, $a5
-; LA64-NEXT:    addi.w $a6, $a2, 0
+; LA64-NEXT:    sll.w $a5, $a5, $a2
+; LA64-NEXT:    and $a3, $a3, $a4
+; LA64-NEXT:    or $a5, $a3, $a5
 ; LA64-NEXT:  .LBB1_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB1_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a6, .LBB1_5
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    bne $a3, $a6, .LBB1_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB1_3 Depth=2
 ; LA64-NEXT:    move $a7, $a5
@@ -94,9 +94,9 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB1_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB1_1 Depth=1
-; LA64-NEXT:    bne $a2, $a6, .LBB1_1
+; LA64-NEXT:    bne $a3, $a6, .LBB1_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a2, $a3
+; LA64-NEXT:    srl.w $a0, $a3, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw uinc_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -111,19 +111,19 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:  .LBB2_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB2_3 Depth 2
-; LA64-NEXT:    addi.w $a3, $a2, 0
-; LA64-NEXT:    sltu $a4, $a3, $a1
-; LA64-NEXT:    xori $a4, $a4, 1
-; LA64-NEXT:    addi.d $a2, $a2, 1
-; LA64-NEXT:    masknez $a4, $a2, $a4
+; LA64-NEXT:    addi.d $a3, $a2, 1
+; LA64-NEXT:    addi.w $a4, $a2, 0
+; LA64-NEXT:    sltu $a2, $a4, $a1
+; LA64-NEXT:    xori $a2, $a2, 1
+; LA64-NEXT:    masknez $a3, $a3, $a2
 ; LA64-NEXT:  .LBB2_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB2_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
 ; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a3, .LBB2_5
+; LA64-NEXT:    bne $a2, $a4, .LBB2_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB2_3 Depth=2
-; LA64-NEXT:    move $a5, $a4
+; LA64-NEXT:    move $a5, $a3
 ; LA64-NEXT:    sc.w $a5, $a0, 0
 ; LA64-NEXT:    beqz $a5, .LBB2_3
 ; LA64-NEXT:    b .LBB2_6
@@ -132,7 +132,7 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB2_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB2_1 Depth=1
-; LA64-NEXT:    bne $a2, $a3, .LBB2_1
+; LA64-NEXT:    bne $a2, $a4, .LBB2_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64-NEXT:    move $a0, $a2
 ; LA64-NEXT:    ret
@@ -149,10 +149,10 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB3_3 Depth 2
 ; LA64-NEXT:    move $a3, $a2
-; LA64-NEXT:    sltu $a2, $a2, $a1
-; LA64-NEXT:    xori $a2, $a2, 1
-; LA64-NEXT:    addi.d $a4, $a3, 1
-; LA64-NEXT:    masknez $a4, $a4, $a2
+; LA64-NEXT:    addi.d $a2, $a2, 1
+; LA64-NEXT:    sltu $a4, $a3, $a1
+; LA64-NEXT:    xori $a4, $a4, 1
+; LA64-NEXT:    masknez $a4, $a2, $a4
 ; LA64-NEXT:  .LBB3_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB3_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
@@ -180,39 +180,39 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
 define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-LABEL: atomicrmw_udec_wrap_i8:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a3, $a0, 3
+; LA64-NEXT:    slli.d $a4, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    ld.w $a2, $a0, 0
-; LA64-NEXT:    ori $a4, $zero, 255
-; LA64-NEXT:    sll.w $a4, $a4, $a3
-; LA64-NEXT:    andi $a3, $a3, 24
+; LA64-NEXT:    andi $a2, $a4, 24
+; LA64-NEXT:    ori $a5, $zero, 255
+; LA64-NEXT:    ld.w $a3, $a0, 0
+; LA64-NEXT:    sll.w $a4, $a5, $a4
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    andi $a5, $a1, 255
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB4_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB4_3 Depth 2
-; LA64-NEXT:    srl.w $a6, $a2, $a3
-; LA64-NEXT:    andi $a7, $a6, 255
-; LA64-NEXT:    sltu $t0, $a5, $a7
+; LA64-NEXT:    srl.w $a6, $a3, $a2
+; LA64-NEXT:    addi.w $a7, $a3, 0
+; LA64-NEXT:    andi $t0, $a6, 255
 ; LA64-NEXT:    addi.d $a6, $a6, -1
+; LA64-NEXT:    sltui $t1, $t0, 1
+; LA64-NEXT:    sltu $t0, $a5, $t0
 ; LA64-NEXT:    masknez $a6, $a6, $t0
 ; LA64-NEXT:    maskeqz $t0, $a1, $t0
 ; LA64-NEXT:    or $a6, $t0, $a6
-; LA64-NEXT:    sltui $a7, $a7, 1
-; LA64-NEXT:    masknez $a6, $a6, $a7
-; LA64-NEXT:    maskeqz $a7, $a1, $a7
-; LA64-NEXT:    or $a6, $a7, $a6
+; LA64-NEXT:    masknez $a6, $a6, $t1
+; LA64-NEXT:    maskeqz $t0, $a1, $t1
+; LA64-NEXT:    or $a6, $t0, $a6
 ; LA64-NEXT:    andi $a6, $a6, 255
-; LA64-NEXT:    sll.w $a6, $a6, $a3
-; LA64-NEXT:    and $a7, $a2, $a4
-; LA64-NEXT:    or $a6, $a7, $a6
-; LA64-NEXT:    addi.w $a7, $a2, 0
+; LA64-NEXT:    sll.w $a6, $a6, $a2
+; LA64-NEXT:    and $a3, $a3, $a4
+; LA64-NEXT:    or $a6, $a3, $a6
 ; LA64-NEXT:  .LBB4_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB4_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a7, .LBB4_5
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    bne $a3, $a7, .LBB4_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB4_3 Depth=2
 ; LA64-NEXT:    move $t0, $a6
@@ -224,9 +224,9 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB4_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB4_1 Depth=1
-; LA64-NEXT:    bne $a2, $a7, .LBB4_1
+; LA64-NEXT:    bne $a3, $a7, .LBB4_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a2, $a3
+; LA64-NEXT:    srl.w $a0, $a3, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i8 %val seq_cst
   ret i8 %result
@@ -235,40 +235,40 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
 define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-LABEL: atomicrmw_udec_wrap_i16:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    slli.d $a3, $a0, 3
+; LA64-NEXT:    slli.d $a4, $a0, 3
 ; LA64-NEXT:    bstrins.d $a0, $zero, 1, 0
-; LA64-NEXT:    ld.w $a2, $a0, 0
-; LA64-NEXT:    lu12i.w $a4, 15
-; LA64-NEXT:    ori $a4, $a4, 4095
-; LA64-NEXT:    sll.w $a4, $a4, $a3
-; LA64-NEXT:    andi $a3, $a3, 24
+; LA64-NEXT:    andi $a2, $a4, 24
+; LA64-NEXT:    lu12i.w $a3, 15
+; LA64-NEXT:    ori $a5, $a3, 4095
+; LA64-NEXT:    ld.w $a3, $a0, 0
+; LA64-NEXT:    sll.w $a4, $a5, $a4
 ; LA64-NEXT:    nor $a4, $a4, $zero
 ; LA64-NEXT:    bstrpick.d $a5, $a1, 15, 0
 ; LA64-NEXT:    .p2align 4, , 16
 ; LA64-NEXT:  .LBB5_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB5_3 Depth 2
-; LA64-NEXT:    srl.w $a6, $a2, $a3
-; LA64-NEXT:    bstrpick.d $a7, $a6, 15, 0
-; LA64-NEXT:    sltu $t0, $a5, $a7
+; LA64-NEXT:    srl.w $a6, $a3, $a2
+; LA64-NEXT:    addi.w $a7, $a3, 0
+; LA64-NEXT:    bstrpick.d $t0, $a6, 15, 0
 ; LA64-NEXT:    addi.d $a6, $a6, -1
+; LA64-NEXT:    sltui $t1, $t0, 1
+; LA64-NEXT:    sltu $t0, $a5, $t0
 ; LA64-NEXT:    masknez $a6, $a6, $t0
 ; LA64-NEXT:    maskeqz $t0, $a1, $t0
 ; LA64-NEXT:    or $a6, $t0, $a6
-; LA64-NEXT:    sltui $a7, $a7, 1
-; LA64-NEXT:    masknez $a6, $a6, $a7
-; LA64-NEXT:    maskeqz $a7, $a1, $a7
-; LA64-NEXT:    or $a6, $a7, $a6
+; LA64-NEXT:    masknez $a6, $a6, $t1
+; LA64-NEXT:    maskeqz $t0, $a1, $t1
+; LA64-NEXT:    or $a6, $t0, $a6
 ; LA64-NEXT:    bstrpick.d $a6, $a6, 15, 0
-; LA64-NEXT:    sll.w $a6, $a6, $a3
-; LA64-NEXT:    and $a7, $a2, $a4
-; LA64-NEXT:    or $a6, $a7, $a6
-; LA64-NEXT:    addi.w $a7, $a2, 0
+; LA64-NEXT:    sll.w $a6, $a6, $a2
+; LA64-NEXT:    and $a3, $a3, $a4
+; LA64-NEXT:    or $a6, $a3, $a6
 ; LA64-NEXT:  .LBB5_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB5_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
-; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a7, .LBB5_5
+; LA64-NEXT:    ll.w $a3, $a0, 0
+; LA64-NEXT:    bne $a3, $a7, .LBB5_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB5_3 Depth=2
 ; LA64-NEXT:    move $t0, $a6
@@ -280,9 +280,9 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB5_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB5_1 Depth=1
-; LA64-NEXT:    bne $a2, $a7, .LBB5_1
+; LA64-NEXT:    bne $a3, $a7, .LBB5_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
-; LA64-NEXT:    srl.w $a0, $a2, $a3
+; LA64-NEXT:    srl.w $a0, $a3, $a2
 ; LA64-NEXT:    ret
   %result = atomicrmw udec_wrap ptr %ptr, i16 %val seq_cst
   ret i16 %result
@@ -297,24 +297,24 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:  .LBB6_1: # %atomicrmw.start
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB6_3 Depth 2
-; LA64-NEXT:    addi.w $a4, $a2, 0
-; LA64-NEXT:    sltu $a5, $a3, $a4
-; LA64-NEXT:    addi.d $a2, $a2, -1
-; LA64-NEXT:    masknez $a2, $a2, $a5
-; LA64-NEXT:    maskeqz $a5, $a1, $a5
-; LA64-NEXT:    or $a2, $a5, $a2
-; LA64-NEXT:    sltui $a5, $a4, 1
-; LA64-NEXT:    masknez $a2, $a2, $a5
-; LA64-NEXT:    maskeqz $a5, $a1, $a5
-; LA64-NEXT:    or $a5, $a5, $a2
+; LA64-NEXT:    addi.d $a4, $a2, -1
+; LA64-NEXT:    addi.w $a5, $a2, 0
+; LA64-NEXT:    sltui $a2, $a5, 1
+; LA64-NEXT:    sltu $a6, $a3, $a5
+; LA64-NEXT:    masknez $a4, $a4, $a6
+; LA64-NEXT:    maskeqz $a6, $a1, $a6
+; LA64-NEXT:    or $a4, $a6, $a4
+; LA64-NEXT:    masknez $a4, $a4, $a2
+; LA64-NEXT:    maskeqz $a2, $a1, $a2
+; LA64-NEXT:    or $a4, $a2, $a4
 ; LA64-NEXT:  .LBB6_3: # %atomicrmw.start
 ; LA64-NEXT:    # Parent Loop BB6_1 Depth=1
 ; LA64-NEXT:    # => This Inner Loop Header: Depth=2
 ; LA64-NEXT:    ll.w $a2, $a0, 0
-; LA64-NEXT:    bne $a2, $a4, .LBB6_5
+; LA64-NEXT:    bne $a2, $a5, .LBB6_5
 ; LA64-NEXT:  # %bb.4: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB6_3 Depth=2
-; LA64-NEXT:    move $a6, $a5
+; LA64-NEXT:    move $a6, $a4
 ; LA64-NEXT:    sc.w $a6, $a0, 0
 ; LA64-NEXT:    beqz $a6, .LBB6_3
 ; LA64-NEXT:    b .LBB6_6
@@ -323,7 +323,7 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
 ; LA64-NEXT:    dbar 20
 ; LA64-NEXT:  .LBB6_6: # %atomicrmw.start
 ; LA64-NEXT:    # in Loop: Header=BB6_1 Depth=1
-; LA64-NEXT:    bne $a2, $a4, .LBB6_1
+; LA64-NEXT:    bne $a2, $a5, .LBB6_1
 ; LA64-NEXT:  # %bb.2: # %atomicrmw.end
 ; LA64-NEXT:    move $a0, $a2
 ; LA64-NEXT:    ret
@@ -340,12 +340,12 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
 ; LA64-NEXT:    # =>This Loop Header: Depth=1
 ; LA64-NEXT:    # Child Loop BB7_3 Depth 2
 ; LA64-NEXT:    move $a3, $a2
-; LA64-NEXT:    sltu $a2, $a1, $a2
-; LA64-NEXT:    addi.d $a4, $a3, -1
-; LA64-NEXT:    masknez $a4, $a4, $a2
-; LA64-NEXT:    maskeqz $a2, $a1, $a2
-; LA64-NEXT:    or $a2, $a2, $a4
+; LA64-NEXT:    addi.d $a2, $a2, -1
 ; LA64-NEXT:    sltui $a4, $a3, 1
+; LA64-NEXT:    sltu $a5, $a1, $a3
+; LA64-NEXT:    masknez $a2, $a2, $a5
+; LA64-NEXT:    maskeqz $a5, $a1, $a5
+; LA64-NEXT:    or $a2, $a5, $a2
 ; LA64-NEXT:    masknez $a2, $a2, $a4
 ; LA64-NEXT:    maskeqz $a4, $a1, $a4
 ; LA64-NEXT:    or $a4, $a4, $a2
diff --git a/llvm/test/CodeGen/LoongArch/bitreverse.ll b/llvm/test/CodeGen/LoongArch/bitreverse.ll
index 259d8565c68420..fcf523aa3c883a 100644
--- a/llvm/test/CodeGen/LoongArch/bitreverse.ll
+++ b/llvm/test/CodeGen/LoongArch/bitreverse.ll
@@ -129,30 +129,30 @@ define i48 @test_bitreverse_i48(i48 %a) nounwind {
 define i77 @test_bitreverse_i77(i77 %a) nounwind {
 ; LA32-LABEL: test_bitreverse_i77:
 ; LA32:       # %bb.0:
-; LA32-NEXT:    ld.w $a2, $a1, 0
+; LA32-NEXT:    ld.w $a2, $a1, 4
+; LA32-NEXT:    ld.w $a3, $a1, 8
+; LA32-NEXT:    ld.w $a1, $a1, 0
 ; LA32-NEXT:    bitrev.w $a2, $a2
-; LA32-NEXT:    ld.w $a3, $a1, 4
+; LA32-NEXT:    slli.w $a4, $a2, 13
 ; LA32-NEXT:    bitrev.w $a3, $a3
-; LA32-NEXT:    srli.w $a4, $a3, 19
-; LA32-NEXT:    slli.w $a5, $a2, 13
-; LA32-NEXT:    or $a4, $a5, $a4
+; LA32-NEXT:    srli.w $a3, $a3, 19
+; LA32-NEXT:    or $a3, $a3, $a4
 ; L...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/83759

