[llvm] MachineScheduler: Reset next cluster candidate for each node (PR #139513)
via llvm-commits
llvm-commits at lists.llvm.org
Mon May 12 01:18:50 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-powerpc
Author: Ruiling, Song (ruiling)
<details>
<summary>Changes</summary>
When a node is picked, we should reset its next cluster candidate to null before releasing its successors/predecessors.
---
Patch is 1.82 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/139513.diff
110 Files Affected:
- (modified) llvm/lib/CodeGen/MachineScheduler.cpp (+7)
- (modified) llvm/test/CodeGen/AArch64/expand-select.ll (+10-10)
- (modified) llvm/test/CodeGen/AArch64/extbinopload.ll (+43-42)
- (modified) llvm/test/CodeGen/AArch64/fptoi.ll (+70-70)
- (modified) llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll (+16-16)
- (modified) llvm/test/CodeGen/AArch64/itofp.ll (+90-90)
- (modified) llvm/test/CodeGen/AArch64/nontemporal-load.ll (+9-8)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll (+43-43)
- (modified) llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll (+43-43)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll (+47-47)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll (+12-12)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll (+80-82)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll (+16-16)
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll (+74-72)
- (modified) llvm/test/CodeGen/AArch64/vec_uaddo.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/vec_umulo.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/vselect-ext.ll (+15-15)
- (modified) llvm/test/CodeGen/AArch64/wide-scalar-shift-legalization.ll (+28-31)
- (modified) llvm/test/CodeGen/AArch64/zext-to-tbl.ll (+54-53)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll (+11-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll (+30-30)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+321-314)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+292-289)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll (+12-11)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll (+10-9)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll (+4-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll (+124-125)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll (+40-39)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+19-19)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll (+49-49)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+69-69)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+3062-3062)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+14-12)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+239-241)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-cc.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-preserve-cc.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+1496-1516)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+15-15)
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+1-2)
- (modified) llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (+23-23)
- (modified) llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (+24-24)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/fmed3.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+26-26)
- (modified) llvm/test/CodeGen/AMDGPU/function-args-inreg.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+23-22)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/half.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/idiv-licm.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/lds-misaligned-bug.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+4-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll (+106-106)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll (+115-115)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll (+282-281)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll (+115-115)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll (+282-281)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i1.ll (+20-21)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+49-49)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i32.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i16.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i32.ll (+170-171)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-lastuse-metadata.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-nontemporal-metadata.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll (+864-878)
- (modified) llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll (+864-878)
- (modified) llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+51-51)
- (modified) llvm/test/CodeGen/AMDGPU/pr51516.mir (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv.ll (+96-96)
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+165-169)
- (modified) llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/store-local.128.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/vopd-combine.mir (+6-6)
- (modified) llvm/test/CodeGen/PowerPC/p10-fi-elim.ll (+2-2)
- (modified) llvm/test/CodeGen/RISCV/GlobalISel/wide-scalar-shift-by-byte-multiple-legalization.ll (+35-35)
- (modified) llvm/test/CodeGen/RISCV/memcmp-optsize.ll (+66-66)
- (modified) llvm/test/CodeGen/RISCV/memcmp.ll (+66-66)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll (+149-149)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll (+20-20)
- (modified) llvm/test/CodeGen/RISCV/rvv/pr125306.ll (+8-8)
- (modified) llvm/test/CodeGen/RISCV/unaligned-load-store.ll (+27-27)
- (modified) llvm/test/CodeGen/RISCV/vararg.ll (+9-9)
- (modified) llvm/test/CodeGen/RISCV/wide-scalar-shift-by-byte-multiple-legalization.ll (+194-194)
- (modified) llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll (+136-136)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 236c55cb04142..e283cf0f392f1 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -967,6 +967,12 @@ void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
/// releaseSuccessors - Call releaseSucc on each of SU's successors.
void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
+ // Reset the next successor. For example, we want to cluster A B C.
+ // After A is picked, we will set B as next cluster succ, but if we pick
+ // D instead of B after A, then we need to reset the next cluster succ because
+ // we have decided to not pick the cluster candidate B during pickNode().
+ // Leaving B as the NextClusterSucc just makes things messy.
+ NextClusterSucc = nullptr;
for (SDep &Succ : SU->Succs)
releaseSucc(SU, &Succ);
}
@@ -1004,6 +1010,7 @@ void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
/// releasePredecessors - Call releasePred on each of SU's predecessors.
void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
+ NextClusterPred = nullptr;
for (SDep &Pred : SU->Preds)
releasePred(SU, &Pred);
}
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index 1ed2e09c6b4d4..7ca6adb1338d3 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -8,11 +8,11 @@ define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, ptr %Out) {
; CHECK-NEXT: fmov s0, wzr
; CHECK-NEXT: ldr x11, [sp]
; CHECK-NEXT: fmov s1, w8
-; CHECK-NEXT: ldp x9, x10, [sp, #8]
+; CHECK-NEXT: ldp x8, x10, [sp, #8]
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: csel x8, x5, x9, ne
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x8, x5, x8, ne
; CHECK-NEXT: csel x9, x4, x11, ne
; CHECK-NEXT: stp x9, x8, [x10, #16]
; CHECK-NEXT: csel x8, x3, x7, ne
@@ -36,14 +36,14 @@ define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
; CHECK-NEXT: ldr x10, [sp, #16]
; CHECK-NEXT: fmov s1, w8
; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: tst w8, #0x1
-; CHECK-NEXT: ldp x9, x8, [sp]
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: ldp x8, x9, [sp]
; CHECK-NEXT: csel x11, x2, x6, ne
; CHECK-NEXT: str x11, [x10]
-; CHECK-NEXT: csel x9, x4, x9, ne
-; CHECK-NEXT: csel x8, x5, x8, ne
-; CHECK-NEXT: stur x9, [x10, #12]
+; CHECK-NEXT: csel x8, x4, x8, ne
+; CHECK-NEXT: stur x8, [x10, #12]
+; CHECK-NEXT: csel x8, x5, x9, ne
; CHECK-NEXT: csel x9, x3, x7, ne
; CHECK-NEXT: str w8, [x10, #20]
; CHECK-NEXT: str w9, [x10, #8]
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 82114d60c4a93..75f3ffc9515e5 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -667,30 +667,30 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: add x10, x3, #12
; CHECK-NEXT: bic v1.8h, #255, lsl #8
; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4
-; CHECK-NEXT: ldr s3, [x0, #12]
-; CHECK-NEXT: ldp s2, s7, [x0, #4]
+; CHECK-NEXT: ldr s4, [x0, #12]
+; CHECK-NEXT: ldp s5, s2, [x2, #4]
; CHECK-NEXT: ldr s6, [x2, #12]
-; CHECK-NEXT: ldp s5, s4, [x2, #4]
-; CHECK-NEXT: ld1 { v3.s }[1], [x11]
+; CHECK-NEXT: ldp s3, s7, [x0, #4]
+; CHECK-NEXT: ld1 { v4.s }[1], [x11]
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
-; CHECK-NEXT: ld1 { v2.s }[1], [x9]
-; CHECK-NEXT: ld1 { v4.s }[1], [x8]
+; CHECK-NEXT: ld1 { v2.s }[1], [x8]
; CHECK-NEXT: ld1 { v5.s }[1], [x3]
; CHECK-NEXT: add x8, x1, #8
+; CHECK-NEXT: ld1 { v3.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
-; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
-; CHECK-NEXT: ushll v4.8h, v4.8b, #0
-; CHECK-NEXT: uaddl v3.8h, v5.8b, v6.8b
+; CHECK-NEXT: ushll v2.8h, v2.8b, #0
+; CHECK-NEXT: uaddl v3.8h, v3.8b, v4.8b
+; CHECK-NEXT: uaddl v4.8h, v5.8b, v6.8b
; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b
-; CHECK-NEXT: uaddw2 v4.8h, v4.8h, v0.16b
-; CHECK-NEXT: ushll v0.4s, v2.4h, #3
-; CHECK-NEXT: ushll v5.4s, v3.4h, #3
+; CHECK-NEXT: uaddw2 v2.8h, v2.8h, v0.16b
+; CHECK-NEXT: ushll v0.4s, v3.4h, #3
+; CHECK-NEXT: ushll v5.4s, v4.4h, #3
+; CHECK-NEXT: ushll2 v4.4s, v4.8h, #3
; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3
-; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
-; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
-; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v4.8h
-; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h
+; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
+; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v2.8h
+; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
store <4 x i8> %lp1, ptr %z
@@ -1073,24 +1073,24 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ld1 { v6.s }[1], [x10]
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
-; CHECK-NEXT: uaddl v16.8h, v2.8b, v3.8b
-; CHECK-NEXT: uaddl v3.8h, v1.8b, v6.8b
-; CHECK-NEXT: uaddl v2.8h, v4.8b, v5.8b
+; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
+; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
; CHECK-NEXT: uaddl v4.8h, v0.8b, v7.8b
-; CHECK-NEXT: ushll v0.4s, v16.4h, #3
-; CHECK-NEXT: ushll2 v1.4s, v16.8h, #3
-; CHECK-NEXT: ushll2 v18.4s, v16.8h, #0
-; CHECK-NEXT: ushll v6.4s, v2.4h, #3
-; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3
-; CHECK-NEXT: ushll2 v5.4s, v2.8h, #0
+; CHECK-NEXT: ushll2 v0.4s, v2.8h, #0
+; CHECK-NEXT: ushll v5.4s, v2.4h, #3
+; CHECK-NEXT: ushll2 v16.4s, v2.8h, #3
+; CHECK-NEXT: ushll v6.4s, v3.4h, #3
+; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
; CHECK-NEXT: ushll v17.4s, v2.4h, #0
-; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
-; CHECK-NEXT: uaddw v0.4s, v0.4s, v3.4h
+; CHECK-NEXT: ushll2 v18.4s, v3.8h, #0
+; CHECK-NEXT: ushll v19.4s, v3.4h, #0
+; CHECK-NEXT: stp q17, q0, [x4]
+; CHECK-NEXT: uaddw v0.4s, v5.4s, v1.4h
+; CHECK-NEXT: uaddw2 v1.4s, v16.4s, v1.8h
; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v4.8h
; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h
-; CHECK-NEXT: ushll v4.4s, v16.4h, #0
-; CHECK-NEXT: stp q17, q5, [x4, #32]
-; CHECK-NEXT: stp q4, q18, [x4]
+; CHECK-NEXT: stp q19, q18, [x4, #32]
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
@@ -1176,19 +1176,20 @@ define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
; CHECK-NEXT: ld1 { v5.s }[1], [x9]
; CHECK-NEXT: ld1 { v7.s }[1], [x8]
; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b
; CHECK-NEXT: uaddl v3.8h, v4.8b, v5.8b
-; CHECK-NEXT: uaddl v4.8h, v1.8b, v6.8b
-; CHECK-NEXT: ushll v5.4s, v2.4h, #3
-; CHECK-NEXT: ushll2 v6.4s, v2.8h, #3
-; CHECK-NEXT: uaddl v2.8h, v0.8b, v7.8b
-; CHECK-NEXT: ushll v7.4s, v3.4h, #3
-; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3
-; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v4.8h
-; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h
-; CHECK-NEXT: stp q5, q6, [x4]
-; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h
-; CHECK-NEXT: uaddw v2.4s, v7.4s, v2.4h
-; CHECK-NEXT: stp q7, q16, [x4, #32]
+; CHECK-NEXT: uaddl v5.8h, v0.8b, v7.8b
+; CHECK-NEXT: ushll v4.4s, v2.4h, #3
+; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3
+; CHECK-NEXT: ushll v6.4s, v3.4h, #3
+; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3
+; CHECK-NEXT: uaddw v0.4s, v4.4s, v1.4h
+; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
+; CHECK-NEXT: str q4, [x4]
+; CHECK-NEXT: stp q2, q6, [x4, #16]
+; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v5.8h
+; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h
+; CHECK-NEXT: str q7, [x4, #48]
; CHECK-NEXT: ret
%lp1 = load <4 x i8>, ptr %p
%p2 = getelementptr i8, ptr %p, i32 4
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 9c4f0207b84ce..ae3b6a54a1f7f 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -2825,42 +2825,42 @@ define <32 x i64> @fptos_v32f32_v32i64(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s
; CHECK-SD-NEXT: fcvtl2 v17.2d, v6.4s
; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s
-; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
-; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
+; CHECK-SD-NEXT: fcvtl2 v21.2d, v2.4s
+; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
; CHECK-SD-NEXT: fcvtl2 v19.2d, v4.4s
; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s
+; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
; CHECK-SD-NEXT: fcvtl2 v20.2d, v3.4s
+; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s
; CHECK-SD-NEXT: fcvtzs v16.2d, v16.2d
; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
; CHECK-SD-NEXT: fcvtzs v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzs v19.2d, v19.2d
+; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzs v18.2d, v18.2d
+; CHECK-SD-NEXT: fcvtzs v20.2d, v20.2d
; CHECK-SD-NEXT: fcvtzs v5.2d, v5.2d
-; CHECK-SD-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzs v3.2d, v3.2d
; CHECK-SD-NEXT: stp q7, q16, [x8, #224]
-; CHECK-SD-NEXT: fcvtl2 v7.2d, v2.4s
-; CHECK-SD-NEXT: fcvtzs v16.2d, v19.2d
-; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
-; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
-; CHECK-SD-NEXT: fcvtl2 v5.2d, v0.4s
+; CHECK-SD-NEXT: fcvtzs v16.2d, v21.2d
; CHECK-SD-NEXT: stp q6, q17, [x8, #192]
-; CHECK-SD-NEXT: fcvtl2 v6.2d, v1.4s
-; CHECK-SD-NEXT: fcvtzs v17.2d, v20.2d
+; CHECK-SD-NEXT: fcvtl2 v17.2d, v1.4s
; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s
+; CHECK-SD-NEXT: stp q4, q19, [x8, #128]
+; CHECK-SD-NEXT: stp q3, q20, [x8, #96]
+; CHECK-SD-NEXT: stp q2, q16, [x8, #64]
+; CHECK-SD-NEXT: fcvtl2 v16.2d, v0.4s
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT: stp q4, q16, [x8, #128]
-; CHECK-SD-NEXT: fcvtzs v7.2d, v7.2d
-; CHECK-SD-NEXT: fcvtzs v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzs v4.2d, v6.2d
-; CHECK-SD-NEXT: stp q3, q17, [x8, #96]
-; CHECK-SD-NEXT: fcvtzs v3.2d, v5.2d
+; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
+; CHECK-SD-NEXT: fcvtzs v6.2d, v17.2d
; CHECK-SD-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-SD-NEXT: fcvtzs v4.2d, v16.2d
; CHECK-SD-NEXT: fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT: stp q2, q7, [x8, #64]
-; CHECK-SD-NEXT: stp q0, q3, [x8]
-; CHECK-SD-NEXT: stp q1, q4, [x8, #32]
+; CHECK-SD-NEXT: stp q1, q6, [x8, #32]
+; CHECK-SD-NEXT: stp q0, q4, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptos_v32f32_v32i64:
@@ -2918,42 +2918,42 @@ define <32 x i64> @fptou_v32f32_v32i64(<32 x float> %a) {
; CHECK-SD-NEXT: fcvtl v7.2d, v7.2s
; CHECK-SD-NEXT: fcvtl2 v17.2d, v6.4s
; CHECK-SD-NEXT: fcvtl v6.2d, v6.2s
-; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
-; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
+; CHECK-SD-NEXT: fcvtl2 v21.2d, v2.4s
+; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
; CHECK-SD-NEXT: fcvtl2 v19.2d, v4.4s
; CHECK-SD-NEXT: fcvtl v4.2d, v4.2s
+; CHECK-SD-NEXT: fcvtl2 v18.2d, v5.4s
; CHECK-SD-NEXT: fcvtl2 v20.2d, v3.4s
+; CHECK-SD-NEXT: fcvtl v5.2d, v5.2s
; CHECK-SD-NEXT: fcvtl v3.2d, v3.2s
; CHECK-SD-NEXT: fcvtzu v16.2d, v16.2d
; CHECK-SD-NEXT: fcvtzu v7.2d, v7.2d
; CHECK-SD-NEXT: fcvtzu v17.2d, v17.2d
; CHECK-SD-NEXT: fcvtzu v6.2d, v6.2d
+; CHECK-SD-NEXT: fcvtzu v2.2d, v2.2d
+; CHECK-SD-NEXT: fcvtzu v19.2d, v19.2d
+; CHECK-SD-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzu v18.2d, v18.2d
+; CHECK-SD-NEXT: fcvtzu v20.2d, v20.2d
; CHECK-SD-NEXT: fcvtzu v5.2d, v5.2d
-; CHECK-SD-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-SD-NEXT: fcvtzu v3.2d, v3.2d
; CHECK-SD-NEXT: stp q7, q16, [x8, #224]
-; CHECK-SD-NEXT: fcvtl2 v7.2d, v2.4s
-; CHECK-SD-NEXT: fcvtzu v16.2d, v19.2d
-; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
-; CHECK-SD-NEXT: fcvtl v2.2d, v2.2s
-; CHECK-SD-NEXT: fcvtl2 v5.2d, v0.4s
+; CHECK-SD-NEXT: fcvtzu v16.2d, v21.2d
; CHECK-SD-NEXT: stp q6, q17, [x8, #192]
-; CHECK-SD-NEXT: fcvtl2 v6.2d, v1.4s
-; CHECK-SD-NEXT: fcvtzu v17.2d, v20.2d
+; CHECK-SD-NEXT: fcvtl2 v17.2d, v1.4s
; CHECK-SD-NEXT: fcvtl v1.2d, v1.2s
+; CHECK-SD-NEXT: stp q4, q19, [x8, #128]
+; CHECK-SD-NEXT: stp q3, q20, [x8, #96]
+; CHECK-SD-NEXT: stp q2, q16, [x8, #64]
+; CHECK-SD-NEXT: fcvtl2 v16.2d, v0.4s
; CHECK-SD-NEXT: fcvtl v0.2d, v0.2s
-; CHECK-SD-NEXT: stp q4, q16, [x8, #128]
-; CHECK-SD-NEXT: fcvtzu v7.2d, v7.2d
-; CHECK-SD-NEXT: fcvtzu v2.2d, v2.2d
-; CHECK-SD-NEXT: fcvtzu v4.2d, v6.2d
-; CHECK-SD-NEXT: stp q3, q17, [x8, #96]
-; CHECK-SD-NEXT: fcvtzu v3.2d, v5.2d
+; CHECK-SD-NEXT: stp q5, q18, [x8, #160]
+; CHECK-SD-NEXT: fcvtzu v6.2d, v17.2d
; CHECK-SD-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-SD-NEXT: fcvtzu v4.2d, v16.2d
; CHECK-SD-NEXT: fcvtzu v0.2d, v0.2d
-; CHECK-SD-NEXT: stp q2, q7, [x8, #64]
-; CHECK-SD-NEXT: stp q0, q3, [x8]
-; CHECK-SD-NEXT: stp q1, q4, [x8, #32]
+; CHECK-SD-NEXT: stp q1, q6, [x8, #32]
+; CHECK-SD-NEXT: stp q0, q4, [x8]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: fptou_v32f32_v32i64:
@@ -5244,45 +5244,45 @@ define <32 x i64> @fptos_v32f16_v32i64(<32 x half> %a) {
; CHECK-GI-FP16-NEXT: mov v17.d[1], v23.d[0]
; CHECK-GI-FP16-NEXT: mov v1.d[1], v29.d[0]
; CHECK-GI-FP16-NEXT: mov v19.d[1], v30.d[0]
-; CHECK-GI-FP16-NEXT: mov h21, v3.h[1]
+; CHECK-GI-FP16-NEXT: mov h16, v3.h[1]
; CHECK-GI-FP16-NEXT: stp q6, q5, [x8, #32]
; CHECK-GI-FP16-NEXT: mov v20.d[1], v22.d[0]
-; CHECK-GI-FP16-NEXT: mov h16, v3.h[2]
+; CHECK-GI-FP16-NEXT: mov h21, v3.h[2]
; CHECK-GI-FP16-NEXT: mov h7, v3.h[3]
; CHECK-GI-FP16-NEXT: mov h22, v3.h[4]
-; CHECK-GI-FP16-NEXT: mov h23, v3.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v3.h[6]
+; CHECK-GI-FP16-NEXT: mov h6, v3.h[5]
+; CHECK-GI-FP16-NEXT: mov h23, v3.h[6]
; CHECK-GI-FP16-NEXT: mov h5, v3.h[7]
; CHECK-GI-FP16-NEXT: mov v18.d[1], v24.d[0]
; CHECK-GI-FP16-NEXT: mov v2.d[1], v25.d[0]
; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d21, h21
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
; CHECK-GI-FP16-NEXT: fcvt d16, h16
+; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v0.2d
+; CHECK-GI-FP16-NEXT: fcvt d21, h21
; CHECK-GI-FP16-NEXT: fcvtzs v4.2d, v4.2d
; CHECK-GI-FP16-NEXT: fcvt d7, h7
; CHECK-GI-FP16-NEXT: fcvt d22, h22
-; CHECK-GI-FP16-NEXT: fcvt d23, h23
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: fcvt d6, h6
+; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: fcvt d23, h23
; CHECK-GI-FP16-NEXT: fcvt d5, h5
; CHECK-GI-FP16-NEXT: fcvtzs v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v21.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzs v20.2d, v20.2d
+; CHECK-GI-FP16-NEXT: mov v3.d[1], v16.d[0]
+; CHECK-GI-FP16-NEXT: fcvtzs v16.2d, v20.2d
; CHECK-GI-FP16-NEXT: stp q0, q4, [x8, #64]
; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v17.2d
; CHECK-GI-FP16-NEXT: fcvtzs v4.2d, v18.2d
-; CHECK-GI-FP16-NEXT: mov v16.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v22.d[1], v23.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v5.d[0]
+; CHECK-GI-FP16-NEXT: mov v21.d[1], v7.d[0]
+; CHECK-GI-FP16-NEXT: mov v22.d[1], v6.d[0]
+; CHECK-GI-FP16-NEXT: mov v23.d[1], v5.d[0]
; CHECK-GI-FP16-NEXT: stp q1, q19, [x8, #96]
; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v2.2d
; CHECK-GI-FP16-NEXT: fcvtzs v2.2d, v3.2d
-; CHECK-GI-FP16-NEXT: stp q20, q0, [x8, #128]
-; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v16.2d
+; CHECK-GI-FP16-NEXT: stp q16, q0, [x8, #128]
+; CHECK-GI-FP16-NEXT: fcvtzs v0.2d, v21.2d
; CHECK-GI-FP16-NEXT: fcvtzs v3.2d, v22.2d
; CHECK-GI-FP16-NEXT: stp q4, q1, [x8, #160]
-; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v6.2d
+; CHECK-GI-FP16-NEXT: fcvtzs v1.2d, v23.2d
; CHECK-GI-FP16-NEXT: stp q2, q0, [x8, #192]
; CHECK-GI-FP16-NEXT: stp q3, q1, [x8, #224]
; CHECK-GI-FP16-NEXT: ret
@@ -5645,45 +5645,45 @@ define <32 x i64> @fptou_v32f16_v32i64(<32 x half> %a) {
; CHECK-GI-FP16-NEXT: mov v17.d[1], v23.d[0]
; CHECK-GI-FP16-NEXT: mov v1.d[1], v29.d[0]
; CHECK-GI-FP16-NEXT: mov v19.d[1], v30.d[0]
-; CHECK-GI-FP16-NEXT: mov h21, v3.h[1]
+; CHECK-GI-FP16-NEXT: mov h16, v3.h[1]
; CHECK-GI-FP16-NEXT: stp q6, q5, [x8, #32]
; CHECK-GI-FP16-NEXT: mov v20.d[1], v22.d[0]
-; CHECK-GI-FP16-NEXT: mov h16, v3.h[2]
+; CHECK-GI-FP16-NEXT: mov h21, v3.h[2]
; CHECK-GI-FP16-NEXT: mov h7, v3.h[3]
; CHECK-GI-FP16-NEXT: mov h22, v3.h[4]
-; CHECK-GI-FP16-NEXT: mov h23, v3.h[5]
-; CHECK-GI-FP16-NEXT: mov h6, v3.h[6]
+; CHECK-GI-FP16-NEXT: mov h6, v3.h[5]
+; CHECK-GI-FP16-NEXT: mov h23, v3.h[6]
; CHECK-GI-FP16-NEXT: mov h5, v3.h[7]
; CHECK-GI-FP16-NEXT: mov v18.d[1], v24.d[0]
; CHECK-GI-FP16-NEXT: mov v2.d[1], v25.d[0]
; CHECK-GI-FP16-NEXT: fcvt d3, h3
-; CHECK-GI-FP16-NEXT: fcvt d21, h21
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
; CHECK-GI-FP16-NEXT: fcvt d16, h16
+; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v0.2d
+; CHECK-GI-FP16-NEXT: fcvt d21, h21
; CHECK-GI-FP16-NEXT: fcvtzu v4.2d, v4.2d
; CHECK-GI-FP16-NEXT: fcvt d7, h7
; CHECK-GI-FP16-NEXT: fcvt d22, h22
-; CHECK-GI-FP16-NEXT: fcvt d23, h23
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v1.2d
; CHECK-GI-FP16-NEXT: fcvt d6, h6
+; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v1.2d
+; CHECK-GI-FP16-NEXT: fcvt d23, h23
; CHECK-GI-FP16-NEXT: fcvt d5, h5
; CHECK-GI-FP16-NEXT: fcvtzu v19.2d, v19.2d
-; CHECK-GI-FP16-NEXT: mov v3.d[1], v21.d[0]
-; CHECK-GI-FP16-NEXT: fcvtzu v20.2d, v20.2d
+; CHECK-GI-FP16-NEXT: mov v3.d[1], v16.d[0]
+; CHECK-GI-FP16-NEXT: fcvtzu v16.2d, v20.2d
; CHECK-GI-FP16-NEXT: stp q0, q4, [x8, #64]
; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v17.2d
; CHECK-GI-FP16-NEXT: fcvtzu v4.2d, v18.2d
-; CHECK-GI-FP16-NEXT: mov v16.d[1], v7.d[0]
-; CHECK-GI-FP16-NEXT: mov v22.d[1], v23.d[0]
-; CHECK-GI-FP16-NEXT: mov v6.d[1], v5.d[0]
+; CHECK-GI-FP16-NEXT: mov v21.d[1], v7.d[0]
+; CHECK-GI-FP16-NEXT: mov v22.d[1], v6.d[0]
+; CHECK-GI-FP16-NEXT: mov v23.d[1], v5.d[0]
; CHECK-GI-FP16-NEXT: stp q1, q19, [x8, #96]
; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v2.2d
; CHECK-GI-FP16-NEXT: fcvtzu v2.2d, v3.2d
-; CHECK-GI-FP16-NEXT: stp q20, q0, [x8, #128]
-; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v16.2d
+; CHECK-GI-FP16-NEXT: stp q16, q0, [x8, #128]
+; CHECK-GI-FP16-NEXT: fcvtzu v0.2d, v21.2d
; CHECK-GI-FP16-NEXT: fcvtzu v3.2d, v22.2d
; CHECK-GI-FP16-NEXT: stp q4, q1, [x8, #160]
-; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v6.2d
+; CHECK-GI-FP16-NEXT: fcvtzu v1.2d, v23.2d
; CHECK-GI-FP16-NEXT: stp q2, q0, [x8, #192]
; CHECK-GI-FP16-NEXT: stp q3, q1, [x8, #224]
; CHECK-GI-FP16-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index f2c4e976b8c16..b1b5154a57b4d 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -3521,31 +3521,31 @@ define <8 x i100> @test_unsigned_v8f16_v8i100(<8 x half> %f) {
; CHECK-NEXT: fmov s0, s8
; CHECK-NEXT: bl __fixunssfti
; CHECK-NEXT: extr x8, x21, x27, #28
-; CHECK-NEXT: extr x9, x29, x20, #28
+; CHECK-NEXT: str x24, [x19]
+; CHECK-NEXT: bfi x22, x20, #36, #28
; CHECK-NEXT: stur x28, [x19, #75]
+; CHECK-NEXT: extr x9, x29, x20, #28
; CHECK-NEXT: fcmp s8, #0.0
-; CHECK-NEXT: bfi x22, x20, #36, #28
-; CHECK-NEXT: lsr x11, x29, #28
; CHECK-NEXT: stur x8, [x19, #41]
-; CHECK-NEXT: str x9, [x19, #16]
-; CHECK-NEXT: ldr x10, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x11, [sp, #32] // 8-byte Folded Reload
+; CHECK-NEXT: stp x22, x9, [x19, #8]
+; CHECK-NEXT: lsr x9, x29, #28
; CHECK-NEXT: csel x8, xzr, x0, lt
-; CHECK-NEXT: csel x9, xzr, x1, lt
+; CHECK-NEXT: csel x10, xzr, x1, lt
; CHECK-NEXT: fcmp s8, s9
-; CHECK-NEXT: stp x24, x22, [x19]
-; CHECK-NEXT: stur x10, [x19, #50]
-; CHECK-NEXT: lsr x10, x21, #28
-; CHECK-NEX...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/139513
More information about the llvm-commits
mailing list