[llvm] Revert "[DAGCombiner] Relax condition for extract_vector_elt combine" (PR #157953)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 13:46:47 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-webassembly
Author: Arthur Eubanks (aeubanks)
<details>
<summary>Changes</summary>
Reverts llvm/llvm-project#157658
The reverted change causes hangs; see https://github.com/llvm/llvm-project/pull/157658#issuecomment-3276441812
---
The patch is 91.00 KiB and is truncated to 20.00 KiB below; the full version is available at https://github.com/llvm/llvm-project/pull/157953.diff
23 Files Affected:
- (modified) llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (+2-1)
- (modified) llvm/test/CodeGen/AArch64/shufflevector.ll (+6-5)
- (modified) llvm/test/CodeGen/Thumb2/active_lane_mask.ll (+6-4)
- (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll (+17-12)
- (modified) llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll (+17-12)
- (modified) llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll (+9-6)
- (modified) llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll (+9-6)
- (modified) llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll (+139-108)
- (modified) llvm/test/CodeGen/Thumb2/mve-laneinterleaving.ll (+128-108)
- (modified) llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll (+30-22)
- (modified) llvm/test/CodeGen/Thumb2/mve-sext-masked-load.ll (+18-13)
- (modified) llvm/test/CodeGen/Thumb2/mve-vabdus.ll (+41-31)
- (modified) llvm/test/CodeGen/Thumb2/mve-vld2.ll (+53-33)
- (modified) llvm/test/CodeGen/Thumb2/mve-vld3.ll (+208-97)
- (modified) llvm/test/CodeGen/Thumb2/mve-vld4-post.ll (+33-22)
- (modified) llvm/test/CodeGen/Thumb2/mve-vld4.ll (+128-87)
- (modified) llvm/test/CodeGen/Thumb2/mve-vmull-splat.ll (+95-77)
- (modified) llvm/test/CodeGen/Thumb2/mve-vst3.ll (+10-10)
- (modified) llvm/test/CodeGen/WebAssembly/vector-reduce.ll (+24-22)
- (modified) llvm/test/CodeGen/X86/avx512fp16-mov.ll (+29-25)
- (modified) llvm/test/CodeGen/X86/test-shrink-bug.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vec_smulo.ll (+2-2)
- (modified) llvm/test/CodeGen/X86/vec_umulo.ll (+2-2)
``````````diff
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 97a3d36a67103..d130efe96b56b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23933,7 +23933,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// scalar_to_vector here as well.
if (!LegalOperations ||
- TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
+ // FIXME: Should really be just isOperationLegalOrCustom.
+ TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
DAG.getVectorIdxConstant(OrigElt, DL));
diff --git a/llvm/test/CodeGen/AArch64/shufflevector.ll b/llvm/test/CodeGen/AArch64/shufflevector.ll
index b47c077ccf1c5..9fd5e65086782 100644
--- a/llvm/test/CodeGen/AArch64/shufflevector.ll
+++ b/llvm/test/CodeGen/AArch64/shufflevector.ll
@@ -286,11 +286,10 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: str h1, [sp, #14]
-; CHECK-SD-NEXT: mov s0, v0.s[1]
+; CHECK-SD-NEXT: ext v0.8b, v0.8b, v1.8b, #4
+; CHECK-SD-NEXT: mov s1, v0.s[1]
; CHECK-SD-NEXT: str h0, [sp, #12]
+; CHECK-SD-NEXT: str h1, [sp, #14]
; CHECK-SD-NEXT: ldr w0, [sp, #12]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
@@ -492,8 +491,10 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: str h0, [sp, #14]
+; CHECK-SD-NEXT: dup v1.2s, v0.s[0]
; CHECK-SD-NEXT: str h0, [sp, #12]
+; CHECK-SD-NEXT: mov s1, v1.s[1]
+; CHECK-SD-NEXT: str h1, [sp, #14]
; CHECK-SD-NEXT: ldr w0, [sp, #12]
; CHECK-SD-NEXT: add sp, sp, #16
; CHECK-SD-NEXT: ret
diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
index cae8d6e3deaeb..bcd92f81911b2 100644
--- a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
+++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll
@@ -107,7 +107,6 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: ldr r2, [sp, #48]
-; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vqadd.u32 q0, q0, r1
; CHECK-NEXT: ldr r1, [sp, #52]
; CHECK-NEXT: vcmp.u32 hi, q3, q0
@@ -120,9 +119,12 @@ define <7 x i32> @v7i32(i32 %index, i32 %TC, <7 x i32> %V1, <7 x i32> %V2) {
; CHECK-NEXT: ldr r1, [sp, #24]
; CHECK-NEXT: vmov q1[2], q1[0], r2, r1
; CHECK-NEXT: vpsel q0, q1, q0
-; CHECK-NEXT: vmov r1, r2, d0
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: stm r0!, {r1, r2, r3}
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov.f32 s2, s1
+; CHECK-NEXT: vmov r3, s0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: strd r3, r2, [r0, #16]
+; CHECK-NEXT: str r1, [r0, #24]
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
index de508e67a7a77..37f6bbeffd027 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i16-add.ll
@@ -31,19 +31,24 @@ entry:
define arm_aapcs_vfpcc <4 x i16> @complex_add_v4i16(<4 x i16> %a, <4 x i16> %b) {
; CHECK-LABEL: complex_add_v4i16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vmov r12, r1, d1
-; CHECK-NEXT: vmov r2, lr, d3
-; CHECK-NEXT: vmov r3, r4, d2
+; CHECK-NEXT: vrev64.32 q2, q0
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vrev64.32 q3, q1
+; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: vmov r2, r0, d0
-; CHECK-NEXT: subs r0, r3, r0
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT: add.w r0, lr, r12
-; CHECK-NEXT: adds r1, r4, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: add r1, r2
+; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: bx lr
entry:
%a.real = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2>
%a.imag = shufflevector <4 x i16> %a, <4 x i16> zeroinitializer, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
index e11b3c773adf6..794894def9265 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-i8-add.ll
@@ -31,19 +31,24 @@ entry:
define arm_aapcs_vfpcc <4 x i8> @complex_add_v4i8(<4 x i8> %a, <4 x i8> %b) {
; CHECK-LABEL: complex_add_v4i8:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: vmov r12, r1, d1
-; CHECK-NEXT: vmov r2, lr, d3
-; CHECK-NEXT: vmov r3, r4, d2
+; CHECK-NEXT: vrev64.32 q2, q0
+; CHECK-NEXT: vmov r1, s6
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vrev64.32 q3, q1
+; CHECK-NEXT: vmov r2, s4
+; CHECK-NEXT: subs r0, r1, r0
+; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: subs r1, r2, r1
-; CHECK-NEXT: vmov r2, r0, d0
-; CHECK-NEXT: subs r0, r3, r0
-; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
-; CHECK-NEXT: add.w r0, lr, r12
-; CHECK-NEXT: adds r1, r4, r2
-; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
+; CHECK-NEXT: vmov r0, s14
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: add r0, r1
+; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: add r1, r2
+; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: bx lr
entry:
%a.real = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> <i32 0, i32 2>
%a.imag = shufflevector <4 x i8> %a, <4 x i8> zeroinitializer, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
index d535c64289d4f..77548b49d77f2 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptosi-sat-vector.ll
@@ -185,10 +185,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_signed_v6f32_v6i32(<6 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q0, q0
-; CHECK-MVEFP-NEXT: vmov r1, r2, d2
-; CHECK-MVEFP-NEXT: str r2, [r0, #20]
+; CHECK-MVEFP-NEXT: vmov.f32 s6, s5
+; CHECK-MVEFP-NEXT: vmov r2, s4
+; CHECK-MVEFP-NEXT: vmov r1, s6
+; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <6 x i32> @llvm.fptosi.sat.v6f32.v6i32(<6 x float> %f)
ret <6 x i32> %x
@@ -220,11 +221,13 @@ define arm_aapcs_vfpcc <7 x i32> @test_signed_v7f32_v7i32(<7 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.s32.f32 q0, q0
+; CHECK-MVEFP-NEXT: vmov.f32 s10, s5
+; CHECK-MVEFP-NEXT: vmov r2, s4
; CHECK-MVEFP-NEXT: vmov r3, s6
-; CHECK-MVEFP-NEXT: vmov r1, r2, d2
-; CHECK-MVEFP-NEXT: strd r2, r3, [r0, #20]
+; CHECK-MVEFP-NEXT: vmov r1, s10
+; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
+; CHECK-MVEFP-NEXT: str r3, [r0, #24]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <7 x i32> @llvm.fptosi.sat.v7f32.v7i32(<7 x float> %f)
ret <7 x i32> %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
index 61f05347d511d..ee040feca4240 100644
--- a/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-fptoui-sat-vector.ll
@@ -172,10 +172,11 @@ define arm_aapcs_vfpcc <6 x i32> @test_unsigned_v6f32_v6i32(<6 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q0, q0
-; CHECK-MVEFP-NEXT: vmov r1, r2, d2
-; CHECK-MVEFP-NEXT: str r2, [r0, #20]
+; CHECK-MVEFP-NEXT: vmov.f32 s6, s5
+; CHECK-MVEFP-NEXT: vmov r2, s4
+; CHECK-MVEFP-NEXT: vmov r1, s6
+; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <6 x i32> @llvm.fptoui.sat.v6f32.v6i32(<6 x float> %f)
ret <6 x i32> %x
@@ -207,11 +208,13 @@ define arm_aapcs_vfpcc <7 x i32> @test_unsigned_v7f32_v7i32(<7 x float> %f) {
; CHECK-MVEFP: @ %bb.0:
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q1, q1
; CHECK-MVEFP-NEXT: vcvt.u32.f32 q0, q0
+; CHECK-MVEFP-NEXT: vmov.f32 s10, s5
+; CHECK-MVEFP-NEXT: vmov r2, s4
; CHECK-MVEFP-NEXT: vmov r3, s6
-; CHECK-MVEFP-NEXT: vmov r1, r2, d2
-; CHECK-MVEFP-NEXT: strd r2, r3, [r0, #20]
+; CHECK-MVEFP-NEXT: vmov r1, s10
+; CHECK-MVEFP-NEXT: strd r2, r1, [r0, #16]
+; CHECK-MVEFP-NEXT: str r3, [r0, #24]
; CHECK-MVEFP-NEXT: vstrw.32 q0, [r0]
-; CHECK-MVEFP-NEXT: str r1, [r0, #16]
; CHECK-MVEFP-NEXT: bx lr
%x = call <7 x i32> @llvm.fptoui.sat.v7f32.v7i32(<7 x float> %f)
ret <7 x i32> %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
index 0f71653afa408..7be08b04c5957 100644
--- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-cost.ll
@@ -4,45 +4,54 @@
define arm_aapcs_vfpcc <4 x i32> @loads_i32(ptr %A, ptr %B, ptr %C) {
; CHECK-LABEL: loads_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: vldrw.u32 q3, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov.i64 q2, #0xffffffff
-; CHECK-NEXT: vmov.f32 s0, s12
-; CHECK-NEXT: vmov.f32 s2, s13
-; CHECK-NEXT: vmov lr, r0, d2
-; CHECK-NEXT: vand q0, q0, q2
-; CHECK-NEXT: vmov r1, r5, d1
-; CHECK-NEXT: vmov.f32 s12, s14
-; CHECK-NEXT: vmov.f32 s14, s15
-; CHECK-NEXT: vand q2, q3, q2
-; CHECK-NEXT: vmov r4, r3, d5
-; CHECK-NEXT: asrs r6, r0, #31
-; CHECK-NEXT: adds.w r12, r0, r1
-; CHECK-NEXT: adc.w r1, r6, r5
-; CHECK-NEXT: vmov r6, r5, d3
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vldrw.u32 q2, [r1]
+; CHECK-NEXT: vmov.i64 q1, #0xffffffff
+; CHECK-NEXT: vmov.f32 s0, s10
+; CHECK-NEXT: vmov.f32 s2, s11
+; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vmov.f32 s10, s9
+; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vmov r4, r5, d1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r2]
-; CHECK-NEXT: vmov r2, r8, d3
-; CHECK-NEXT: adds r0, r5, r4
-; CHECK-NEXT: asr.w r4, r5, #31
-; CHECK-NEXT: adc.w r5, r4, r3
-; CHECK-NEXT: vmov r4, r7, d4
-; CHECK-NEXT: asrs r3, r6, #31
-; CHECK-NEXT: asrl r0, r5, r8
-; CHECK-NEXT: adds r4, r4, r6
-; CHECK-NEXT: adcs r3, r7
-; CHECK-NEXT: asrl r4, r3, r2
-; CHECK-NEXT: asr.w r2, lr, #31
-; CHECK-NEXT: vmov r3, r7, d0
-; CHECK-NEXT: adds.w r6, lr, r3
-; CHECK-NEXT: adc.w r3, r2, r7
-; CHECK-NEXT: vmov r2, r7, d2
-; CHECK-NEXT: asrl r6, r3, r2
-; CHECK-NEXT: asrl r12, r1, r7
-; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
-; CHECK-NEXT: vmov q0[3], q0[1], r12, r0
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
+; CHECK-NEXT: vmov lr, r12, d5
+; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vmov r0, s12
+; CHECK-NEXT: vmov.f32 s12, s6
+; CHECK-NEXT: vmov.f32 s6, s7
+; CHECK-NEXT: asrs r2, r0, #31
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: adc.w r1, r2, r3
+; CHECK-NEXT: vmov r2, s12
+; CHECK-NEXT: asrl r0, r1, r2
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov.f32 s2, s1
+; CHECK-NEXT: adds r2, r1, r4
+; CHECK-NEXT: asr.w r3, r1, #31
+; CHECK-NEXT: adc.w r1, r3, r5
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: asrl r2, r1, r3
+; CHECK-NEXT: vmov r4, r5, d4
+; CHECK-NEXT: vmov r1, s2
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: adds.w r6, r1, lr
+; CHECK-NEXT: asr.w r3, r1, #31
+; CHECK-NEXT: adc.w r1, r3, r12
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: asrl r6, r1, r3
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: adds r4, r4, r1
+; CHECK-NEXT: asr.w r3, r1, #31
+; CHECK-NEXT: adc.w r1, r3, r5
+; CHECK-NEXT: vmov r3, s4
+; CHECK-NEXT: asrl r4, r1, r3
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
+; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
%b = load <4 x i32>, ptr %B, align 4
@@ -127,42 +136,55 @@ define arm_aapcs_vfpcc void @load_store_i32(ptr %A, ptr %B, ptr %C, ptr %D) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vmov.i64 q0, #0xffffffff
; CHECK-NEXT: vmov.f32 s8, s6
; CHECK-NEXT: vmov.f32 s10, s7
-; CHECK-NEXT: vand q2, q2, q0
-; CHECK-NEXT: vmov r5, r0, d7
-; CHECK-NEXT: vmov r1, r7, d5
-; CHECK-NEXT: vmov r12, lr, d4
-; CHECK-NEXT: vldrw.u32 q2, [r2]
; CHECK-NEXT: vmov.f32 s6, s5
-; CHECK-NEXT: vand q0, q1, q0
-; CHECK-NEXT: adds.w r8, r0, r1
+; CHECK-NEXT: vand q4, q2, q0
+; CHECK-NEXT: vand q2, q1, q0
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r4, r5, d9
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vmov.f32 s12, s2
+; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vmov lr, r12, d8
+; CHECK-NEXT: vmov.f32 s16, s6
+; CHECK-NEXT: vmov.f32 s6, s7
+; CHECK-NEXT: vmov r6, r1, d5
+; CHECK-NEXT: vmov.f32 s10, s1
+; CHECK-NEXT: vmov r0, s2
+; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: adds.w r8, r0, r4
; CHECK-NEXT: asr.w r2, r0, #31
-; CHECK-NEXT: adcs r7, r2
-; CHECK-NEXT: asrs r4, r5, #31
-; CHECK-NEXT: adds.w r2, r5, r12
-; CHECK-NEXT: vmov r6, r1, d6
-; CHECK-NEXT: adc.w r5, r4, lr
-; CHECK-NEXT: vmov r4, r12, d5
-; CHECK-NEXT: asrl r2, r5, r4
-; CHECK-NEXT: asrl r8, r7, r12
-; CHECK-NEXT: vmov r5, r4, d0
-; CHECK-NEXT: asrs r7, r1, #31
-; CHECK-NEXT: adds r0, r6, r5
-; CHECK-NEXT: asr.w r6, r6, #31
-; CHECK-NEXT: adc.w r5, r6, r4
-; CHECK-NEXT: vmov r6, r4, d4
-; CHECK-NEXT: asrl r0, r5, r6
-; CHECK-NEXT: vmov q1[2], q1[0], r0, r2
-; CHECK-NEXT: vmov r0, r2, d1
-; CHECK-NEXT: adds r0, r0, r1
-; CHECK-NEXT: adc.w r1, r7, r2
-; CHECK-NEXT: asrl r0, r1, r4
-; CHECK-NEXT: vmov q1[3], q1[1], r0, r8
-; CHECK-NEXT: vstrw.32 q1, [r3]
+; CHECK-NEXT: adcs r5, r2
+; CHECK-NEXT: vmov r2, s6
+; CHECK-NEXT: asrl r8, r5, r2
+; CHECK-NEXT: vmov r2, s10
+; CHECK-NEXT: vmov r5, r7, d4
+; CHECK-NEXT: asrs r4, r2, #31
+; CHECK-NEXT: adds r2, r2, r6
+; CHECK-NEXT: adcs r1, r4
+; CHECK-NEXT: vmov r4, s2
+; CHECK-NEXT: asrl r2, r1, r4
+; CHECK-NEXT: vmov r1, s12
+; CHECK-NEXT: adds.w r6, r1, lr
+; CHECK-NEXT: asr.w r4, r1, #31
+; CHECK-NEXT: adc.w r1, r4, r12
+; CHECK-NEXT: vmov r4, s16
+; CHECK-NEXT: asrl r6, r1, r4
+; CHECK-NEXT: vmov r1, s0
+; CHECK-NEXT: adds r0, r1, r5
+; CHECK-NEXT: asr.w r4, r1, #31
+; CHECK-NEXT: adc.w r1, r4, r7
+; CHECK-NEXT: vmov r7, s4
+; CHECK-NEXT: asrl r0, r1, r7
+; CHECK-NEXT: vmov q0[2], q0[0], r0, r6
+; CHECK-NEXT: vmov q0[3], q0[1], r2, r8
+; CHECK-NEXT: vstrw.32 q0, [r3]
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
@@ -246,31 +268,36 @@ entry:
define arm_aapcs_vfpcc void @load_one_store_i32(ptr %A, ptr %D) {
; CHECK-LABEL: load_one_store_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r9, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r9, lr}
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r2, r3, d1
-; CHECK-NEXT: vmov r5, r0, d0
-; CHECK-NEXT: adds r6, r3, r3
-; CHECK-NEXT: asr.w r12, r3, #31
-; CHECK-NEXT: adc.w r9, r12, r3, asr #31
-; CHECK-NEXT: adds r4, r2, r2
-; CHECK-NEXT: asr.w r12, r2, #31
-; CHECK-NEXT: adc.w r7, r12, r2, asr #31
-; CHECK-NEXT: asrl r6, r9, r3
-; CHECK-NEXT: asrl r4, r7, r2
-; CHECK-NEXT: adds r2, r5, r5
-; CHECK-NEXT: asr.w r7, r5, #31
-; CHECK-NEXT: adc.w r7, r7, r5, asr #31
-; CHECK-NEXT: asrl r2, r7, r5
-; CHECK-NEXT: vmov q0[2], q0[0], r2, r4
+; CHECK-NEXT: vmov.f32 s4, s2
+; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov.f32 s2, s1
+; CHECK-NEXT: adds.w r12, r2, r2
+; CHECK-NEXT: asr.w r3, r2, #31
+; CHECK-NEXT: adc.w r3, r3, r2, asr #31
+; CHECK-NEXT: asrl r12, r3, r2
+; CHECK-NEXT: vmov r3, s2
+; CHECK-NEXT: adds r2, r3, r3
+; CHECK-NEXT: asr.w r0, r3, #31
+; CHECK-NEXT: adc.w r5, r0, r3, asr #31
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: asrl r2, r5, r3
; CHECK-NEXT: adds r4, r0, r0
-; CHECK-NEXT: asr.w r2, r0, #31
-; CHECK-NEXT: adc.w r3, r2, r0, asr #31
+; CHECK-NEXT: asr.w r3, r0, #31
+; CHECK-NEXT: adc.w r3, r3, r0, asr #31
; CHECK-NEXT: asrl r4, r3, r0
-; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: adds r6, r0, r0
+; CHECK-NEXT: asr.w r3, r0, #31
+; CHECK-NEXT: adc.w r3, r3, r0, asr #31
+; CHECK-NEXT: asrl r6, r3, r0
+; CHECK-NEXT: vmov q0[2], q0[0], r6, r4
+; CHECK-NEXT: vmov q0[3], q0[1], r2, r12
; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r9, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%a = load <4 x i32>, ptr %A, align 4
%sa = sext <4 x i32> %a to <4 x i64>
@@ -333,30 +360,34 @@ entry:
define arm_aapcs_vfpcc void @mul_i32(ptr %A, ptr %B, i64 %C, ptr %D) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
-; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
+; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr.w r12, [sp, #24]
-; CHECK-NEXT: vmov r3, lr, d0
-; CHECK-NEXT: vmov r0, r1, d2
-; CHECK-NEXT: vmov.f32 s0, s2
-; CHECK-NEXT: vmov.f32 s2, s3
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: ldr.w lr, [sp, #20]
+; CHECK-NEXT: vmov.f32 s10, s1
+; CHECK-NEXT: vmov.f32 s14, s5
+; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: vmov.f32 s4, s6
; CHECK-NEXT: vmov.f32 s6, s7
+; CHECK-NEXT: vmov r0, s10
+; CHECK-NEXT: vmov r1, s14
+; CHECK-NEXT: smull r12, r3, r1, r0
+; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vmov.f32 s0, s2
+; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vmullb.s32 q2, q1, q0
-; CHECK-NEXT: vmov r4, r5, d5
-; CHECK-NEXT: asrl r4, r5, r2
-; CHECK-NEXT: smull r8, r3, r0, r3
-; CHECK-NEXT: vmov r0, r7, d4
-; CHECK-NEXT: asrl r0, r7, r2
-; CHECK-NEXT: smull r6, r1, r1, lr
-; CHECK-NEXT: asrl r8, r3, r2
-; CHECK-NEXT: vmov q0[2], q0[0], r8, r0
+; CHECK-NEXT: asrl r12, r3, r2
+; CHECK-NEXT: vmov r6, r1, d4
+; CHECK-NEXT:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/157953
More information about the llvm-commits mailing list