[llvm] b7d3a2b - [ARM] Mark i64 and f64 shuffles as Custom for MVE
David Green via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 6 08:17:11 PST 2022
Author: David Green
Date: 2022-02-06T16:17:06Z
New Revision: b7d3a2b62f4d3cea9ec7baf1004ac2f68a0bca98
URL: https://github.com/llvm/llvm-project/commit/b7d3a2b62f4d3cea9ec7baf1004ac2f68a0bca98
DIFF: https://github.com/llvm/llvm-project/commit/b7d3a2b62f4d3cea9ec7baf1004ac2f68a0bca98.diff
LOG: [ARM] Mark i64 and f64 shuffles as Custom for MVE
This way they get lowered through ARMISD::BUILD_VECTOR, which can
produce more efficient D-register moves.
This also helps D115653 avoid getting stuck in a loop.
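As an illustration (not part of the commit), the kind of shuffle this
change affects can be reduced to a single IR function. The llc flags in
the comment are an assumption modelled on the RUN lines of the MVE
tests, and the function name is made up for the example:

; llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -o -
define arm_aapcs_vfpcc <2 x double> @concat_lo_f64(<2 x double> %a, <2 x double> %b) {
entry:
  ; Takes lane 0 of each source. With ISD::VECTOR_SHUFFLE marked Custom,
  ; this can be lowered through an ARMISD::BUILD_VECTOR of two f64
  ; elements, i.e. whole d-register moves rather than per-lane
  ; s-register moves.
  %out = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %out
}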
Added:
Modified:
llvm/lib/Target/ARM/ARMISelLowering.cpp
llvm/test/CodeGen/Thumb2/mve-shuffle.ll
llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
llvm/test/CodeGen/Thumb2/mve-vst2.ll
llvm/test/CodeGen/Thumb2/mve-vst3.ll
llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
llvm/test/CodeGen/Thumb2/mve-vst4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 32cb88bbab625..ce62979de13fd 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -392,6 +392,7 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Legal);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
index e7e8382336ce4..92ed9280bc8b4 100644
--- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll
@@ -1474,6 +1474,189 @@ entry:
ret <2 x double> %out
}
+define arm_aapcs_vfpcc <4 x double> @shuffle4_f64(<2 x double> %src1, <2 x double> %src2) {
+; CHECK-LABEL: shuffle4_f64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vmov.f32 s9, s7
+; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmov.f32 s11, s3
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ ret <4 x double> %out
+}
+define arm_aapcs_vfpcc <4 x double> @shuffle5_f64(<2 x double> %src1, <2 x double> %src2) {
+; CHECK-LABEL: shuffle5_f64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s10, s4
+; CHECK-NEXT: vmov.f32 s4, s2
+; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vmov.f32 s9, s7
+; CHECK-NEXT: vmov.f32 s11, s5
+; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x double> %src1, <2 x double> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x double> %out
+}
+define arm_aapcs_vfpcc <2 x double> @shuffle6_f64(<2 x double> %src1, <2 x double> %src2) {
+; CHECK-LABEL: shuffle6_f64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %out
+}
+define arm_aapcs_vfpcc <2 x double> @shuffle7_f64(<2 x double> %src1, <2 x double> %src2) {
+; CHECK-LABEL: shuffle7_f64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vmov.f32 s1, s7
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 3, i32 1>
+ ret <2 x double> %out
+}
+define arm_aapcs_vfpcc <2 x double> @shuffle8_f64(<2 x double> %src1, <2 x double> %src2) {
+; CHECK-LABEL: shuffle8_f64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x double> %src1, <2 x double> %src2, <2 x i32> <i32 2, i32 1>
+ ret <2 x double> %out
+}
+define arm_aapcs_vfpcc <8 x double> @shuffle9_f64(<4 x double> %src1, <4 x double> %src2) {
+; CHECK-LABEL: shuffle9_f64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vmov q5, q2
+; CHECK-NEXT: vmov.f32 s16, s0
+; CHECK-NEXT: vmov.f32 s18, s20
+; CHECK-NEXT: vmov.f32 s20, s2
+; CHECK-NEXT: vmov.f32 s10, s12
+; CHECK-NEXT: vmov.f32 s19, s21
+; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov.f32 s17, s1
+; CHECK-NEXT: vmov.f32 s21, s3
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov.f32 s12, s6
+; CHECK-NEXT: vmov.f32 s11, s13
+; CHECK-NEXT: vmov.f32 s9, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x double> %src1, <4 x double> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x double> %out
+}
+
+
+
+
+define arm_aapcs_vfpcc <4 x i64> @shuffle4_i64(<2 x i64> %src1, <2 x i64> %src2) {
+; CHECK-LABEL: shuffle4_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vmov.f32 s9, s7
+; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmov.f32 s11, s3
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
+ ret <4 x i64> %out
+}
+define arm_aapcs_vfpcc <4 x i64> @shuffle5_i64(<2 x i64> %src1, <2 x i64> %src2) {
+; CHECK-LABEL: shuffle5_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s8, s6
+; CHECK-NEXT: vmov.f32 s10, s4
+; CHECK-NEXT: vmov.f32 s4, s2
+; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vmov.f32 s9, s7
+; CHECK-NEXT: vmov.f32 s11, s5
+; CHECK-NEXT: vmov.f32 s5, s3
+; CHECK-NEXT: vmov.f32 s7, s1
+; CHECK-NEXT: vmov q0, q2
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+ ret <4 x i64> %out
+}
+define arm_aapcs_vfpcc <2 x i64> @shuffle6_i64(<2 x i64> %src1, <2 x i64> %src2) {
+; CHECK-LABEL: shuffle6_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s2, s6
+; CHECK-NEXT: vmov.f32 s3, s7
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 0, i32 3>
+ ret <2 x i64> %out
+}
+define arm_aapcs_vfpcc <2 x i64> @shuffle7_i64(<2 x i64> %src1, <2 x i64> %src2) {
+; CHECK-LABEL: shuffle7_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vmov.f32 s1, s7
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 3, i32 1>
+ ret <2 x i64> %out
+}
+define arm_aapcs_vfpcc <2 x i64> @shuffle8_i64(<2 x i64> %src1, <2 x i64> %src2) {
+; CHECK-LABEL: shuffle8_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov.f32 s6, s2
+; CHECK-NEXT: vmov.f32 s7, s3
+; CHECK-NEXT: vmov q0, q1
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> <i32 2, i32 1>
+ ret <2 x i64> %out
+}
+define arm_aapcs_vfpcc <8 x i64> @shuffle9_i64(<4 x i64> %src1, <4 x i64> %src2) {
+; CHECK-LABEL: shuffle9_i64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .vsave {d8, d9, d10, d11}
+; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: vmov q5, q2
+; CHECK-NEXT: vmov.f32 s16, s0
+; CHECK-NEXT: vmov.f32 s18, s20
+; CHECK-NEXT: vmov.f32 s20, s2
+; CHECK-NEXT: vmov.f32 s10, s12
+; CHECK-NEXT: vmov.f32 s19, s21
+; CHECK-NEXT: vmov.f32 s8, s4
+; CHECK-NEXT: vmov.f32 s17, s1
+; CHECK-NEXT: vmov.f32 s21, s3
+; CHECK-NEXT: vmov q0, q4
+; CHECK-NEXT: vmov.f32 s12, s6
+; CHECK-NEXT: vmov.f32 s11, s13
+; CHECK-NEXT: vmov.f32 s9, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov q1, q5
+; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: bx lr
+entry:
+ %out = shufflevector <4 x i64> %src1, <4 x i64> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+ ret <8 x i64> %out
+}
+
define arm_aapcs_vfpcc <4 x i32> @insert_i32(i32 %a) {
; CHECK-LABEL: insert_i32:
@@ -1548,7 +1731,7 @@ define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
-; CHECK-NEXT: adr r2, .LCPI76_0
+; CHECK-NEXT: adr r2, .LCPI88_0
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: mov r1, sp
@@ -1558,7 +1741,7 @@ define arm_aapcs_vfpcc i64 @scalar_to_vector_i32(<8 x i16> %v) {
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
-; CHECK-NEXT: .LCPI76_0:
+; CHECK-NEXT: .LCPI88_0:
; CHECK-NEXT: .zero 4
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .long 1 @ 0x1
diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
index b8ddde719a67e..690c0179839dd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll
@@ -149,8 +149,8 @@ define arm_aapcs_vfpcc void @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2, <2 x i
; CHECK-LABEL: vmovn64_b2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f32 s4, s6
-; CHECK-NEXT: vmov.f32 s5, s7
; CHECK-NEXT: vmov.f32 s6, s0
+; CHECK-NEXT: vmov.f32 s5, s7
; CHECK-NEXT: vmov.f32 s7, s1
; CHECK-NEXT: vstrw.32 q1, [r0]
; CHECK-NEXT: bx lr
@@ -164,8 +164,8 @@ define arm_aapcs_vfpcc void @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2, <2 x i
; CHECK-LABEL: vmovn64_b3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.f32 s0, s2
-; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s2, s4
+; CHECK-NEXT: vmov.f32 s1, s3
; CHECK-NEXT: vmov.f32 s3, s5
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
index eafbf41bc6241..d482feef98990 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2-post.ll
@@ -72,17 +72,14 @@ entry:
define <4 x i64> *@vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
; CHECK-LABEL: vst2_v2i64:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: add.w r0, r1, #32
-; CHECK-NEXT: vmov.f32 s8, s2
-; CHECK-NEXT: vmov.f32 s9, s3
-; CHECK-NEXT: vmov.f32 s2, s4
-; CHECK-NEXT: vmov.f32 s3, s5
-; CHECK-NEXT: vmov.f32 s10, s6
-; CHECK-NEXT: vstrb.8 q0, [r1], #16
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vstrw.32 q2, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov.f64 d5, d0
+; CHECK-NEXT: vmov.f64 d0, d3
+; CHECK-NEXT: vmov.f64 d4, d2
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vstrw.32 q2, [r1], #32
+; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst2.ll b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
index c749b36416f66..bcddeae5c2a76 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst2.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst2.ll
@@ -327,14 +327,11 @@ define void @vst2_v2i64(<2 x i64> *%src, <4 x i64> *%dst) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s8, s4
-; CHECK-NEXT: vmov.f32 s9, s5
-; CHECK-NEXT: vmov.f32 s0, s6
-; CHECK-NEXT: vstrb.8 q2, [r1], #16
-; CHECK-NEXT: vmov.f32 s1, s7
-; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vmov.f64 d4, d3
+; CHECK-NEXT: vmov.f64 d5, d1
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vstrw.32 q2, [r1, #16]
+; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
@@ -349,29 +346,23 @@ entry:
define void @vst2_v4i64(<4 x i64> *%src, <8 x i64> *%dst) {
; CHECK-LABEL: vst2_v4i64:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov.f32 s12, s2
-; CHECK-NEXT: vmov.f32 s13, s3
-; CHECK-NEXT: vmov.f32 s20, s6
-; CHECK-NEXT: vmov.f32 s21, s7
-; CHECK-NEXT: vmov.f32 s2, s16
-; CHECK-NEXT: vmov.f32 s3, s17
-; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vmov.f32 s7, s9
-; CHECK-NEXT: vmov.f32 s14, s18
-; CHECK-NEXT: vstrb.8 q1, [r1], #48
-; CHECK-NEXT: vmov.f32 s15, s19
-; CHECK-NEXT: vmov.f32 s22, s10
-; CHECK-NEXT: vstrw.32 q3, [r1]
-; CHECK-NEXT: vmov.f32 s23, s11
-; CHECK-NEXT: vstrw.32 q5, [r1, #-32]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
+; CHECK-NEXT: vmov.f64 d8, d4
+; CHECK-NEXT: vmov.f64 d9, d0
+; CHECK-NEXT: vmov.f64 d0, d5
+; CHECK-NEXT: vstrw.32 q4, [r1]
+; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vmov.f64 d4, d6
+; CHECK-NEXT: vmov.f64 d2, d7
+; CHECK-NEXT: vstrw.32 q2, [r1, #32]
+; CHECK-NEXT: vstrw.32 q1, [r1, #48]
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <4 x i64>, <4 x i64>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 7d4763fdeb03a..d3e042a601205 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -916,20 +916,16 @@ entry:
define void @vst3_v2i64(<2 x i64> *%src, <6 x i64> *%dst) {
; CHECK-LABEL: vst3_v2i64:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
; CHECK-NEXT: vldrw.u32 q2, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov.f32 s14, s2
-; CHECK-NEXT: vmov.f32 s15, s3
-; CHECK-NEXT: vmov.f32 s2, s6
-; CHECK-NEXT: vmov.f32 s3, s7
-; CHECK-NEXT: vmov.f32 s6, s8
-; CHECK-NEXT: vmov.f32 s7, s9
-; CHECK-NEXT: vstrb.8 q1, [r1], #32
-; CHECK-NEXT: vmov.f32 s12, s10
-; CHECK-NEXT: vmov.f32 s13, s11
-; CHECK-NEXT: vstrw.32 q0, [r1, #-16]
-; CHECK-NEXT: vstrw.32 q3, [r1]
+; CHECK-NEXT: vmov.f64 d6, d2
+; CHECK-NEXT: vmov.f64 d7, d1
+; CHECK-NEXT: vmov.f64 d1, d4
+; CHECK-NEXT: vstrw.32 q3, [r1, #16]
+; CHECK-NEXT: vmov.f64 d2, d5
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: vstrw.32 q1, [r1, #32]
; CHECK-NEXT: bx lr
entry:
%s1 = getelementptr <2 x i64>, <2 x i64>* %src, i32 0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
index ee1fe9e69c255..5fe7f2f3d7d2f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4-post.ll
@@ -105,25 +105,19 @@ define <8 x i64> *@vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q4, [r0]
-; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #32]
-; CHECK-NEXT: vmov.f32 s8, s16
-; CHECK-NEXT: vmov.f32 s9, s17
-; CHECK-NEXT: vmov.f32 s10, s0
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s0, s18
-; CHECK-NEXT: vmov.f32 s1, s19
-; CHECK-NEXT: vmov.f32 s18, s4
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT: vmov.f64 d2, d6
+; CHECK-NEXT: vmov.f64 d3, d0
+; CHECK-NEXT: vmov.f64 d0, d7
+; CHECK-NEXT: vmov.f64 d7, d4
; CHECK-NEXT: vstrw.32 q0, [r1, #32]
-; CHECK-NEXT: vmov.f32 s19, s5
-; CHECK-NEXT: vmov.f32 s16, s12
-; CHECK-NEXT: vmov.f32 s17, s13
-; CHECK-NEXT: vmov.f32 s4, s14
-; CHECK-NEXT: vstrw.32 q4, [r1, #16]
-; CHECK-NEXT: vmov.f32 s5, s15
-; CHECK-NEXT: vstrw.32 q1, [r1, #48]
-; CHECK-NEXT: vstrw.32 q2, [r1], #64
+; CHECK-NEXT: vmov.f64 d6, d8
+; CHECK-NEXT: vmov.f64 d4, d9
+; CHECK-NEXT: vstrw.32 q3, [r1, #16]
+; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vstrw.32 q1, [r1], #64
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index db4a438ae076a..b76a97d0246bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -719,24 +719,18 @@ define void @vst4_v2i64(<2 x i64> *%src, <8 x i64> *%dst) {
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vldrw.u32 q0, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vmov.f32 s14, s0
-; CHECK-NEXT: vmov.f32 s15, s1
-; CHECK-NEXT: vmov.f32 s22, s4
-; CHECK-NEXT: vmov.f32 s23, s5
-; CHECK-NEXT: vmov.f32 s12, s16
-; CHECK-NEXT: vmov.f32 s13, s17
-; CHECK-NEXT: vmov.f32 s20, s8
-; CHECK-NEXT: vstrw.32 q3, [r1, #16]
-; CHECK-NEXT: vmov.f32 s21, s9
-; CHECK-NEXT: vmov.f32 s0, s18
+; CHECK-NEXT: vldrw.u32 q3, [r0]
+; CHECK-NEXT: vmov.f64 d9, d0
+; CHECK-NEXT: vmov.f64 d8, d4
+; CHECK-NEXT: vmov.f64 d11, d2
+; CHECK-NEXT: vstrw.32 q4, [r1, #16]
+; CHECK-NEXT: vmov.f64 d10, d6
+; CHECK-NEXT: vmov.f64 d0, d5
; CHECK-NEXT: vstrw.32 q5, [r1]
-; CHECK-NEXT: vmov.f32 s1, s19
-; CHECK-NEXT: vmov.f32 s4, s10
+; CHECK-NEXT: vmov.f64 d2, d7
; CHECK-NEXT: vstrw.32 q0, [r1, #48]
-; CHECK-NEXT: vmov.f32 s5, s11
; CHECK-NEXT: vstrw.32 q1, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr