[llvm] Greedy: Take copy hints involving subregisters (PR #159570)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 18 06:09:49 PDT 2025
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/159570
Previously this would only accept full copy hints. This patch relaxes
that restriction to accept some subregister copies. Specifically, it now
accepts:
- Copies to/from physical registers if there is a compatible
super-register
- Subreg-to-subreg copies
This has the potential to add the same hint to the hint vector
repeatedly, but it is unclear whether that is a real problem in
practice. The two accepted copy shapes are sketched below.
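A rough MIR illustration of the two accepted shapes follows; the
register class, subregister indices, and physical registers are
hypothetical AMDGPU examples, not taken from the patch itself:

  ; Copy from a physical register: %0 can be hinted to the compatible
  ; super-register tuple $vgpr0_vgpr1 (found via getMatchingSuperReg).
  undef %0.sub0:vreg_64 = COPY $vgpr0

  ; Subreg-to-subreg copy with matching indices: %0 can be hinted to
  ; whatever physical register %1 is assigned.
  undef %0.sub0:vreg_64 = COPY %1.sub0

Both shapes were previously skipped because neither is a full copy.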
From ac0f1982979176d1412c2db9fcb54a514a75573f Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 18 Sep 2025 19:50:56 +0900
Subject: [PATCH] Greedy: Take copy hints involving subregisters
Previously this would only accept full copy hints. This patch relaxes
that restriction to accept some subregister copies. Specifically, it now
accepts:
- Copies to/from physical registers if there is a compatible
super-register
- Subreg-to-subreg copies
This has the potential to add the same hint to the hint vector
repeatedly, but it is unclear whether that is a real problem in
practice.
---
llvm/lib/CodeGen/RegAllocGreedy.cpp | 25 ++++++-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 3 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 3 +-
llvm/test/CodeGen/AMDGPU/load-local-i16.ll | 24 +++----
.../unspill-vgpr-after-rewrite-vgpr-mfma.ll | 12 ++--
.../RISCV/rvv/fixed-vectors-trunc-vp.ll | 6 +-
llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll | 24 +++----
llvm/test/CodeGen/Thumb2/mve-vst3.ll | 69 +++++++++----------
llvm/test/CodeGen/Thumb2/mve-vst4.ll | 12 ++--
9 files changed, 89 insertions(+), 89 deletions(-)
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index fa384b296f2e6..7c8444fc93af4 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -2387,19 +2387,42 @@ void RAGreedy::initializeCSRCost() {
/// The results are stored into \p Out.
/// \p Out is not cleared before being populated.
void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
- if (!TII->isFullCopyInstr(Instr))
+ if (!Instr.isCopy())
continue;
+
// Look for the other end of the copy.
Register OtherReg = Instr.getOperand(0).getReg();
+ unsigned OtherSubReg = Instr.getOperand(0).getSubReg();
+ unsigned SubReg = Instr.getOperand(1).getSubReg();
+
if (OtherReg == Reg) {
OtherReg = Instr.getOperand(1).getReg();
+ OtherSubReg = Instr.getOperand(1).getSubReg();
+ SubReg = Instr.getOperand(0).getSubReg();
if (OtherReg == Reg)
continue;
}
+
// Get the current assignment.
MCRegister OtherPhysReg =
OtherReg.isPhysical() ? OtherReg.asMCReg() : VRM->getPhys(OtherReg);
+ if (OtherSubReg) {
+ if (OtherReg.isPhysical()) {
+ MCRegister Tuple =
+ TRI->getMatchingSuperReg(OtherPhysReg, OtherSubReg, RC);
+ if (!Tuple)
+ continue;
+ OtherPhysReg = Tuple;
+ } else {
+ // TODO: There should be a hinting mechanism for subregisters
+ if (SubReg != OtherSubReg)
+ continue;
+ }
+ }
+
// Push the collected information.
Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
OtherPhysReg));
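As a reading aid for the physical-register branch above, here is a
minimal standalone C++ sketch of what getMatchingSuperReg resolves;
the register class and concrete registers are hypothetical AMDGPU
values, not part of the patch:

  // Suppose Reg's class is VReg_64 and the copy is
  // "%0.sub0 = COPY $vgpr3". The useful hint is the 64-bit tuple
  // whose sub0 is vgpr3, if such a tuple exists in the class.
  const TargetRegisterClass *RC = &AMDGPU::VReg_64RegClass;
  MCRegister Tuple =
      TRI->getMatchingSuperReg(AMDGPU::VGPR3, AMDGPU::sub0, RC);
  if (Tuple) // here Tuple would be AMDGPU::VGPR3_VGPR4
    OtherPhysReg = Tuple; // hint the tuple rather than the bare subreg

If no compatible super-register exists, the hunk above skips the copy
(the continue statement) instead of recording an unusable hint.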
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 1ce7179774349..be08c4e33f072 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -159246,7 +159246,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v37, 24, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v50, 8, v61
; GFX9-NEXT: v_lshrrev_b32_e32 v56, 16, v60
-; GFX9-NEXT: v_mov_b32_e32 v33, v60
+; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v60
; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:416 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:420 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:408 ; 4-byte Folded Reload
@@ -159259,7 +159259,6 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX9-NEXT: v_lshrrev_b32_e32 v48, 8, v48
; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v58
; GFX9-NEXT: v_lshrrev_b32_e32 v58, 8, v58
-; GFX9-NEXT: v_lshrrev_b32_e32 v55, 8, v33
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshrrev_b32_e32 v34, 16, v61
; GFX9-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 09d3c3b01b809..bca39d06e941c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -7398,7 +7398,7 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v15
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v17
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v20
+; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v14
@@ -7413,7 +7413,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v18
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v18
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v20
-; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v19
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v19
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v21
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index ddd1ce66c013a..f44a0b0ac2c65 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -3851,9 +3851,9 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11
; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; VI-DS128-NEXT: v_mov_b32_e32 v31, v15
+; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -3864,17 +3864,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
-; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
+; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
-; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
@@ -3944,7 +3943,7 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v11
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
@@ -3992,8 +3991,8 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
-; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v15
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
+; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
@@ -4004,17 +4003,16 @@ define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
-; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
+; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
-; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
@@ -4890,7 +4888,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
@@ -4901,14 +4899,13 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10
-; VI-DS128-NEXT: v_mov_b32_e32 v23, v15
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
+; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
@@ -4986,7 +4983,7 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
-; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
+; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v11
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18
@@ -5031,15 +5028,14 @@ define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out
; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
-; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v15
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
+; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
-; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
index 8878e9b65a088..a81d9a458e23a 100644
--- a/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/unspill-vgpr-after-rewrite-vgpr-mfma.ll
@@ -101,7 +101,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
; CHECK-NEXT: v_accvgpr_read_b32 v2, a2
; CHECK-NEXT: v_accvgpr_read_b32 v3, a3
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[10:13]
+; CHECK-NEXT: ; def v[6:9]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: ;;#ASMSTART
@@ -142,7 +142,7 @@ define void @eliminate_spill_after_mfma_rewrite(i32 %x, i32 %y, <4 x i32> %arg,
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; CHECK-NEXT: global_store_dwordx4 v0, v[6:9], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -306,10 +306,10 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
; CHECK-NEXT: v_accvgpr_read_b32 v5, a1
; CHECK-NEXT: v_accvgpr_read_b32 v4, a0
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[10:13]
+; CHECK-NEXT: ; def v[8:11]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def v[14:17]
+; CHECK-NEXT: ; def v[12:15]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a[0:31]
@@ -349,9 +349,9 @@ define void @eliminate_spill_after_mfma_rewrite_x2(i32 %x, i32 %y, <4 x i32> %ar
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: global_store_dwordx4 v0, a[36:39], s[16:17] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v0, v[10:13], s[16:17]
+; CHECK-NEXT: global_store_dwordx4 v0, v[8:11], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v0, v[14:17], s[16:17]
+; CHECK-NEXT: global_store_dwordx4 v0, v[12:15], s[16:17]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: buffer_load_dword a63, off, s[0:3], s32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword a62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
index 1267bcd1e0717..461b4d0e02cb8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-trunc-vp.ll
@@ -415,8 +415,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV32-NEXT: mul a4, a4, a5
; RV32-NEXT: add a4, sp, a4
; RV32-NEXT: addi a4, a4, 16
-; RV32-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV32-NEXT: vmv4r.v v8, v24
+; RV32-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
; RV32-NEXT: csrr a4, vlenb
; RV32-NEXT: slli a4, a4, 4
; RV32-NEXT: add a4, sp, a4
@@ -726,8 +725,7 @@ define <128 x i32> @vtrunc_v128i32_v128i64(<128 x i64> %a, <128 x i1> %m, i32 ze
; RV64-NEXT: mul a4, a4, a5
; RV64-NEXT: add a4, sp, a4
; RV64-NEXT: addi a4, a4, 32
-; RV64-NEXT: vl8r.v v24, (a4) # vscale x 64-byte Folded Reload
-; RV64-NEXT: vmv4r.v v8, v24
+; RV64-NEXT: vl8r.v v8, (a4) # vscale x 64-byte Folded Reload
; RV64-NEXT: csrr a4, vlenb
; RV64-NEXT: slli a4, a4, 4
; RV64-NEXT: add a4, sp, a4
diff --git a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
index 0bfa68298f6b5..0a11501905b81 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vfma-vp.ll
@@ -8831,8 +8831,7 @@ define <vscale x 32 x half> @vfnmadd_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a1, a0, .LBB286_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
@@ -9460,8 +9459,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB291_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -9832,8 +9830,7 @@ define <vscale x 32 x half> @vfnmadd_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v16, v8, v24, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB294_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -10347,8 +10344,7 @@ define <vscale x 32 x half> @vfnmsub_vv_nxv32f16(<vscale x 32 x half> %va, <vsca
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a1, a0, .LBB298_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a1, a0
@@ -10975,8 +10971,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_commute(<vscale x 32 x half> %v
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v16, v8, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB303_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -11343,8 +11338,7 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat(<vscale x 32 x half>
; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
; ZVFHMIN-NEXT: vfmadd.vv v24, v8, v16, v0.t
; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m4, ta, ma
-; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v24, v0.t
-; ZVFHMIN-NEXT: vmv.v.v v4, v12
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v24, v0.t
; ZVFHMIN-NEXT: bltu a0, a1, .LBB306_2
; ZVFHMIN-NEXT: # %bb.1:
; ZVFHMIN-NEXT: mv a0, a1
@@ -11453,12 +11447,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_commute(<vscale x 32
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12, v0.t
; ZVFHMIN-NEXT: addi a2, sp, 16
; ZVFHMIN-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill
-; ZVFHMIN-NEXT: vmv4r.v v8, v24
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 4
; ZVFHMIN-NEXT: add a2, sp, a2
; ZVFHMIN-NEXT: addi a2, a2, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v28, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
; ZVFHMIN-NEXT: slli a2, a2, 3
@@ -11580,12 +11573,11 @@ define <vscale x 32 x half> @vfnmsub_vf_nxv32f16_neg_splat_unmasked(<vscale x 32
; ZVFHMIN-NEXT: sltu a3, a0, a2
; ZVFHMIN-NEXT: addi a3, a3, -1
; ZVFHMIN-NEXT: and a2, a3, a2
-; ZVFHMIN-NEXT: vmv4r.v v8, v16
; ZVFHMIN-NEXT: csrr a3, vlenb
; ZVFHMIN-NEXT: slli a3, a3, 3
; ZVFHMIN-NEXT: add a3, sp, a3
; ZVFHMIN-NEXT: addi a3, a3, 16
-; ZVFHMIN-NEXT: vs8r.v v8, (a3) # vscale x 64-byte Folded Spill
+; ZVFHMIN-NEXT: vs8r.v v16, (a3) # vscale x 64-byte Folded Spill
; ZVFHMIN-NEXT: vsetvli zero, a2, e16, m4, ta, ma
; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20, v0.t
; ZVFHMIN-NEXT: csrr a2, vlenb
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index ff416dbe3f1a0..4fc1e06a14983 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -175,33 +175,30 @@ define void @vst3_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.f32 s0, s5
; CHECK-NEXT: vmov.f32 s2, s14
; CHECK-NEXT: vmov.f32 s3, s6
-; CHECK-NEXT: vmov.f32 s26, s10
; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [sp, #160] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s26, s10
; CHECK-NEXT: vmov.f32 s20, s8
+; CHECK-NEXT: vmov.32 q6[1], r3
; CHECK-NEXT: vmov.f32 s23, s9
; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s16, s2
-; CHECK-NEXT: vmov.32 q6[1], r3
-; CHECK-NEXT: vmov.f32 s19, s3
; CHECK-NEXT: vstrw.32 q6, [r1, #16]
+; CHECK-NEXT: vmov.f32 s19, s3
+; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s17, s31
; CHECK-NEXT: vstrw.32 q5, [sp, #144] @ 16-byte Spill
; CHECK-NEXT: vmov.f32 s18, s11
-; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vmov.f64 d8, d14
+; CHECK-NEXT: vmov.f32 s19, s29
+; CHECK-NEXT: vmov.f64 d14, d0
; CHECK-NEXT: vmov.f32 s25, s4
; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s20, s9
; CHECK-NEXT: vmov.f32 s23, s10
; CHECK-NEXT: vmov.f32 s17, s8
; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT: vmov.f32 s27, s13
-; CHECK-NEXT: vmov.f32 s13, s11
-; CHECK-NEXT: vmov.f32 s11, s6
-; CHECK-NEXT: vmov.f32 s19, s29
-; CHECK-NEXT: vmov.f64 d14, d0
; CHECK-NEXT: vmov.f32 s15, s3
; CHECK-NEXT: vmov.f32 s0, s8
; CHECK-NEXT: vmov.f32 s1, s4
@@ -211,12 +208,8 @@ define void @vst3_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov.f32 s12, s2
; CHECK-NEXT: vmov.32 q0[2], r0
; CHECK-NEXT: vstrw.32 q0, [r1, #48]
-; CHECK-NEXT: vmov.f64 d1, d5
-; CHECK-NEXT: vmov.f32 s0, s5
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: vmov.f32 s18, s30
-; CHECK-NEXT: vstrw.32 q0, [r1, #64]
; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s18, s30
; CHECK-NEXT: vmov.f32 s22, s30
; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vstrw.32 q0, [r1, #128]
@@ -224,16 +217,22 @@ define void @vst3_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: vmov r0, lr, d14
; CHECK-NEXT: vldrw.u32 q7, [sp, #160] @ 16-byte Reload
; CHECK-NEXT: vmov.32 q0[1], lr
-; CHECK-NEXT: vmov.f32 s14, s7
+; CHECK-NEXT: vmov.f32 s27, s13
; CHECK-NEXT: vstrw.32 q0, [r1, #160]
; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload
; CHECK-NEXT: vmov r2, r4, d14
; CHECK-NEXT: vmov.32 q6[2], r0
; CHECK-NEXT: vstrw.32 q0, [r1, #176]
; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s13, s11
; CHECK-NEXT: vmov.32 q4[2], r2
+; CHECK-NEXT: vmov.f32 s8, s5
; CHECK-NEXT: vmov.32 q5[1], r4
+; CHECK-NEXT: vmov.f32 s11, s6
; CHECK-NEXT: vmov.32 q0[2], r12
+; CHECK-NEXT: vmov.f32 s14, s7
+; CHECK-NEXT: vmov.32 q2[1], r3
+; CHECK-NEXT: vstrw.32 q2, [r1, #64]
; CHECK-NEXT: vstrw.32 q3, [r1, #80]
; CHECK-NEXT: vstrw.32 q4, [r1, #96]
; CHECK-NEXT: vstrw.32 q5, [r1, #112]
@@ -1038,10 +1037,10 @@ define void @vst3_v8f32(ptr %src, ptr %dst) {
; CHECK-NEXT: vstrw.32 q5, [r1]
; CHECK-NEXT: vmov.f32 s14, s19
; CHECK-NEXT: vmov.f32 s15, s31
-; CHECK-NEXT: vmov.f32 s5, s29
-; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vmov.f32 s4, s17
+; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vmov.f32 s7, s18
+; CHECK-NEXT: vmov.f32 s5, s29
; CHECK-NEXT: vstrw.32 q1, [r1, #16]
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
@@ -1333,37 +1332,35 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: sub sp, #72
; CHECK-NEXT: vldrw.u32 q5, [r0, #16]
; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q4, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT: vldrw.u32 q7, [r0]
; CHECK-NEXT: vmov.f32 s0, s20
; CHECK-NEXT: vldrw.u32 q6, [r0, #32]
; CHECK-NEXT: vins.f16 s0, s4
; CHECK-NEXT: vmovx.f16 s2, s8
; CHECK-NEXT: vmov.f32 s12, s0
+; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
; CHECK-NEXT: vmov.f32 s0, s21
+; CHECK-NEXT: vstrw.32 q7, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: vins.f16 s0, s5
-; CHECK-NEXT: vmov.f64 d15, d9
; CHECK-NEXT: vstr s0, [sp, #68] @ 4-byte Spill
; CHECK-NEXT: vmovx.f16 s0, s4
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmovx.f16 s0, s20
-; CHECK-NEXT: vins.f16 s8, s0
-; CHECK-NEXT: vmov.f32 s0, s17
; CHECK-NEXT: vmov.16 q3[4], r2
-; CHECK-NEXT: vins.f16 s0, s25
+; CHECK-NEXT: vins.f16 s8, s0
+; CHECK-NEXT: vmov.f32 s0, s29
; CHECK-NEXT: vins.f16 s14, s2
+; CHECK-NEXT: vins.f16 s0, s25
+; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov.f32 s12, s28
; CHECK-NEXT: vstr s0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: vmovx.f16 s0, s24
-; CHECK-NEXT: vstrw.32 q3, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.f32 s12, s16
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmovx.f16 s0, s16
-; CHECK-NEXT: vmov.f32 s29, s17
-; CHECK-NEXT: vldrw.u32 q4, [r0, #64]
; CHECK-NEXT: vins.f16 s12, s24
-; CHECK-NEXT: vmov.f32 s4, s23
+; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmovx.f16 s2, s16
; CHECK-NEXT: vmov.16 q3[4], r2
+; CHECK-NEXT: vmovx.f16 s0, s28
; CHECK-NEXT: vins.f16 s14, s2
; CHECK-NEXT: vmovx.f16 s2, s26
; CHECK-NEXT: vins.f16 s16, s0
@@ -1382,16 +1379,18 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vmovx.f16 s0, s31
; CHECK-NEXT: vins.f16 s19, s0
; CHECK-NEXT: vmovx.f16 s0, s6
-; CHECK-NEXT: vmov r0, s0
+; CHECK-NEXT: vmov.f32 s4, s23
; CHECK-NEXT: vins.f16 s15, s2
+; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vins.f16 s4, s7
; CHECK-NEXT: vmov.16 q0[0], r0
+; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vmov.f32 s1, s4
; CHECK-NEXT: vmovx.f16 s4, s7
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmovx.f16 s4, s11
; CHECK-NEXT: vmov.16 q0[6], r0
-; CHECK-NEXT: vstrw.32 q7, [sp, #8] @ 16-byte Spill
+; CHECK-NEXT: vldr s31, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: vmovx.f16 s2, s10
; CHECK-NEXT: vins.f16 s3, s4
; CHECK-NEXT: vins.f16 s0, s2
@@ -1399,11 +1398,11 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vins.f16 s11, s2
; CHECK-NEXT: vmov.f32 s2, s22
; CHECK-NEXT: vins.f16 s2, s6
-; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.f32 s29, s16
; CHECK-NEXT: vstr s2, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: vmovx.f16 s2, s5
; CHECK-NEXT: vmov r0, s2
-; CHECK-NEXT: vldr s31, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vstrw.32 q7, [r1]
; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.f32 s2, s11
; CHECK-NEXT: vmovx.f16 s4, s21
@@ -1427,15 +1426,13 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vins.f16 s25, s6
; CHECK-NEXT: vldr s23, [sp, #68] @ 4-byte Reload
; CHECK-NEXT: vldr s6, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: vmov.f32 s29, s16
; CHECK-NEXT: vmov.f32 s14, s19
-; CHECK-NEXT: vstrw.32 q0, [r1, #80]
; CHECK-NEXT: vmov.f32 s21, s8
; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vmov.f32 s4, s9
; CHECK-NEXT: vstrw.32 q5, [r1, #48]
; CHECK-NEXT: vmov.f32 s7, s10
-; CHECK-NEXT: vstrw.32 q7, [r1]
+; CHECK-NEXT: vstrw.32 q0, [r1, #80]
; CHECK-NEXT: vmov.f32 s24, s17
; CHECK-NEXT: vstrw.32 q1, [r1, #64]
; CHECK-NEXT: vmov.f32 s26, s11
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst4.ll b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
index d96af49060efd..26ab555c2c593 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -119,22 +119,20 @@ define void @vst4_v16i32(ptr %src, ptr %dst) {
; CHECK-NEXT: sub sp, #192
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: add r2, sp, #64
-; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
; CHECK-NEXT: vldrw.u32 q3, [r0, #208]
; CHECK-NEXT: vldrw.u32 q2, [r0, #144]
; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #128]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #240]
-; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-NEXT: add r2, sp, #128
-; CHECK-NEXT: vmov q7, q5
+; CHECK-NEXT: vldrw.u32 q7, [r0, #240]
; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
; CHECK-NEXT: vldrw.u32 q5, [r0, #112]
@@ -881,22 +879,20 @@ define void @vst4_v16f32(ptr %src, ptr %dst) {
; CHECK-NEXT: sub sp, #192
; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
; CHECK-NEXT: add r2, sp, #64
-; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
; CHECK-NEXT: vldrw.u32 q3, [r0, #208]
; CHECK-NEXT: vldrw.u32 q2, [r0, #144]
; CHECK-NEXT: vldrw.u32 q1, [r0, #80]
; CHECK-NEXT: vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vldrw.u32 q2, [r0, #128]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #240]
-; CHECK-NEXT: vmov q6, q4
; CHECK-NEXT: vldrw.u32 q3, [r0, #192]
; CHECK-NEXT: vldrw.u32 q1, [r0, #64]
; CHECK-NEXT: vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [r0, #176]
; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
; CHECK-NEXT: vldrw.u32 q4, [r0, #48]
; CHECK-NEXT: add r2, sp, #128
-; CHECK-NEXT: vmov q7, q5
+; CHECK-NEXT: vldrw.u32 q7, [r0, #240]
; CHECK-NEXT: vldrw.u32 q3, [r0, #224]
; CHECK-NEXT: vldrw.u32 q1, [r0, #96]
; CHECK-NEXT: vldrw.u32 q5, [r0, #112]