[llvm] [RegisterCoalescer]: Try inflated RC for coalescing reg->subreg (PR #134438)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 4 12:14:28 PDT 2025
https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/134438
This is the first of a few patches designed to improve RegisterCoalescing when there are RegisterClass mismatches between the SrcReg and DstReg of a copy. This is from an effort to split up https://github.com/llvm/llvm-project/pull/130870 . This PR handles the case of Reg->SubReg copies. It is being cherry-picked on top of https://github.com/llvm/llvm-project/pull/132137 since it uses the test coverage.
This PR introduces and uses `getLargestConstrainedSuperClass` to try to find a matching super RegisterClass for coalescing when the Src and Dst RegisterClasses are incompatible for coalescing. `getLargestConstrainedSuperClass` checks the use MIs of a given register for any RegClass constraints in order to produce legal code.
>From 53e394d8acfe7c5a5c09dd681b1f942b7cdbe698 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 19 Mar 2025 20:10:11 -0700
Subject: [PATCH 1/4] [AMDGPU] Extend test coverage for cross RC register
coalescing
Change-Id: I36894fc36e6e6214930fae67f2ca35999abf3b88
---
.../coalesce-copy-to-agpr-to-av-registers.mir | 1255 +++++++++++++++++
1 file changed, 1255 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 8ea5c3ea73e77..15175be1c41cc 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -517,3 +517,1258 @@ body: |
SI_RETURN
...
+
+
+
+# Should coalesce %0 and %1 into subregisters of the av_64 common
+# class
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ undef %2.sub0:areg_64 = COPY %0.sub0
+ %2.sub1:areg_64 = COPY %0.sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 =COPY $vgpr0
+ %0.sub1:vreg_96 = COPY $vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %3.sub0:areg_96 = COPY %0.sub0
+ %3.sub1:areg_96 = COPY %0.sub1
+ %3.sub2:areg_96 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 =COPY $vgpr0
+ %0.sub1:vreg_96 = COPY $vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %3.sub0:areg_96_align2 = COPY %0.sub0
+ %3.sub1:areg_96_align2 = COPY %0.sub1
+ %3.sub2:areg_96_align2 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_sub
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+ %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+ INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AReg_128 */, killed %2
+ SI_RETURN
+
+...
+
+
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
+ %0.sub1:vreg_128 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0
+ %2.sub2_sub3:areg_128_align2 = COPY %0.sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8, $sgpr9
+
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_sub
+ ; CHECK: liveins: $sgpr8, $sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:sreg_64 = COPY $sgpr8
+ %0.sub1:sreg_64 = COPY $sgpr9
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 =COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0.sub0
+ %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 =COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96_align2 = COPY %0.sub0
+ %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+ %2.sub2:areg_96 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+    ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+ %2.sub2:areg_96_align2 = COPY %0.sub2
+    INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_sub
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %2.sub0:areg_64 = COPY %0.sub0
+ %2.sub1:areg_64 = COPY %0.sub0
+ INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_sub
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_96 = COPY %0.sub0
+ %1.sub1:areg_96 = COPY %0.sub0
+ %1.sub2:areg_96 = COPY %0.sub0
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_sub
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_96_align2 = COPY %0.sub0
+ %1.sub1:areg_96_align2 = COPY %0.sub0
+ INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_sub
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_128 = COPY %0.sub0
+ %1.sub1:areg_128 = COPY %0.sub0
+ %1.sub2:areg_128 = COPY %0.sub0
+ %1.sub3:areg_128 = COPY %0.sub0
+ INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AReg_128 */, killed %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_align2_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_sub
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_128_align2 = COPY %0.sub0
+ %1.sub1:areg_128_align2 = COPY %0.sub0
+ %1.sub2:areg_128_align2 = COPY %0.sub0
+ %1.sub3:areg_128_align2 = COPY %0.sub0
+ INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_Align2 */, %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ undef %2.sub0:areg_64 = COPY %0
+ %2.sub1:areg_64 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ undef %2.sub0:areg_64_align2 = COPY %0
+ %2.sub1:areg_64_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ undef %3.sub0:areg_96 = COPY %0
+ %3.sub1:areg_96 = COPY %1
+ %3.sub2:areg_96 = COPY %2
+ S_NOP 0, implicit %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ undef %3.sub0:areg_96_align2 = COPY %0
+ %3.sub1:areg_96_align2 = COPY %1
+ %3.sub2:areg_96_align2 = COPY %2
+ S_NOP 0, implicit %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vreg_64 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128 = COPY %0
+ %2.sub2_sub3:areg_128 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vreg_64 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128_align2 = COPY %0
+ %2.sub2_sub3:areg_128_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8, $sgpr9
+
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_snop
+ ; CHECK: liveins: $sgpr8, $sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:sgpr_32 = COPY $sgpr8
+ %1:sgpr_32 = COPY $sgpr9
+ undef %2.sub0:areg_64_align2 = COPY %0
+ %2.sub1:areg_64_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vreg_64 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0
+ %2.sub1_sub2:areg_96 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vreg_64 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96_align2 = COPY %0
+ %2.sub1_sub2:areg_96_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96 = COPY %0
+ %2.sub2:areg_96 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0
+ %2.sub2:areg_96_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %2.sub0:areg_64 = COPY %0
+ %2.sub1:areg_64 = COPY %0
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ undef %2.sub0:areg_64_align2 = COPY %0
+ %2.sub1:areg_64_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_96 = COPY %0
+ %1.sub1:areg_96 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_96_align2 = COPY %0
+ %1.sub1:areg_96_align2 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_128 = COPY %0
+ %1.sub1:areg_128 = COPY %0
+ %1.sub2:areg_128 = COPY %0
+ %1.sub3:areg_128 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_128_align2 = COPY %0
+ %1.sub1:areg_128_align2 = COPY %0
+ %1.sub2:areg_128_align2 = COPY %0
+ %1.sub3:areg_128_align2 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ undef %2.sub0:areg_64 = COPY %0.sub0
+ %2.sub1:areg_64 = COPY %0.sub1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1:vreg_96 = COPY $vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %3.sub0:areg_96 = COPY %0.sub0
+ %3.sub1:areg_96 = COPY %0.sub1
+ %3.sub2:areg_96 = COPY %0.sub2
+ S_NOP 0, implicit %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1:vreg_96 = COPY $vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %3.sub0:areg_96_align2 = COPY %0.sub0
+ %3.sub1:areg_96_align2 = COPY %0.sub1
+ %3.sub2:areg_96_align2 = COPY %0.sub2
+ S_NOP 0, implicit %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+ %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
+ %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8, $sgpr9
+
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_sub_snop
+ ; CHECK: liveins: $sgpr8, $sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:sreg_64 = COPY $sgpr8
+ %0.sub1:sreg_64 = COPY $sgpr9
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0.sub0
+ %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+    undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96_align2 = COPY %0.sub0
+ %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+ %2.sub2:areg_96 = COPY %0.sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+  ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+  ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+ %2.sub2:areg_96_align2 = COPY %0.sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %2.sub0:areg_64 = COPY %0.sub0
+ %2.sub1:areg_64 = COPY %0.sub0
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_96 = COPY %0.sub0
+ %1.sub1:areg_96 = COPY %0.sub0
+ %1.sub2:areg_96 = COPY %0.sub0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_96_align2 = COPY %0.sub0
+ %1.sub1:areg_96_align2 = COPY %0.sub0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_128 = COPY %0.sub0
+ %1.sub1:areg_128 = COPY %0.sub0
+ %1.sub2:areg_128 = COPY %0.sub0
+ %1.sub3:areg_128 = COPY %0.sub0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_128_align2 = COPY %0.sub0
+ %1.sub1:areg_128_align2 = COPY %0.sub0
+ %1.sub2:areg_128_align2 = COPY %0.sub0
+ %1.sub3:areg_128_align2 = COPY %0.sub0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
>From 1e3da559a1c34143f93a3195f7ac299058ca531e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Wed, 19 Mar 2025 20:10:11 -0700
Subject: [PATCH 2/4] [AMDGPU] Extend test coverage for cross RC register
coalescing
Change-Id: I36894fc36e6e6214930fae67f2ca35999abf3b88
---
.../AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 15175be1c41cc..c8c8db5112bdb 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -644,7 +644,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AV_128_with_hi16_in_VGPR_16_Lo128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
@@ -671,7 +671,7 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_128 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub1
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_with_sub0_sub1_sub2_in_AReg_96_with_sub1_sub2_in_AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_128 =COPY $vgpr0_vgpr1
%0.sub1:vreg_128 = COPY $vgpr2_vgpr3
@@ -893,7 +893,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AV_128_with_hi16_in_VGPR_16_Lo128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0.sub0
@@ -920,7 +920,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_with_sub0_sub1_sub2_in_AReg_96_with_sub1_sub2_in_AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_64 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0.sub0
>From d41f2d4196cc3cb374fc7528b8d8a699f30de649 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 25 Mar 2025 12:16:34 -0700
Subject: [PATCH 3/4] Add more coverage
Change-Id: I49b2a587eeef47ebc7150d9129d90843ae7a3042
---
.../coalesce-copy-to-agpr-to-av-registers.mir | 1604 +++++++++++++----
1 file changed, 1303 insertions(+), 301 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index c8c8db5112bdb..2e7f805798120 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -4,7 +4,6 @@
# Test coalescing situations which can use av_* registers to handle
# copies between VGPRs and AGPRs.
-
# Should coalesce %0 and %1 into subregisters of the av_64 common
# class
---
@@ -933,319 +932,539 @@ body: |
...
---
-name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- %1:vgpr_32 = COPY $vgpr1
- undef %2.sub0:areg_64 = COPY %0
- %2.sub1:areg_64 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ undef %2.sub0:areg_64 = COPY %0.sub0
+ %2.sub1:areg_64 = COPY %0.sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
SI_RETURN
...
---
-name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- %1:vgpr_32 = COPY $vgpr1
- undef %2.sub0:areg_64_align2 = COPY %0
- %2.sub1:areg_64_align2 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0:vreg_64_align2 = COPY $vgpr0
+ %0.sub1:vreg_64_align2 = COPY $vgpr1
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
---
-name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- %1:vgpr_32 = COPY $vgpr1
- %2:vgpr_32 = COPY $vgpr2
- undef %3.sub0:areg_96 = COPY %0
- %3.sub1:areg_96 = COPY %1
- %3.sub2:areg_96 = COPY %2
- S_NOP 0, implicit %3
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1:vreg_96 = COPY $vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %3.sub0:areg_96 = COPY %0.sub0
+ %3.sub1:areg_96 = COPY %0.sub1
+ %3.sub2:areg_96 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
SI_RETURN
...
---
-name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96_align2 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- %1:vgpr_32 = COPY $vgpr1
- %2:vgpr_32 = COPY $vgpr2
- undef %3.sub0:areg_96_align2 = COPY %0
- %3.sub1:areg_96_align2 = COPY %1
- %3.sub2:areg_96_align2 = COPY %2
- S_NOP 0, implicit %3
+ undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+ %0.sub1:vreg_96_align2 = COPY $vgpr1
+ %0.sub2:vreg_96_align2 = COPY $vgpr2
+ undef %3.sub0:areg_96_align2 = COPY %0.sub0
+ %3.sub1:areg_96_align2 = COPY %0.sub1
+ %3.sub2:areg_96_align2 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
SI_RETURN
...
---
-name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AV_128_with_hi16_in_VGPR_16_Lo128 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vreg_64 = COPY $vgpr0_vgpr1
- %1:vreg_64 = COPY $vgpr2_vgpr3
- undef %2.sub0_sub1:areg_128 = COPY %0
- %2.sub2_sub3:areg_128 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+ %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+ INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AReg_128 */, killed %2
SI_RETURN
...
---
-name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_with_sub0_sub1_sub2_in_AReg_96_with_sub1_sub2_in_AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vreg_64 = COPY $vgpr0_vgpr1
- %1:vreg_64 = COPY $vgpr2_vgpr3
- undef %2.sub0_sub1:areg_128_align2 = COPY %0
- %2.sub2_sub3:areg_128_align2 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
+ %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
+ INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_Align2 */, %2
SI_RETURN
...
---
-name: copy_sgpr32_to_areg64_align2_snop
+name: copy_sgpr32_to_areg64_align2_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr8, $sgpr9
- ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_snop
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_both_sub
; CHECK: liveins: $sgpr8, $sgpr9
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr8
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:sgpr_32 = COPY $sgpr8
- %1:sgpr_32 = COPY $sgpr9
- undef %2.sub0:areg_64_align2 = COPY %0
- %2.sub1:areg_64_align2 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0:sreg_64 = COPY $sgpr8
+ %0.sub1:sreg_64 = COPY $sgpr9
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
---
-name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0.sub0
+ %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
+ SI_RETURN
+
+...
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_shuffle_both_sub
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+  ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_shuffle_both_sub
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- %1:vreg_64 = COPY $vgpr1_vgpr2
- undef %2.sub0:areg_96 = COPY %0
- %2.sub1_sub2:areg_96 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0.sub2
+ %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
SI_RETURN
...
+
---
-name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- %1:vreg_64 = COPY $vgpr1_vgpr2
- undef %2.sub0:areg_96_align2 = COPY %0
- %2.sub1_sub2:areg_96_align2 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96_align2 = COPY %0.sub0
+ %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
---
-name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vreg_64 = COPY $vgpr0_vgpr1
- %1:vgpr_32 = COPY $vgpr2
- undef %2.sub0_sub1:areg_96 = COPY %0
- %2.sub2:areg_96 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+ %2.sub2:areg_96 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %2
SI_RETURN
...
---
-name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2
- ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vreg_64 = COPY $vgpr0_vgpr1
- %1:vgpr_32 = COPY $vgpr2
- undef %2.sub0_sub1:areg_96_align2 = COPY %0
- %2.sub2:areg_96_align2 = COPY %1
- S_NOP 0, implicit %2
+ undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96_align2 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+ %2.sub2:areg_96_align2 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
SI_RETURN
...
---
-name: copy_vgpr32_x2_to_areg64_snop
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0, $vgpr1_vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+ %2.sub2:areg_96_align2 = COPY %0.sub2
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %2:areg_64 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 3473417 /* reguse:AReg_64 */, killed %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %2:areg_64_align2 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ %3:areg_96 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+ %3:areg_96_align2 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 4915209 /* reguse:AReg_96_Align2 */, %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AV_128_with_hi16_in_VGPR_16_Lo128 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %2:areg_128 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 6225929 /* reguse:AV_128_with_hi16_in_VGPR_16_Lo128 */, killed %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_with_sub0_sub1_sub2_in_AReg_96_with_sub1_sub2_in_AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %2:areg_128_align2 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 6488073 /* reguse:AReg_128_with_sub0_sub1_sub2_in_AReg_96_with_sub1_sub2_in_AReg_64_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8, $sgpr9
+
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_whole_reg
+ ; CHECK: liveins: $sgpr8, $sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:sreg_64 = COPY $sgpr8_sgpr9
+ %2:areg_64_align2 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ %2:areg_96_align2 = COPY %0
+ INLINEASM &"; use $0", 0 /* attdialect */, 3735561 /* reguse:AReg_64_Align2 */, %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
undef %2.sub0:areg_64 = COPY %0
- %2.sub1:areg_64 = COPY %0
+ %2.sub1:areg_64 = COPY %1
S_NOP 0, implicit %2
SI_RETURN
...
---
-name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -1264,113 +1483,804 @@ body: |
...
---
-name: copy_vgpr32_x3_to_areg96_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
- undef %1.sub0:areg_96 = COPY %0
- %1.sub1:areg_96 = COPY %0
- S_NOP 0, implicit %1
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ undef %3.sub0:areg_96 = COPY %0
+ %3.sub1:areg_96 = COPY %1
+ %3.sub2:areg_96 = COPY %2
+ S_NOP 0, implicit %3
SI_RETURN
...
---
-name: copy_vgpr32_x3_to_areg96_align2_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
- undef %1.sub0:areg_96_align2 = COPY %0
- %1.sub1:areg_96_align2 = COPY %0
- S_NOP 0, implicit %1
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr2
+ undef %3.sub0:areg_96_align2 = COPY %0
+ %3.sub1:areg_96_align2 = COPY %1
+ %3.sub2:areg_96_align2 = COPY %2
+ S_NOP 0, implicit %3
SI_RETURN
...
---
-name: copy_vgpr32_x4_to_areg128_snop
+name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vreg_64 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128 = COPY %0
+ %2.sub2_sub3:areg_128 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vreg_64 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128_align2 = COPY %0
+ %2.sub2_sub3:areg_128_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8, $sgpr9
+
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_snop
+ ; CHECK: liveins: $sgpr8, $sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr9
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:sgpr_32 = COPY $sgpr8
+ %1:sgpr_32 = COPY $sgpr9
+ undef %2.sub0:areg_64_align2 = COPY %0
+ %2.sub1:areg_64_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vreg_64 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0
+ %2.sub1_sub2:areg_96 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vreg_64 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96_align2 = COPY %0
+ %2.sub1_sub2:areg_96_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96 = COPY %0
+ %2.sub2:areg_96 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %1:vgpr_32 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0
+ %2.sub2:areg_96_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %2.sub0:areg_64 = COPY %0
+ %2.sub1:areg_64 = COPY %0
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ %1:vgpr_32 = COPY $vgpr1
+ undef %2.sub0:areg_64_align2 = COPY %0
+ %2.sub1:areg_64_align2 = COPY %1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_96 = COPY %0
+ %1.sub1:areg_96 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_96_align2 = COPY %0
+ %1.sub1:areg_96_align2 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_128 = COPY %0
+ %1.sub1:areg_128 = COPY %0
+ %1.sub2:areg_128 = COPY %0
+ %1.sub3:areg_128 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_align2_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vgpr_32 = COPY $vgpr0
+ undef %1.sub0:areg_128_align2 = COPY %0
+ %1.sub1:areg_128_align2 = COPY %0
+ %1.sub2:areg_128_align2 = COPY %0
+ %1.sub3:areg_128_align2 = COPY %0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ undef %2.sub0:areg_64 = COPY %0.sub0
+ %2.sub1:areg_64 = COPY %0.sub1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1:vreg_96 = COPY $vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %3.sub0:areg_96 = COPY %0.sub0
+ %3.sub1:areg_96 = COPY %0.sub1
+ %3.sub2:areg_96 = COPY %0.sub2
+ S_NOP 0, implicit %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1:vreg_96 = COPY $vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %3.sub0:areg_96_align2 = COPY %0.sub0
+ %3.sub1:areg_96_align2 = COPY %0.sub1
+ %3.sub2:areg_96_align2 = COPY %0.sub2
+ S_NOP 0, implicit %3
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
+ %2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
+ %2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8, $sgpr9
+
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_sub_snop
+ ; CHECK: liveins: $sgpr8, $sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:sreg_64 = COPY $sgpr9
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:sreg_64 = COPY $sgpr8
+ %0.sub1:sreg_64 = COPY $sgpr9
+ undef %2.sub0:areg_64_align2 = COPY %0.sub0
+ %2.sub1:areg_64_align2 = COPY %0.sub1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0.sub0
+ %2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96_align2 = COPY %0.sub0
+ %2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96 = COPY %0.sub0_sub1
+ %2.sub2:areg_96 = COPY %0.sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+ %2.sub2:areg_96_align2 = COPY %0.sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x2_to_areg64_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %2.sub0:areg_64 = COPY %0.sub0
+ %2.sub1:areg_64 = COPY %0.sub0
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_96 = COPY %0.sub0
+ %1.sub1:areg_96 = COPY %0.sub0
+ %1.sub2:areg_96 = COPY %0.sub0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x3_to_areg96_align2_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_sub_snop
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_96_align2 = COPY %0.sub0
+ %1.sub1:areg_96_align2 = COPY %0.sub0
+ S_NOP 0, implicit %1
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr32_x4_to_areg128_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
- ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_snop
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_sub_snop
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- undef %1.sub0:areg_128 = COPY %0
- %1.sub1:areg_128 = COPY %0
- %1.sub2:areg_128 = COPY %0
- %1.sub3:areg_128 = COPY %0
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_128 = COPY %0.sub0
+ %1.sub1:areg_128 = COPY %0.sub0
+ %1.sub2:areg_128 = COPY %0.sub0
+ %1.sub3:areg_128 = COPY %0.sub0
S_NOP 0, implicit %1
SI_RETURN
...
---
-name: copy_vgpr32_x4_to_areg128_align2_snop
+name: copy_vgpr32_x4_to_areg128_align2_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0
- ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_snop
+ ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_sub_snop
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- %0:vgpr_32 = COPY $vgpr0
- undef %1.sub0:areg_128_align2 = COPY %0
- %1.sub1:areg_128_align2 = COPY %0
- %1.sub2:areg_128_align2 = COPY %0
- %1.sub3:areg_128_align2 = COPY %0
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ undef %1.sub0:areg_128_align2 = COPY %0.sub0
+ %1.sub1:areg_128_align2 = COPY %0.sub0
+ %1.sub2:areg_128_align2 = COPY %0.sub0
+ %1.sub3:areg_128_align2 = COPY %0.sub0
S_NOP 0, implicit %1
SI_RETURN
...
---
-name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_both_sub_snop
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
@@ -1389,23 +2299,23 @@ body: |
...
---
-name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_both_sub_snop
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY $vgpr1
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64_align2 = COPY $vgpr1
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY]].sub1
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_64 = COPY $vgpr0
- %0.sub1:vreg_64 = COPY $vgpr1
+ undef %0.sub0:vreg_64_align2 = COPY $vgpr0
+ %0.sub1:vreg_64_align2 = COPY $vgpr1
undef %2.sub0:areg_64_align2 = COPY %0.sub0
%2.sub1:areg_64_align2 = COPY %0.sub1
S_NOP 0, implicit %2
@@ -1414,13 +2324,13 @@ body: |
...
---
-name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
@@ -1431,7 +2341,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub2
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_96 =COPY $vgpr0
+ undef %0.sub0:vreg_96 = COPY $vgpr0
%0.sub1:vreg_96 = COPY $vgpr1
%0.sub2:vreg_96 = COPY $vgpr2
undef %3.sub0:areg_96 = COPY %0.sub0
@@ -1443,26 +2353,26 @@ body: |
...
---
-name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_96_align2 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_96 =COPY $vgpr0
- %0.sub1:vreg_96 = COPY $vgpr1
- %0.sub2:vreg_96 = COPY $vgpr2
+ undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+ %0.sub1:vreg_96_align2 = COPY $vgpr1
+ %0.sub2:vreg_96_align2 = COPY $vgpr2
undef %3.sub0:areg_96_align2 = COPY %0.sub0
%3.sub1:areg_96_align2 = COPY %0.sub1
%3.sub2:areg_96_align2 = COPY %0.sub2
@@ -1472,13 +2382,13 @@ body: |
...
---
-name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_both_sub_snop
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
@@ -1487,7 +2397,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY]].sub2_sub3
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
+ undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
%0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128 = COPY %0.sub2_sub3
@@ -1496,26 +2406,24 @@ body: |
...
-
-
---
-name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_both_sub_snop
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY]].sub2_sub3
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0_sub1:vreg_128 =COPY $vgpr0_vgpr1
- %0.sub2_sub3:vreg_128 = COPY $vgpr2_vgpr3
+ undef %0.sub0_sub1:vreg_128_align2 = COPY $vgpr0_vgpr1
+ %0.sub2_sub3:vreg_128_align2 = COPY $vgpr2_vgpr3
undef %2.sub0_sub1:areg_128_align2 = COPY %0.sub0_sub1
%2.sub2_sub3:areg_128_align2 = COPY %0.sub2_sub3
S_NOP 0, implicit %2
@@ -1524,13 +2432,13 @@ body: |
...
---
-name: copy_sgpr32_to_areg64_align2_sub_snop
+name: copy_sgpr32_to_areg64_align2_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr8, $sgpr9
- ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_sub_snop
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_both_sub_snop
; CHECK: liveins: $sgpr8, $sgpr9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:sreg_64 = COPY $sgpr8
@@ -1549,13 +2457,13 @@ body: |
...
---
-name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_both_sub_snop
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
@@ -1564,7 +2472,7 @@ body: |
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub1_sub2
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_96 =COPY $vgpr0
+ undef %0.sub0:vreg_96 = COPY $vgpr0
%0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96 = COPY %0.sub0
%2.sub1_sub2:areg_96 = COPY %0.sub1_sub2
@@ -1573,24 +2481,49 @@ body: |
...
+---
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_shuffle_both_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_shuffle_both_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
+    ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0:vreg_96 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %2.sub0:areg_96 = COPY %0.sub2
+ %2.sub1_sub2:areg_96 = COPY %0.sub0_sub1
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+
---
-name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_both_sub_snop
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY]].sub1_sub2
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_96 =COPY $vgpr0
- %0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ undef %0.sub0:vreg_96_align2 = COPY $vgpr0
+ %0.sub1_sub2:vreg_96_align2 = COPY $vgpr1_vgpr2
undef %2.sub0:areg_96_align2 = COPY %0.sub0
%2.sub1_sub2:areg_96_align2 = COPY %0.sub1_sub2
S_NOP 0, implicit %2
@@ -1599,13 +2532,13 @@ body: |
...
---
-name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_both_sub_snop
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
@@ -1624,13 +2557,38 @@ body: |
...
---
-name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2
- ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_sub_snop
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_both_sub_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:vreg_96_align2 = COPY $vgpr2
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ undef %0.sub0_sub1:vreg_96_align2 = COPY $vgpr0_vgpr1
+ %0.sub2:vreg_96_align2 = COPY $vgpr2
+ undef %2.sub0_sub1:areg_96_align2 = COPY %0.sub0_sub1
+ %2.sub2:areg_96_align2 = COPY %0.sub2
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+    liveins: $vgpr0_vgpr1, $vgpr2
+
+    ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_both_sub_snop
+    ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:vreg_96 = COPY $vgpr0_vgpr1
@@ -1648,127 +2606,171 @@ body: |
...
+
---
-name: copy_vgpr32_x2_to_areg64_sub_snop
+name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg_snop
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_sub_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_whole_reg_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64 = COPY [[COPY]]
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_64 = COPY $vgpr0
- undef %2.sub0:areg_64 = COPY %0.sub0
- %2.sub1:areg_64 = COPY %0.sub0
+ %0:vreg_64 = COPY $vgpr0_vgpr1
+ %2:areg_64 = COPY %0
S_NOP 0, implicit %2
SI_RETURN
...
---
-name: copy_vgpr32_x3_to_areg96_sub_snop
+name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg_snop
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_sub_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_whole_reg_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_64 = COPY $vgpr0
- undef %1.sub0:areg_96 = COPY %0.sub0
- %1.sub1:areg_96 = COPY %0.sub0
- %1.sub2:areg_96 = COPY %0.sub0
- S_NOP 0, implicit %1
+ %0:vreg_64_align2 = COPY $vgpr0_vgpr1
+ %2:areg_64_align2 = COPY %0
+ S_NOP 0, implicit %2
SI_RETURN
...
---
-name: copy_vgpr32_x3_to_areg96_align2_sub_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg_snop
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_sub_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_whole_reg_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96 = COPY [[COPY]]
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_64 = COPY $vgpr0
- undef %1.sub0:areg_96_align2 = COPY %0.sub0
- %1.sub1:areg_96_align2 = COPY %0.sub0
- S_NOP 0, implicit %1
+ %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ %3:areg_96 = COPY %0
+ S_NOP 0, implicit %3
SI_RETURN
...
---
-name: copy_vgpr32_x4_to_areg128_sub_snop
+name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg_snop
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_sub_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_whole_reg_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_64 = COPY $vgpr0
- undef %1.sub0:areg_128 = COPY %0.sub0
- %1.sub1:areg_128 = COPY %0.sub0
- %1.sub2:areg_128 = COPY %0.sub0
- %1.sub3:areg_128 = COPY %0.sub0
- S_NOP 0, implicit %1
+ %0:vreg_96_align2 = COPY $vgpr0_vgpr1_vgpr2
+ %3:areg_96_align2 = COPY %0
+ S_NOP 0, implicit %3
SI_RETURN
...
---
-name: copy_vgpr32_x4_to_areg128_align2_sub_snop
+name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg_snop
tracksRegLiveness: true
body: |
bb.0:
- liveins: $vgpr0
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_sub_snop
- ; CHECK: liveins: $vgpr0
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_whole_reg_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]].sub0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128 = COPY [[COPY]]
; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
; CHECK-NEXT: SI_RETURN
- undef %0.sub0:vreg_64 = COPY $vgpr0
- undef %1.sub0:areg_128_align2 = COPY %0.sub0
- %1.sub1:areg_128_align2 = COPY %0.sub0
- %1.sub2:areg_128_align2 = COPY %0.sub0
- %1.sub3:areg_128_align2 = COPY %0.sub0
- S_NOP 0, implicit %1
+ %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %2:areg_128 = COPY %0
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+
+ ; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_whole_reg_snop
+ ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_128_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_128_align2 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
+ %2:areg_128_align2 = COPY %0
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_sgpr32_to_areg64_align2_whole_reg_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr8, $sgpr9
+
+ ; CHECK-LABEL: name: copy_sgpr32_to_areg64_align2_whole_reg_snop
+ ; CHECK: liveins: $sgpr8, $sgpr9
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr8_sgpr9
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_64_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:sreg_64 = COPY $sgpr8_sgpr9
+ %2:areg_64_align2 = COPY %0
+ S_NOP 0, implicit %2
+ SI_RETURN
+
+...
+
+---
+name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg_snop
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ ; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_mismatch_whole_reg_snop
+ ; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:areg_96_align2 = COPY [[COPY]]
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: SI_RETURN
+ %0:vreg_96 = COPY $vgpr0_vgpr1_vgpr2
+ %2:areg_96_align2 = COPY %0
+ S_NOP 0, implicit %2
SI_RETURN
...
>From 72a9bfbecd3874d466f3e924c6183c523dc7cc81 Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 25 Mar 2025 15:36:55 -0700
Subject: [PATCH 4/4] [RegisterCoalescer]: Try inflated RC for coalescing
reg->subreg
Change-Id: Id1123431e9fdcabac2811f5b18abff7d66e2d1f3
---
.../llvm/CodeGen/MachineRegisterInfo.h | 9 +-
llvm/lib/CodeGen/MachineRegisterInfo.cpp | 16 +-
llvm/lib/CodeGen/RegisterCoalescer.cpp | 9 +-
.../coalesce-copy-to-agpr-to-av-registers.mir | 155 +-
...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 300 +-
.../AMDGPU/llvm.amdgcn.smfmac.gfx950.ll | 3266 ++++++++++-------
.../AMDGPU/mfma-no-register-aliasing.ll | 706 ++--
7 files changed, 2568 insertions(+), 1893 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 1c465741cb462..dc49733e2a59b 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -738,7 +738,14 @@ class MachineRegisterInfo {
/// recomputeRegClass - Try to find a legal super-class of Reg's register
/// class that still satisfies the constraints from the instructions using
- /// Reg. Returns true if Reg was upgraded.
+ /// \p Reg. \p return the super-class TargetRegisterClass if one was found,
+ /// otherwise \p return the original TargetRegisterClass.
+ const TargetRegisterClass *
+ getLargestConstrainedSuperClass(Register Reg) const;
+
+ /// recomputeRegClass - Try to find a legal super-class of Reg's register
+ /// class that still satisfies the constraints from the instructions using
+ /// \p Reg. \p return true if Reg was upgraded.
///
/// This method can be used after constraints have been removed from a
/// virtual register, for example after removing instructions or splitting
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index 937f63f6c5e00..d483625212c55 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -118,8 +118,8 @@ MachineRegisterInfo::constrainRegAttrs(Register Reg,
return true;
}
-bool
-MachineRegisterInfo::recomputeRegClass(Register Reg) {
+const TargetRegisterClass *
+MachineRegisterInfo::getLargestConstrainedSuperClass(Register Reg) const {
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
const TargetRegisterClass *OldRC = getRegClass(Reg);
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
@@ -127,7 +127,7 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) {
// Stop early if there is no room to grow.
if (NewRC == OldRC)
- return false;
+ return NewRC;
// Accumulate constraints from all uses.
for (MachineOperand &MO : reg_nodbg_operands(Reg)) {
@@ -136,8 +136,16 @@ MachineRegisterInfo::recomputeRegClass(Register Reg) {
unsigned OpNo = &MO - &MI->getOperand(0);
NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII, TRI);
if (!NewRC || NewRC == OldRC)
- return false;
+ return OldRC;
}
+ return NewRC;
+}
+
+bool MachineRegisterInfo::recomputeRegClass(Register Reg) {
+ const TargetRegisterClass *OldRC = getRegClass(Reg);
+ const TargetRegisterClass *NewRC = getLargestConstrainedSuperClass(Reg);
+ if (NewRC == OldRC)
+ return false;
setRegClass(Reg, NewRC);
return true;
}
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index dbd354f2ca2c4..b2e83beae6f62 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -477,7 +477,9 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
Flipped = true;
}
- const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
+ const MachineFunction *MF = MI->getMF();
+
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
if (Dst.isPhysical()) {
@@ -515,6 +517,11 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
// SrcReg will be merged with a sub-register of DstReg.
SrcIdx = DstSub;
NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ if (!NewRC) {
+ auto SuperDstRC = MRI.getLargestConstrainedSuperClass(Dst);
+ if (SuperDstRC != DstRC)
+ NewRC = TRI.getMatchingSuperRegClass(SuperDstRC, SrcRC, DstSub);
+ }
} else if (SrcSub) {
// DstReg will be merged with a sub-register of SrcReg.
DstIdx = SrcSub;
diff --git a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
index 2e7f805798120..6786c6712ad3e 100644
--- a/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
+++ b/llvm/test/CodeGen/AMDGPU/coalesce-copy-to-agpr-to-av-registers.mir
@@ -1148,10 +1148,10 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
+  ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub0_sub1
; CHECK-NEXT: INLINEASM &"; use $0", 0 /* attdialect */, 4587529 /* reguse:AReg_96 */, [[COPY2]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 = COPY $vgpr0
@@ -1442,11 +1442,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_to_areg64_coalesce_with_av64_snop
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64 = COPY $vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
@@ -1467,11 +1465,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_to_areg64_align2_coalesce_with_av64_align2_snop
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64_align2 = COPY $vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
@@ -1492,13 +1488,10 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_snop
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96 = COPY [[COPY2]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96 = COPY $vgpr2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
@@ -1521,13 +1514,10 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_to_areg96_coalesce_with_av96_align2_snop
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY2]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY3]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96_align2 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96_align2 = COPY $vgpr2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
@@ -1550,11 +1540,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr64_to_areg64_coalesce_with_av128_snop
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_128 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:av_128 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -1575,11 +1563,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr64_to_areg64_align2_coalesce_with_av128_align2_snop
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2_sub3:areg_128_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_128_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2_sub3:av_128_align2 = COPY $vgpr2_vgpr3
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vreg_64 = COPY $vgpr2_vgpr3
@@ -1625,11 +1611,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_snop
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:av_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vreg_64 = COPY $vgpr1_vgpr2
@@ -1650,11 +1634,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96_align2_snop
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:av_96_align2 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vreg_64 = COPY $vgpr1_vgpr2
@@ -1675,11 +1657,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_snop
; CHECK: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_96 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96 = COPY $vgpr2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vgpr_32 = COPY $vgpr2
@@ -1700,11 +1680,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr64_vgpr32_to_areg96_coalesce_with_av96_align2_snop
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:areg_96_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0_sub1:av_96_align2 = COPY $vgpr0_vgpr1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_96_align2 = COPY $vgpr2
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vreg_64 = COPY $vgpr0_vgpr1
%1:vgpr_32 = COPY $vgpr2
@@ -1725,10 +1703,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_snop
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_64 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_64 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %2.sub0:areg_64 = COPY %0
@@ -1748,11 +1725,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_x2_to_areg64_coalesce_with_av64_align2_snop
; CHECK: liveins: $vgpr0, $vgpr1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_64_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:areg_64_align2 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_64_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_64_align2 = COPY $vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = COPY $vgpr1
@@ -1773,10 +1748,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_snop
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_96 = COPY %0
@@ -1796,10 +1770,9 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_x3_to_areg96_align2_snop
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_96_align2 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_96_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_96_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_96_align2 = COPY %0
@@ -1819,12 +1792,11 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_snop
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_128 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:av_128 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128 = COPY %0
@@ -1846,12 +1818,11 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_x4_to_areg128_align2_snop
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub2:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub3:areg_128_align2 = COPY [[COPY]]
- ; CHECK-NEXT: S_NOP 0, implicit [[COPY1]]
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:av_128_align2 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:av_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub2:av_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub3:av_128_align2 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY]]
; CHECK-NEXT: SI_RETURN
%0:vgpr_32 = COPY $vgpr0
undef %1.sub0:areg_128_align2 = COPY %0
@@ -2490,11 +2461,11 @@ body: |
; CHECK-LABEL: name: copy_vgpr32_vgpr64_to_areg96_coalesce_with_av96
; CHECK: liveins: $vgpr0, $vgpr1_vgpr2
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
- ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY1]]
- ; CHECK-NEXT: S_NOP 0, implicit %2
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub0:vreg_96 = COPY $vgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:areg_96 = COPY [[COPY]].sub2
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1_sub2:areg_96 = COPY [[COPY]].sub0_sub1
+ ; CHECK-NEXT: S_NOP 0, implicit [[COPY2]]
; CHECK-NEXT: SI_RETURN
undef %0.sub0:vreg_96 = COPY $vgpr0
%0.sub1_sub2:vreg_96 = COPY $vgpr1_vgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index dac54c9f85e96..1ab7de7cda935 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3515,42 +3515,56 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v24, s0
-; SDAG-NEXT: v_mov_b32_e32 v25, s1
-; SDAG-NEXT: v_mov_b32_e32 v26, s2
-; SDAG-NEXT: v_mov_b32_e32 v27, s3
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_mov_b32_e32 v16, s20
-; SDAG-NEXT: v_mov_b32_e32 v17, s21
-; SDAG-NEXT: v_mov_b32_e32 v18, s22
-; SDAG-NEXT: v_mov_b32_e32 v19, s23
-; SDAG-NEXT: v_mov_b32_e32 v20, s24
-; SDAG-NEXT: v_mov_b32_e32 v21, s25
-; SDAG-NEXT: v_mov_b32_e32 v22, s26
-; SDAG-NEXT: v_mov_b32_e32 v23, s27
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v33
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v0
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v1
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v2
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v3
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v4
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v5
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v6
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v7
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: v_mov_b32_e32 v32, s0
+; SDAG-NEXT: v_mov_b32_e32 v33, s1
+; SDAG-NEXT: v_mov_b32_e32 v34, s2
+; SDAG-NEXT: v_mov_b32_e32 v35, s3
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s28
+; SDAG-NEXT: v_mov_b32_e32 v31, v13
+; SDAG-NEXT: v_mov_b32_e32 v30, v12
+; SDAG-NEXT: v_mov_b32_e32 v29, v11
+; SDAG-NEXT: v_mov_b32_e32 v28, v10
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
+; SDAG-NEXT: v_mov_b32_e32 v17, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_mov_b32_e32 v0, s20
+; SDAG-NEXT: v_mov_b32_e32 v1, s21
+; SDAG-NEXT: v_mov_b32_e32 v2, s22
+; SDAG-NEXT: v_mov_b32_e32 v3, s23
+; SDAG-NEXT: v_mov_b32_e32 v4, s24
+; SDAG-NEXT: v_mov_b32_e32 v5, s25
+; SDAG-NEXT: v_mov_b32_e32 v6, s26
+; SDAG-NEXT: v_mov_b32_e32 v7, s27
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v31
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[24:31], v[16:23], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -3579,34 +3593,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgprs(<8 x i32> inr
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v32, s28
-; GISEL-NEXT: v_mov_b32_e32 v33, s29
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[26:27]
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[24:25]
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[22:23]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[20:21]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v32
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v33
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v0
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v1
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v2
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v3
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v4
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v5
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v6
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v7
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v20, v2
+; GISEL-NEXT: v_mov_b32_e32 v21, v3
+; GISEL-NEXT: v_mov_b32_e32 v22, v4
+; GISEL-NEXT: v_mov_b32_e32 v23, v5
+; GISEL-NEXT: v_mov_b32_e32 v24, v6
+; GISEL-NEXT: v_mov_b32_e32 v25, v7
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
+; GISEL-NEXT: v_mov_b32_e32 v16, s28
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v28, v10
+; GISEL-NEXT: v_mov_b32_e32 v29, v11
+; GISEL-NEXT: v_mov_b32_e32 v30, v12
+; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v17, s29
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[26:27]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[20:21]
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v31
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[24:31], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[32:39], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
@@ -3843,6 +3871,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_mov_b32_e32 v26, s0
; SDAG-NEXT: v_mov_b32_e32 v27, s1
; SDAG-NEXT: v_mov_b32_e32 v28, s2
@@ -3851,7 +3880,6 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; SDAG-NEXT: v_mov_b32_e32 v31, s17
; SDAG-NEXT: v_mov_b32_e32 v32, s18
; SDAG-NEXT: v_mov_b32_e32 v33, s19
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v8
; SDAG-NEXT: v_accvgpr_write_b32 a1, v9
; SDAG-NEXT: v_accvgpr_write_b32 a2, v10
; SDAG-NEXT: v_accvgpr_write_b32 a3, v11
@@ -3898,10 +3926,10 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_vgpr_sgpr_vgpr__vgp
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[18:19]
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[16:17]
; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[14:15]
; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[12:13]
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v8
; GISEL-NEXT: v_accvgpr_write_b32 a1, v9
; GISEL-NEXT: v_accvgpr_write_b32 a2, v10
; GISEL-NEXT: v_accvgpr_write_b32 a3, v11
@@ -3993,42 +4021,48 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgpr_sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v16, s0
-; SDAG-NEXT: v_mov_b32_e32 v17, s1
-; SDAG-NEXT: v_mov_b32_e32 v18, s2
-; SDAG-NEXT: v_mov_b32_e32 v19, s3
-; SDAG-NEXT: v_mov_b32_e32 v20, s16
-; SDAG-NEXT: v_mov_b32_e32 v21, s17
-; SDAG-NEXT: v_mov_b32_e32 v22, s18
-; SDAG-NEXT: v_mov_b32_e32 v23, s19
-; SDAG-NEXT: v_mov_b32_e32 v24, s20
-; SDAG-NEXT: v_mov_b32_e32 v25, s21
-; SDAG-NEXT: v_mov_b32_e32 v26, s22
-; SDAG-NEXT: v_mov_b32_e32 v27, s23
-; SDAG-NEXT: v_mov_b32_e32 v28, s24
-; SDAG-NEXT: v_mov_b32_e32 v29, s25
-; SDAG-NEXT: v_mov_b32_e32 v30, s26
-; SDAG-NEXT: v_mov_b32_e32 v31, s27
-; SDAG-NEXT: v_mov_b32_e32 v32, s28
-; SDAG-NEXT: v_mov_b32_e32 v33, s29
-; SDAG-NEXT: v_accvgpr_write_b32 a0, v24
-; SDAG-NEXT: v_accvgpr_write_b32 a1, v25
-; SDAG-NEXT: v_accvgpr_write_b32 a2, v26
-; SDAG-NEXT: v_accvgpr_write_b32 a3, v27
-; SDAG-NEXT: v_accvgpr_write_b32 a4, v28
-; SDAG-NEXT: v_accvgpr_write_b32 a5, v29
-; SDAG-NEXT: v_accvgpr_write_b32 a6, v30
-; SDAG-NEXT: v_accvgpr_write_b32 a7, v31
-; SDAG-NEXT: v_accvgpr_write_b32 a8, v32
-; SDAG-NEXT: v_accvgpr_write_b32 a9, v33
-; SDAG-NEXT: v_accvgpr_write_b32 a10, v8
-; SDAG-NEXT: v_accvgpr_write_b32 a11, v9
-; SDAG-NEXT: v_accvgpr_write_b32 a12, v10
-; SDAG-NEXT: v_accvgpr_write_b32 a13, v11
-; SDAG-NEXT: v_accvgpr_write_b32 a14, v12
-; SDAG-NEXT: v_accvgpr_write_b32 a15, v13
+; SDAG-NEXT: v_mov_b32_e32 v32, s0
+; SDAG-NEXT: v_mov_b32_e32 v33, s1
+; SDAG-NEXT: v_mov_b32_e32 v34, s2
+; SDAG-NEXT: v_mov_b32_e32 v35, s3
+; SDAG-NEXT: v_mov_b32_e32 v36, s16
+; SDAG-NEXT: v_mov_b32_e32 v37, s17
+; SDAG-NEXT: v_mov_b32_e32 v38, s18
+; SDAG-NEXT: v_mov_b32_e32 v39, s19
+; SDAG-NEXT: v_mov_b32_e32 v16, s20
+; SDAG-NEXT: v_mov_b32_e32 v31, v13
+; SDAG-NEXT: v_mov_b32_e32 v30, v12
+; SDAG-NEXT: v_mov_b32_e32 v29, v11
+; SDAG-NEXT: v_mov_b32_e32 v28, v10
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v17, s21
+; SDAG-NEXT: v_mov_b32_e32 v18, s22
+; SDAG-NEXT: v_mov_b32_e32 v19, s23
+; SDAG-NEXT: v_mov_b32_e32 v20, s24
+; SDAG-NEXT: v_mov_b32_e32 v21, s25
+; SDAG-NEXT: v_mov_b32_e32 v22, s26
+; SDAG-NEXT: v_mov_b32_e32 v23, s27
+; SDAG-NEXT: v_mov_b32_e32 v24, s28
+; SDAG-NEXT: v_mov_b32_e32 v25, s29
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v27
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v28
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v29
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v30
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v31
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
@@ -4057,38 +4091,44 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4_0_0_sgpr_vgpr_sgpr__vgp
; GISEL-NEXT: s_mov_b32 s13, s1
; GISEL-NEXT: s_mov_b32 s14, s2
; GISEL-NEXT: s_mov_b32 s15, s3
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[16:17]
-; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[14:15]
-; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[12:13]
-; GISEL-NEXT: v_mov_b32_e32 v24, s20
-; GISEL-NEXT: v_mov_b32_e32 v25, s21
-; GISEL-NEXT: v_mov_b32_e32 v26, s22
-; GISEL-NEXT: v_mov_b32_e32 v27, s23
-; GISEL-NEXT: v_mov_b32_e32 v28, s24
-; GISEL-NEXT: v_mov_b32_e32 v29, s25
-; GISEL-NEXT: v_mov_b32_e32 v30, s26
-; GISEL-NEXT: v_mov_b32_e32 v31, s27
-; GISEL-NEXT: v_mov_b32_e32 v32, s28
-; GISEL-NEXT: v_mov_b32_e32 v33, s29
-; GISEL-NEXT: v_accvgpr_write_b32 a0, v24
-; GISEL-NEXT: v_accvgpr_write_b32 a1, v25
-; GISEL-NEXT: v_accvgpr_write_b32 a2, v26
-; GISEL-NEXT: v_accvgpr_write_b32 a3, v27
-; GISEL-NEXT: v_accvgpr_write_b32 a4, v28
-; GISEL-NEXT: v_accvgpr_write_b32 a5, v29
-; GISEL-NEXT: v_accvgpr_write_b32 a6, v30
-; GISEL-NEXT: v_accvgpr_write_b32 a7, v31
-; GISEL-NEXT: v_accvgpr_write_b32 a8, v32
-; GISEL-NEXT: v_accvgpr_write_b32 a9, v33
-; GISEL-NEXT: v_accvgpr_write_b32 a10, v8
-; GISEL-NEXT: v_accvgpr_write_b32 a11, v9
-; GISEL-NEXT: v_accvgpr_write_b32 a12, v10
-; GISEL-NEXT: v_accvgpr_write_b32 a13, v11
-; GISEL-NEXT: v_accvgpr_write_b32 a14, v12
-; GISEL-NEXT: v_accvgpr_write_b32 a15, v13
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[12:13]
+; GISEL-NEXT: v_mov_b32_e32 v16, s20
+; GISEL-NEXT: v_mov_b32_e32 v26, v8
+; GISEL-NEXT: v_mov_b32_e32 v27, v9
+; GISEL-NEXT: v_mov_b32_e32 v28, v10
+; GISEL-NEXT: v_mov_b32_e32 v29, v11
+; GISEL-NEXT: v_mov_b32_e32 v30, v12
+; GISEL-NEXT: v_mov_b32_e32 v31, v13
+; GISEL-NEXT: v_mov_b32_e32 v17, s21
+; GISEL-NEXT: v_mov_b32_e32 v18, s22
+; GISEL-NEXT: v_mov_b32_e32 v19, s23
+; GISEL-NEXT: v_mov_b32_e32 v20, s24
+; GISEL-NEXT: v_mov_b32_e32 v21, s25
+; GISEL-NEXT: v_mov_b32_e32 v22, s26
+; GISEL-NEXT: v_mov_b32_e32 v23, s27
+; GISEL-NEXT: v_mov_b32_e32 v24, s28
+; GISEL-NEXT: v_mov_b32_e32 v25, s29
+; GISEL-NEXT: v_accvgpr_write_b32 a0, v16
+; GISEL-NEXT: v_accvgpr_write_b32 a1, v17
+; GISEL-NEXT: v_accvgpr_write_b32 a2, v18
+; GISEL-NEXT: v_accvgpr_write_b32 a3, v19
+; GISEL-NEXT: v_accvgpr_write_b32 a4, v20
+; GISEL-NEXT: v_accvgpr_write_b32 a5, v21
+; GISEL-NEXT: v_accvgpr_write_b32 a6, v22
+; GISEL-NEXT: v_accvgpr_write_b32 a7, v23
+; GISEL-NEXT: v_accvgpr_write_b32 a8, v24
+; GISEL-NEXT: v_accvgpr_write_b32 a9, v25
+; GISEL-NEXT: v_accvgpr_write_b32 a10, v26
+; GISEL-NEXT: v_accvgpr_write_b32 a11, v27
+; GISEL-NEXT: v_accvgpr_write_b32 a12, v28
+; GISEL-NEXT: v_accvgpr_write_b32 a13, v29
+; GISEL-NEXT: v_accvgpr_write_b32 a14, v30
+; GISEL-NEXT: v_accvgpr_write_b32 a15, v31
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[16:23], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
+; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[32:39], v[0:7], a[0:15], v14, v15 op_sel_hi:[0,0,0]
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 7
; GISEL-NEXT: s_nop 3
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
index 77d4aad5f3174..08167cd226a4c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll
@@ -72,46 +72,94 @@ bb:
}
define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x64_f16:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x64_f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x64_f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -252,25 +300,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16:
@@ -288,22 +353,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16(<8 x half> %arg0, <16 x half>
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -315,25 +372,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags0:
@@ -351,22 +425,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags0(<8 x half> %arg0, <16
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -378,25 +444,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__flags1:
@@ -414,22 +497,14 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__flags1(<8 x half> %arg0, <16
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -441,94 +516,108 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0,
; SDAG-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s0
+; SDAG-NEXT: v_mov_b32_e32 v29, s1
+; SDAG-NEXT: v_mov_b32_e32 v30, s2
+; SDAG-NEXT: v_mov_b32_e32 v31, s3
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: v_smfmac_f32_32x32x32_f16 a[0:15], v[28:31], v[0:7], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x32_f16__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
+; GISEL-NEXT: v_mov_b32_e32 v24, v0
+; GISEL-NEXT: v_mov_b32_e32 v25, v1
+; GISEL-NEXT: v_mov_b32_e32 v26, v2
+; GISEL-NEXT: v_mov_b32_e32 v27, v3
+; GISEL-NEXT: v_mov_b32_e32 v28, v4
+; GISEL-NEXT: v_mov_b32_e32 v29, v5
+; GISEL-NEXT: v_mov_b32_e32 v30, v6
+; GISEL-NEXT: v_mov_b32_e32 v31, v7
+; GISEL-NEXT: v_mov_b32_e32 v32, v8
+; GISEL-NEXT: v_mov_b32_e32 v33, v9
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v20, s26
+; GISEL-NEXT: v_mov_b32_e32 v21, s27
+; GISEL-NEXT: v_mov_b32_e32 v22, s28
+; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT: v_smfmac_f32_32x32x32_f16 v[0:15], v[34:37], v[48:55], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half> %arg0, <16 x half> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -579,12 +668,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bflo
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16
; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
@@ -594,12 +688,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <1
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
@@ -609,12 +708,17 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <1
; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
@@ -705,25 +809,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfl
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -733,25 +854,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <16 x float> %result
@@ -761,25 +899,42 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <16 x float> %result
@@ -789,54 +944,70 @@ define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg
; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v36, s0
-; GCN-NEXT: v_mov_b32_e32 v37, s1
-; GCN-NEXT: v_mov_b32_e32 v38, s2
-; GCN-NEXT: v_mov_b32_e32 v39, s3
+; GCN-NEXT: v_mov_b32_e32 v28, s0
+; GCN-NEXT: v_mov_b32_e32 v29, s1
+; GCN-NEXT: v_mov_b32_e32 v30, s2
+; GCN-NEXT: v_mov_b32_e32 v31, s3
+; GCN-NEXT: v_mov_b32_e32 v12, s24
+; GCN-NEXT: v_mov_b32_e32 v27, v9
+; GCN-NEXT: v_mov_b32_e32 v26, v8
+; GCN-NEXT: v_mov_b32_e32 v25, v7
+; GCN-NEXT: v_mov_b32_e32 v24, v6
+; GCN-NEXT: v_mov_b32_e32 v23, v5
+; GCN-NEXT: v_mov_b32_e32 v22, v4
+; GCN-NEXT: v_mov_b32_e32 v21, v3
+; GCN-NEXT: v_mov_b32_e32 v20, v2
+; GCN-NEXT: v_mov_b32_e32 v19, v1
+; GCN-NEXT: v_mov_b32_e32 v18, v0
; GCN-NEXT: v_mov_b32_e32 v13, s25
; GCN-NEXT: v_mov_b32_e32 v14, s26
; GCN-NEXT: v_mov_b32_e32 v15, s27
; GCN-NEXT: v_mov_b32_e32 v16, s28
; GCN-NEXT: v_mov_b32_e32 v17, s29
-; GCN-NEXT: v_mov_b32_e32 v28, s16
-; GCN-NEXT: v_mov_b32_e32 v29, s17
-; GCN-NEXT: v_mov_b32_e32 v30, s18
-; GCN-NEXT: v_mov_b32_e32 v31, s19
-; GCN-NEXT: v_mov_b32_e32 v32, s20
-; GCN-NEXT: v_mov_b32_e32 v33, s21
-; GCN-NEXT: v_mov_b32_e32 v34, s22
-; GCN-NEXT: v_mov_b32_e32 v35, s23
-; GCN-NEXT: v_mov_b32_e32 v12, s24
-; GCN-NEXT: v_mov_b32_e32 v18, v0
-; GCN-NEXT: v_mov_b32_e32 v19, v1
-; GCN-NEXT: v_mov_b32_e32 v20, v2
-; GCN-NEXT: v_mov_b32_e32 v21, v3
-; GCN-NEXT: v_mov_b32_e32 v22, v4
-; GCN-NEXT: v_mov_b32_e32 v23, v5
-; GCN-NEXT: v_mov_b32_e32 v24, v6
-; GCN-NEXT: v_mov_b32_e32 v25, v7
-; GCN-NEXT: v_mov_b32_e32 v26, v8
-; GCN-NEXT: v_mov_b32_e32 v27, v9
+; GCN-NEXT: v_accvgpr_write_b32 a0, v12
+; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: v_mov_b32_e32 v1, s17
+; GCN-NEXT: v_mov_b32_e32 v2, s18
+; GCN-NEXT: v_mov_b32_e32 v3, s19
+; GCN-NEXT: v_mov_b32_e32 v4, s20
+; GCN-NEXT: v_mov_b32_e32 v5, s21
+; GCN-NEXT: v_mov_b32_e32 v6, s22
+; GCN-NEXT: v_mov_b32_e32 v7, s23
+; GCN-NEXT: v_accvgpr_write_b32 a1, v13
+; GCN-NEXT: v_accvgpr_write_b32 a2, v14
+; GCN-NEXT: v_accvgpr_write_b32 a3, v15
+; GCN-NEXT: v_accvgpr_write_b32 a4, v16
+; GCN-NEXT: v_accvgpr_write_b32 a5, v17
+; GCN-NEXT: v_accvgpr_write_b32 a6, v18
+; GCN-NEXT: v_accvgpr_write_b32 a7, v19
+; GCN-NEXT: v_accvgpr_write_b32 a8, v20
+; GCN-NEXT: v_accvgpr_write_b32 a9, v21
+; GCN-NEXT: v_accvgpr_write_b32 a10, v22
+; GCN-NEXT: v_accvgpr_write_b32 a11, v23
+; GCN-NEXT: v_accvgpr_write_b32 a12, v24
+; GCN-NEXT: v_accvgpr_write_b32 a13, v25
+; GCN-NEXT: v_accvgpr_write_b32 a14, v26
+; GCN-NEXT: v_accvgpr_write_b32 a15, v27
; GCN-NEXT: s_nop 1
-; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[12:27], v[36:39], v[28:35], v10
+; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10
; GCN-NEXT: s_nop 7
; GCN-NEXT: s_nop 3
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: v_mov_b32_e32 v4, v16
-; GCN-NEXT: v_mov_b32_e32 v5, v17
-; GCN-NEXT: v_mov_b32_e32 v6, v18
-; GCN-NEXT: v_mov_b32_e32 v7, v19
-; GCN-NEXT: v_mov_b32_e32 v8, v20
-; GCN-NEXT: v_mov_b32_e32 v9, v21
-; GCN-NEXT: v_mov_b32_e32 v10, v22
-; GCN-NEXT: v_mov_b32_e32 v11, v23
-; GCN-NEXT: v_mov_b32_e32 v12, v24
-; GCN-NEXT: v_mov_b32_e32 v13, v25
-; GCN-NEXT: v_mov_b32_e32 v14, v26
-; GCN-NEXT: v_mov_b32_e32 v15, v27
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0
+; GCN-NEXT: v_accvgpr_read_b32 v1, a1
+; GCN-NEXT: v_accvgpr_read_b32 v2, a2
+; GCN-NEXT: v_accvgpr_read_b32 v3, a3
+; GCN-NEXT: v_accvgpr_read_b32 v4, a4
+; GCN-NEXT: v_accvgpr_read_b32 v5, a5
+; GCN-NEXT: v_accvgpr_read_b32 v6, a6
+; GCN-NEXT: v_accvgpr_read_b32 v7, a7
+; GCN-NEXT: v_accvgpr_read_b32 v8, a8
+; GCN-NEXT: v_accvgpr_read_b32 v9, a9
+; GCN-NEXT: v_accvgpr_read_b32 v10, a10
+; GCN-NEXT: v_accvgpr_read_b32 v11, a11
+; GCN-NEXT: v_accvgpr_read_b32 v12, a12
+; GCN-NEXT: v_accvgpr_read_b32 v13, a13
+; GCN-NEXT: v_accvgpr_read_b32 v14, a14
+; GCN-NEXT: v_accvgpr_read_b32 v15, a15
; GCN-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -916,46 +1087,94 @@ bb:
}
define <4 x i32> @test_smfmac_i32_16x16x128_i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_16x16x128_i8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_i32_16x16x128_i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_16x16x128_i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x i32> %result
}
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x i32> %result
}
define <4 x i32> @test_smfmac_i32_16x16x128_i8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_16x16x128_i8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_i32_16x16x128_i8__flags1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_i32_16x16x128_i8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x128.i8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x i32> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x i32> %result
}
@@ -1102,25 +1321,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8:
@@ -1138,22 +1374,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8(<4 x i32> %arg0, <8 x i32> %arg1,
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1165,25 +1393,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags0:
@@ -1201,22 +1446,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags0(<4 x i32> %arg0, <8 x i32
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1228,25 +1465,42 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__flags1:
@@ -1264,22 +1518,14 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__flags1(<4 x i32> %arg0, <8 x i32
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -1291,94 +1537,108 @@ define <16 x i32> @test_smfmac_i32_32x32x64_i8__sgpr(<4 x i32> inreg %arg0, <8 x
; SDAG-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s0
+; SDAG-NEXT: v_mov_b32_e32 v29, s1
+; SDAG-NEXT: v_mov_b32_e32 v30, s2
+; SDAG-NEXT: v_mov_b32_e32 v31, s3
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: v_smfmac_i32_32x32x64_i8 a[0:15], v[28:31], v[0:7], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_i32_32x32x64_i8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
+; GISEL-NEXT: v_mov_b32_e32 v24, v0
+; GISEL-NEXT: v_mov_b32_e32 v25, v1
+; GISEL-NEXT: v_mov_b32_e32 v26, v2
+; GISEL-NEXT: v_mov_b32_e32 v27, v3
+; GISEL-NEXT: v_mov_b32_e32 v28, v4
+; GISEL-NEXT: v_mov_b32_e32 v29, v5
+; GISEL-NEXT: v_mov_b32_e32 v30, v6
+; GISEL-NEXT: v_mov_b32_e32 v31, v7
+; GISEL-NEXT: v_mov_b32_e32 v32, v8
+; GISEL-NEXT: v_mov_b32_e32 v33, v9
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v20, s26
+; GISEL-NEXT: v_mov_b32_e32 v21, s27
+; GISEL-NEXT: v_mov_b32_e32 v22, s28
+; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT: v_smfmac_i32_32x32x64_i8 v[0:15], v[34:37], v[48:55], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x64.i8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x i32> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x i32> %result
@@ -1458,46 +1718,94 @@ bb:
}
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_bf8__flags1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -1627,46 +1935,94 @@ bb:
}
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -1796,46 +2152,94 @@ bb:
}
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -1965,46 +2369,94 @@ bb:
}
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
ret <4 x float> %result
}
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
-; GCN-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
-; GCN-NEXT: s_nop 7
-; GCN-NEXT: v_mov_b32_e32 v0, v12
-; GCN-NEXT: v_mov_b32_e32 v1, v13
-; GCN-NEXT: v_mov_b32_e32 v2, v14
-; GCN-NEXT: v_mov_b32_e32 v3, v15
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; SDAG-NEXT: s_nop 7
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
+; GISEL-NEXT: s_nop 7
+; GISEL-NEXT: v_mov_b32_e32 v0, v12
+; GISEL-NEXT: v_mov_b32_e32 v1, v13
+; GISEL-NEXT: v_mov_b32_e32 v2, v14
+; GISEL-NEXT: v_mov_b32_e32 v3, v15
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
ret <4 x float> %result
}
@@ -2151,25 +2603,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8:
@@ -2186,23 +2655,15 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: v_mov_b32_e32 v34, v8
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
-; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b32_e32 v37, v11
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2214,25 +2675,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags0:
@@ -2250,22 +2728,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2277,25 +2747,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__flags1:
@@ -2313,22 +2800,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2340,94 +2819,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s0
+; SDAG-NEXT: v_mov_b32_e32 v29, s1
+; SDAG-NEXT: v_mov_b32_e32 v30, s2
+; SDAG-NEXT: v_mov_b32_e32 v31, s3
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 a[0:15], v[28:31], v[0:7], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
+; GISEL-NEXT: v_mov_b32_e32 v24, v0
+; GISEL-NEXT: v_mov_b32_e32 v25, v1
+; GISEL-NEXT: v_mov_b32_e32 v26, v2
+; GISEL-NEXT: v_mov_b32_e32 v27, v3
+; GISEL-NEXT: v_mov_b32_e32 v28, v4
+; GISEL-NEXT: v_mov_b32_e32 v29, v5
+; GISEL-NEXT: v_mov_b32_e32 v30, v6
+; GISEL-NEXT: v_mov_b32_e32 v31, v7
+; GISEL-NEXT: v_mov_b32_e32 v32, v8
+; GISEL-NEXT: v_mov_b32_e32 v33, v9
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v20, s26
+; GISEL-NEXT: v_mov_b32_e32 v21, s27
+; GISEL-NEXT: v_mov_b32_e32 v22, s28
+; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_bf8 v[0:15], v[34:37], v[48:55], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -2524,25 +3017,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8:
@@ -2560,22 +3070,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2587,25 +3089,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags0:
@@ -2623,22 +3142,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2650,25 +3161,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__flags1:
@@ -2686,22 +3214,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2713,94 +3233,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_bf8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s0
+; SDAG-NEXT: v_mov_b32_e32 v29, s1
+; SDAG-NEXT: v_mov_b32_e32 v30, s2
+; SDAG-NEXT: v_mov_b32_e32 v31, s3
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 a[0:15], v[28:31], v[0:7], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_bf8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
+; GISEL-NEXT: v_mov_b32_e32 v24, v0
+; GISEL-NEXT: v_mov_b32_e32 v25, v1
+; GISEL-NEXT: v_mov_b32_e32 v26, v2
+; GISEL-NEXT: v_mov_b32_e32 v27, v3
+; GISEL-NEXT: v_mov_b32_e32 v28, v4
+; GISEL-NEXT: v_mov_b32_e32 v29, v5
+; GISEL-NEXT: v_mov_b32_e32 v30, v6
+; GISEL-NEXT: v_mov_b32_e32 v31, v7
+; GISEL-NEXT: v_mov_b32_e32 v32, v8
+; GISEL-NEXT: v_mov_b32_e32 v33, v9
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v20, s26
+; GISEL-NEXT: v_mov_b32_e32 v21, s27
+; GISEL-NEXT: v_mov_b32_e32 v22, s28
+; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT: v_smfmac_f32_32x32x64_bf8_fp8 v[0:15], v[34:37], v[48:55], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -2897,25 +3431,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8:
@@ -2933,22 +3484,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2960,25 +3503,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags0:
@@ -2996,22 +3556,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3023,25 +3575,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__flags1:
@@ -3059,22 +3628,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3086,94 +3647,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_bf8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s0
+; SDAG-NEXT: v_mov_b32_e32 v29, s1
+; SDAG-NEXT: v_mov_b32_e32 v30, s2
+; SDAG-NEXT: v_mov_b32_e32 v31, s3
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 a[0:15], v[28:31], v[0:7], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_bf8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
+; GISEL-NEXT: v_mov_b32_e32 v24, v0
+; GISEL-NEXT: v_mov_b32_e32 v25, v1
+; GISEL-NEXT: v_mov_b32_e32 v26, v2
+; GISEL-NEXT: v_mov_b32_e32 v27, v3
+; GISEL-NEXT: v_mov_b32_e32 v28, v4
+; GISEL-NEXT: v_mov_b32_e32 v29, v5
+; GISEL-NEXT: v_mov_b32_e32 v30, v6
+; GISEL-NEXT: v_mov_b32_e32 v31, v7
+; GISEL-NEXT: v_mov_b32_e32 v32, v8
+; GISEL-NEXT: v_mov_b32_e32 v33, v9
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v20, s26
+; GISEL-NEXT: v_mov_b32_e32 v21, s27
+; GISEL-NEXT: v_mov_b32_e32 v22, s28
+; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_bf8 v[0:15], v[34:37], v[48:55], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.bf8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
@@ -3270,25 +3845,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8:
@@ -3306,22 +3898,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8(<4 x i32> %arg0, <8 x i32>
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3333,25 +3917,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:1 abid:3
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags0:
@@ -3369,22 +3970,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags0(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3396,25 +3989,42 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[0:3], v[4:11], v28 cbsz:3 abid:1
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
+; SDAG-NEXT: s_nop 1
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__flags1:
@@ -3432,22 +4042,14 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__flags1(<4 x i32> %arg0, <
; GISEL-NEXT: v_mov_b32_e32 v35, v9
; GISEL-NEXT: v_mov_b32_e32 v36, v10
; GISEL-NEXT: v_mov_b32_e32 v37, v11
-; GISEL-NEXT: v_mov_b32_e32 v0, v12
-; GISEL-NEXT: v_mov_b32_e32 v1, v13
-; GISEL-NEXT: v_mov_b32_e32 v2, v14
-; GISEL-NEXT: v_mov_b32_e32 v3, v15
-; GISEL-NEXT: v_mov_b32_e32 v4, v16
-; GISEL-NEXT: v_mov_b32_e32 v5, v17
-; GISEL-NEXT: v_mov_b32_e32 v6, v18
-; GISEL-NEXT: v_mov_b32_e32 v7, v19
-; GISEL-NEXT: v_mov_b32_e32 v8, v20
-; GISEL-NEXT: v_mov_b32_e32 v9, v21
-; GISEL-NEXT: v_mov_b32_e32 v10, v22
-; GISEL-NEXT: v_mov_b32_e32 v11, v23
-; GISEL-NEXT: v_mov_b32_e32 v12, v24
-; GISEL-NEXT: v_mov_b32_e32 v13, v25
-; GISEL-NEXT: v_mov_b32_e32 v14, v26
-; GISEL-NEXT: v_mov_b32_e32 v15, v27
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27]
; GISEL-NEXT: s_nop 1
; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3459,94 +4061,108 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg
; SDAG-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-NEXT: v_mov_b32_e32 v36, s0
-; SDAG-NEXT: v_mov_b32_e32 v37, s1
-; SDAG-NEXT: v_mov_b32_e32 v38, s2
-; SDAG-NEXT: v_mov_b32_e32 v39, s3
+; SDAG-NEXT: v_mov_b32_e32 v28, s0
+; SDAG-NEXT: v_mov_b32_e32 v29, s1
+; SDAG-NEXT: v_mov_b32_e32 v30, s2
+; SDAG-NEXT: v_mov_b32_e32 v31, s3
+; SDAG-NEXT: v_mov_b32_e32 v12, s24
+; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_mov_b32_e32 v26, v8
+; SDAG-NEXT: v_mov_b32_e32 v25, v7
+; SDAG-NEXT: v_mov_b32_e32 v24, v6
+; SDAG-NEXT: v_mov_b32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v22, v4
+; SDAG-NEXT: v_mov_b32_e32 v21, v3
+; SDAG-NEXT: v_mov_b32_e32 v20, v2
+; SDAG-NEXT: v_mov_b32_e32 v19, v1
+; SDAG-NEXT: v_mov_b32_e32 v18, v0
; SDAG-NEXT: v_mov_b32_e32 v13, s25
; SDAG-NEXT: v_mov_b32_e32 v14, s26
; SDAG-NEXT: v_mov_b32_e32 v15, s27
; SDAG-NEXT: v_mov_b32_e32 v16, s28
; SDAG-NEXT: v_mov_b32_e32 v17, s29
-; SDAG-NEXT: v_mov_b32_e32 v28, s16
-; SDAG-NEXT: v_mov_b32_e32 v29, s17
-; SDAG-NEXT: v_mov_b32_e32 v30, s18
-; SDAG-NEXT: v_mov_b32_e32 v31, s19
-; SDAG-NEXT: v_mov_b32_e32 v32, s20
-; SDAG-NEXT: v_mov_b32_e32 v33, s21
-; SDAG-NEXT: v_mov_b32_e32 v34, s22
-; SDAG-NEXT: v_mov_b32_e32 v35, s23
-; SDAG-NEXT: v_mov_b32_e32 v12, s24
-; SDAG-NEXT: v_mov_b32_e32 v18, v0
-; SDAG-NEXT: v_mov_b32_e32 v19, v1
-; SDAG-NEXT: v_mov_b32_e32 v20, v2
-; SDAG-NEXT: v_mov_b32_e32 v21, v3
-; SDAG-NEXT: v_mov_b32_e32 v22, v4
-; SDAG-NEXT: v_mov_b32_e32 v23, v5
-; SDAG-NEXT: v_mov_b32_e32 v24, v6
-; SDAG-NEXT: v_mov_b32_e32 v25, v7
-; SDAG-NEXT: v_mov_b32_e32 v26, v8
-; SDAG-NEXT: v_mov_b32_e32 v27, v9
+; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
+; SDAG-NEXT: v_mov_b32_e32 v0, s16
+; SDAG-NEXT: v_mov_b32_e32 v1, s17
+; SDAG-NEXT: v_mov_b32_e32 v2, s18
+; SDAG-NEXT: v_mov_b32_e32 v3, s19
+; SDAG-NEXT: v_mov_b32_e32 v4, s20
+; SDAG-NEXT: v_mov_b32_e32 v5, s21
+; SDAG-NEXT: v_mov_b32_e32 v6, s22
+; SDAG-NEXT: v_mov_b32_e32 v7, s23
+; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
+; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
+; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
+; SDAG-NEXT: v_accvgpr_write_b32 a4, v16
+; SDAG-NEXT: v_accvgpr_write_b32 a5, v17
+; SDAG-NEXT: v_accvgpr_write_b32 a6, v18
+; SDAG-NEXT: v_accvgpr_write_b32 a7, v19
+; SDAG-NEXT: v_accvgpr_write_b32 a8, v20
+; SDAG-NEXT: v_accvgpr_write_b32 a9, v21
+; SDAG-NEXT: v_accvgpr_write_b32 a10, v22
+; SDAG-NEXT: v_accvgpr_write_b32 a11, v23
+; SDAG-NEXT: v_accvgpr_write_b32 a12, v24
+; SDAG-NEXT: v_accvgpr_write_b32 a13, v25
+; SDAG-NEXT: v_accvgpr_write_b32 a14, v26
+; SDAG-NEXT: v_accvgpr_write_b32 a15, v27
; SDAG-NEXT: s_nop 1
-; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[12:27], v[36:39], v[28:35], v10
+; SDAG-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 a[0:15], v[28:31], v[0:7], v10
; SDAG-NEXT: s_nop 7
; SDAG-NEXT: s_nop 3
-; SDAG-NEXT: v_mov_b32_e32 v0, v12
-; SDAG-NEXT: v_mov_b32_e32 v1, v13
-; SDAG-NEXT: v_mov_b32_e32 v2, v14
-; SDAG-NEXT: v_mov_b32_e32 v3, v15
-; SDAG-NEXT: v_mov_b32_e32 v4, v16
-; SDAG-NEXT: v_mov_b32_e32 v5, v17
-; SDAG-NEXT: v_mov_b32_e32 v6, v18
-; SDAG-NEXT: v_mov_b32_e32 v7, v19
-; SDAG-NEXT: v_mov_b32_e32 v8, v20
-; SDAG-NEXT: v_mov_b32_e32 v9, v21
-; SDAG-NEXT: v_mov_b32_e32 v10, v22
-; SDAG-NEXT: v_mov_b32_e32 v11, v23
-; SDAG-NEXT: v_mov_b32_e32 v12, v24
-; SDAG-NEXT: v_mov_b32_e32 v13, v25
-; SDAG-NEXT: v_mov_b32_e32 v14, v26
-; SDAG-NEXT: v_mov_b32_e32 v15, v27
+; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
+; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
+; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
+; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
+; SDAG-NEXT: v_accvgpr_read_b32 v4, a4
+; SDAG-NEXT: v_accvgpr_read_b32 v5, a5
+; SDAG-NEXT: v_accvgpr_read_b32 v6, a6
+; SDAG-NEXT: v_accvgpr_read_b32 v7, a7
+; SDAG-NEXT: v_accvgpr_read_b32 v8, a8
+; SDAG-NEXT: v_accvgpr_read_b32 v9, a9
+; SDAG-NEXT: v_accvgpr_read_b32 v10, a10
+; SDAG-NEXT: v_accvgpr_read_b32 v11, a11
+; SDAG-NEXT: v_accvgpr_read_b32 v12, a12
+; SDAG-NEXT: v_accvgpr_read_b32 v13, a13
+; SDAG-NEXT: v_accvgpr_read_b32 v14, a14
+; SDAG-NEXT: v_accvgpr_read_b32 v15, a15
; SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-LABEL: test_smfmac_f32_32x32x64_fp8_fp8__sgpr:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[2:3]
-; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[0:1]
-; GISEL-NEXT: v_mov_b32_e32 v11, v0
-; GISEL-NEXT: v_mov_b32_e32 v12, v1
-; GISEL-NEXT: v_mov_b32_e32 v13, v2
-; GISEL-NEXT: v_mov_b32_e32 v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v15, v4
-; GISEL-NEXT: v_mov_b32_e32 v16, v5
-; GISEL-NEXT: v_mov_b32_e32 v17, v6
-; GISEL-NEXT: v_mov_b32_e32 v18, v7
-; GISEL-NEXT: v_mov_b32_e32 v19, v8
-; GISEL-NEXT: v_mov_b32_e32 v20, v9
-; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[22:23]
-; GISEL-NEXT: v_mov_b32_e32 v21, v10
-; GISEL-NEXT: v_mov_b32_e32 v0, s24
-; GISEL-NEXT: v_mov_b32_e32 v1, s25
-; GISEL-NEXT: v_mov_b32_e32 v2, s26
-; GISEL-NEXT: v_mov_b32_e32 v3, s27
-; GISEL-NEXT: v_mov_b32_e32 v4, s28
-; GISEL-NEXT: v_mov_b32_e32 v5, s29
-; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[20:21]
-; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[18:19]
-; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v6, v11
-; GISEL-NEXT: v_mov_b32_e32 v7, v12
-; GISEL-NEXT: v_mov_b32_e32 v8, v13
-; GISEL-NEXT: v_mov_b32_e32 v9, v14
-; GISEL-NEXT: v_mov_b32_e32 v10, v15
-; GISEL-NEXT: v_mov_b32_e32 v11, v16
-; GISEL-NEXT: v_mov_b32_e32 v12, v17
-; GISEL-NEXT: v_mov_b32_e32 v13, v18
-; GISEL-NEXT: v_mov_b32_e32 v14, v19
-; GISEL-NEXT: v_mov_b32_e32 v15, v20
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v18, s24
+; GISEL-NEXT: v_mov_b32_e32 v19, s25
+; GISEL-NEXT: v_mov_b32_e32 v24, v0
+; GISEL-NEXT: v_mov_b32_e32 v25, v1
+; GISEL-NEXT: v_mov_b32_e32 v26, v2
+; GISEL-NEXT: v_mov_b32_e32 v27, v3
+; GISEL-NEXT: v_mov_b32_e32 v28, v4
+; GISEL-NEXT: v_mov_b32_e32 v29, v5
+; GISEL-NEXT: v_mov_b32_e32 v30, v6
+; GISEL-NEXT: v_mov_b32_e32 v31, v7
+; GISEL-NEXT: v_mov_b32_e32 v32, v8
+; GISEL-NEXT: v_mov_b32_e32 v33, v9
+; GISEL-NEXT: v_mov_b32_e32 v16, v10
+; GISEL-NEXT: v_mov_b32_e32 v20, s26
+; GISEL-NEXT: v_mov_b32_e32 v21, s27
+; GISEL-NEXT: v_mov_b32_e32 v22, s28
+; GISEL-NEXT: v_mov_b32_e32 v23, s29
+; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[18:19]
+; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[16:17]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21]
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29]
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31]
+; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33]
; GISEL-NEXT: s_nop 1
-; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[30:33], v[22:29], v21
+; GISEL-NEXT: v_smfmac_f32_32x32x64_fp8_fp8 v[0:15], v[34:37], v[48:55], v16
; GISEL-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x64.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
ret <16 x float> %result
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
index 21af2dde2c4bf..3fffb04de1a63 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-no-register-aliasing.ll
@@ -96,66 +96,66 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a32
-; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a61
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a61
; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a60
-; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a33
-; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a59
-; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a58
-; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34
-; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a57
-; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a56
-; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a35
-; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a55
-; GREEDY908-NEXT: v_accvgpr_read_b32 v12, a54
-; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a36
-; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a53
-; GREEDY908-NEXT: v_accvgpr_read_b32 v14, a52
-; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37
-; GREEDY908-NEXT: v_accvgpr_read_b32 v15, a51
-; GREEDY908-NEXT: v_accvgpr_read_b32 v16, a50
-; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a38
-; GREEDY908-NEXT: v_accvgpr_read_b32 v17, a49
-; GREEDY908-NEXT: v_accvgpr_read_b32 v18, a48
-; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a39
-; GREEDY908-NEXT: v_accvgpr_read_b32 v19, a47
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a46
-; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40
-; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v19
-; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a41
-; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v18
-; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v17
-; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a42
-; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v16
-; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v15
-; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v1
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a59
+; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a58
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a57
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a56
+; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a55
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a54
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a53
+; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a52
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a51
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a50
+; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a49
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a48
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a47
+; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a17, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a46
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a45
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a44
+; GREEDY908-NEXT: v_accvgpr_write_b32 a16, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v2
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a43
-; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v14
-; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v13
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a42
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a41
; GREEDY908-NEXT: v_accvgpr_write_b32 a13, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a44
-; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v12
-; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v11
-; GREEDY908-NEXT: v_accvgpr_write_b32 a14, v1
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a45
-; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v10
-; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v9
-; GREEDY908-NEXT: v_accvgpr_write_b32 a15, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v8
-; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v7
-; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v6
-; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v5
+; GREEDY908-NEXT: v_accvgpr_write_b32 a12, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a11, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a40
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a39
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a38
+; GREEDY908-NEXT: v_accvgpr_write_b32 a10, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a9, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a8, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a37
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a36
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a35
+; GREEDY908-NEXT: v_accvgpr_write_b32 a7, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a6, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a5, v2
+; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a34
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a33
+; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a32
+; GREEDY908-NEXT: v_accvgpr_write_b32 a4, v1
+; GREEDY908-NEXT: v_accvgpr_write_b32 a3, v6
+; GREEDY908-NEXT: v_accvgpr_write_b32 a2, v2
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v3, v0, a[0:31]
; GREEDY908-NEXT: s_nop 7
@@ -266,36 +266,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 2
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a18, a48
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60
; GREEDY90A-NEXT: v_accvgpr_mov_b32 a31, a61
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a30, a60
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a29, a59
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a28, a58
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a27, a57
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a26, a56
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a25, a55
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a24, a54
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a23, a53
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a22, a52
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a21, a51
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a20, a50
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a19, a49
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a18, a48
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a17, a47
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a16, a46
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a15, a45
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a14, a44
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a13, a43
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a12, a42
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a11, a41
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a10, a40
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a9, a39
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a8, a38
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a7, a37
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a6, a36
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a5, a35
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a4, a34
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a3, a33
+; GREEDY90A-NEXT: v_accvgpr_mov_b32 a2, a32
; GREEDY90A-NEXT: s_nop 1
; GREEDY90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
; GREEDY90A-NEXT: s_nop 7
@@ -359,36 +359,36 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a29, a59
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60
; GREEDY942-NEXT: v_accvgpr_mov_b32 a31, a61
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a30, a60
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a29, a59
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a28, a58
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a27, a57
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a26, a56
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a25, a55
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a24, a54
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a23, a53
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a22, a52
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a21, a51
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a20, a50
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a19, a49
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a18, a48
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a17, a47
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a16, a46
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a15, a45
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a14, a44
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a13, a43
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a12, a42
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a11, a41
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a10, a40
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a9, a39
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a8, a38
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a7, a37
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a6, a36
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a5, a35
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a4, a34
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a3, a33
+; GREEDY942-NEXT: v_accvgpr_mov_b32 a2, a32
; GREEDY942-NEXT: s_nop 1
; GREEDY942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
; GREEDY942-NEXT: s_nop 7
@@ -507,106 +507,76 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x40
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT: v_accvgpr_write_b32 a32, s36
-; FAST90A-NEXT: v_accvgpr_write_b32 a33, s37
-; FAST90A-NEXT: v_accvgpr_write_b32 a34, s38
-; FAST90A-NEXT: v_accvgpr_write_b32 a35, s39
-; FAST90A-NEXT: v_accvgpr_write_b32 a36, s40
-; FAST90A-NEXT: v_accvgpr_write_b32 a37, s41
-; FAST90A-NEXT: v_accvgpr_write_b32 a38, s42
-; FAST90A-NEXT: v_accvgpr_write_b32 a39, s43
-; FAST90A-NEXT: v_accvgpr_write_b32 a40, s44
-; FAST90A-NEXT: v_accvgpr_write_b32 a41, s45
-; FAST90A-NEXT: v_accvgpr_write_b32 a42, s46
-; FAST90A-NEXT: v_accvgpr_write_b32 a43, s47
-; FAST90A-NEXT: v_accvgpr_write_b32 a44, s48
-; FAST90A-NEXT: v_accvgpr_write_b32 a45, s49
-; FAST90A-NEXT: v_accvgpr_write_b32 a46, s50
-; FAST90A-NEXT: v_accvgpr_write_b32 a47, s51
-; FAST90A-NEXT: v_accvgpr_write_b32 a48, s4
-; FAST90A-NEXT: v_accvgpr_write_b32 a49, s5
-; FAST90A-NEXT: v_accvgpr_write_b32 a50, s6
-; FAST90A-NEXT: v_accvgpr_write_b32 a51, s7
-; FAST90A-NEXT: v_accvgpr_write_b32 a52, s8
-; FAST90A-NEXT: v_accvgpr_write_b32 a53, s9
-; FAST90A-NEXT: v_accvgpr_write_b32 a54, s10
-; FAST90A-NEXT: v_accvgpr_write_b32 a55, s11
-; FAST90A-NEXT: v_accvgpr_write_b32 a56, s12
-; FAST90A-NEXT: v_accvgpr_write_b32 a57, s13
-; FAST90A-NEXT: v_accvgpr_write_b32 a58, s14
-; FAST90A-NEXT: v_accvgpr_write_b32 a59, s15
-; FAST90A-NEXT: v_accvgpr_write_b32 a60, s16
-; FAST90A-NEXT: v_accvgpr_write_b32 a61, s17
-; FAST90A-NEXT: v_accvgpr_write_b32 a62, s18
-; FAST90A-NEXT: v_accvgpr_write_b32 a63, s19
+; FAST90A-NEXT: v_accvgpr_write_b32 a64, s36
+; FAST90A-NEXT: v_accvgpr_write_b32 a65, s37
+; FAST90A-NEXT: v_accvgpr_write_b32 a66, s38
+; FAST90A-NEXT: v_accvgpr_write_b32 a67, s39
+; FAST90A-NEXT: v_accvgpr_write_b32 a68, s40
+; FAST90A-NEXT: v_accvgpr_write_b32 a69, s41
+; FAST90A-NEXT: v_accvgpr_write_b32 a70, s42
+; FAST90A-NEXT: v_accvgpr_write_b32 a71, s43
+; FAST90A-NEXT: v_accvgpr_write_b32 a72, s44
+; FAST90A-NEXT: v_accvgpr_write_b32 a73, s45
+; FAST90A-NEXT: v_accvgpr_write_b32 a74, s46
+; FAST90A-NEXT: v_accvgpr_write_b32 a75, s47
+; FAST90A-NEXT: v_accvgpr_write_b32 a76, s48
+; FAST90A-NEXT: v_accvgpr_write_b32 a77, s49
+; FAST90A-NEXT: v_accvgpr_write_b32 a78, s50
+; FAST90A-NEXT: v_accvgpr_write_b32 a79, s51
+; FAST90A-NEXT: v_accvgpr_write_b32 a80, s4
+; FAST90A-NEXT: v_accvgpr_write_b32 a81, s5
+; FAST90A-NEXT: v_accvgpr_write_b32 a82, s6
+; FAST90A-NEXT: v_accvgpr_write_b32 a83, s7
+; FAST90A-NEXT: v_accvgpr_write_b32 a84, s8
+; FAST90A-NEXT: v_accvgpr_write_b32 a85, s9
+; FAST90A-NEXT: v_accvgpr_write_b32 a86, s10
+; FAST90A-NEXT: v_accvgpr_write_b32 a87, s11
+; FAST90A-NEXT: v_accvgpr_write_b32 a88, s12
+; FAST90A-NEXT: v_accvgpr_write_b32 a89, s13
+; FAST90A-NEXT: v_accvgpr_write_b32 a90, s14
+; FAST90A-NEXT: v_accvgpr_write_b32 a91, s15
+; FAST90A-NEXT: v_accvgpr_write_b32 a92, s16
+; FAST90A-NEXT: v_accvgpr_write_b32 a93, s17
+; FAST90A-NEXT: v_accvgpr_write_b32 a94, s18
+; FAST90A-NEXT: v_accvgpr_write_b32 a95, s19
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[32:63]
-; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[32:63]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[64:95], v1, v2, a[64:95]
+; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[32:63], v1, v2, a[64:95]
; FAST90A-NEXT: s_nop 7
; FAST90A-NEXT: s_nop 7
-; FAST90A-NEXT: s_nop 2
-; FAST90A-NEXT: v_accvgpr_read_b32 v3, a29
-; FAST90A-NEXT: v_accvgpr_read_b32 v4, a28
-; FAST90A-NEXT: v_accvgpr_read_b32 v5, a27
-; FAST90A-NEXT: v_accvgpr_read_b32 v6, a26
-; FAST90A-NEXT: v_accvgpr_read_b32 v7, a25
-; FAST90A-NEXT: v_accvgpr_read_b32 v8, a24
-; FAST90A-NEXT: v_accvgpr_read_b32 v9, a23
-; FAST90A-NEXT: v_accvgpr_read_b32 v10, a22
-; FAST90A-NEXT: v_accvgpr_read_b32 v11, a21
-; FAST90A-NEXT: v_accvgpr_read_b32 v12, a20
-; FAST90A-NEXT: v_accvgpr_read_b32 v13, a19
-; FAST90A-NEXT: v_accvgpr_read_b32 v14, a18
-; FAST90A-NEXT: v_accvgpr_read_b32 v15, a17
-; FAST90A-NEXT: v_accvgpr_read_b32 v16, a16
-; FAST90A-NEXT: v_accvgpr_read_b32 v17, a15
-; FAST90A-NEXT: v_accvgpr_read_b32 v18, a14
-; FAST90A-NEXT: v_accvgpr_read_b32 v19, a13
-; FAST90A-NEXT: v_accvgpr_read_b32 v20, a12
-; FAST90A-NEXT: v_accvgpr_read_b32 v21, a11
-; FAST90A-NEXT: v_accvgpr_read_b32 v22, a10
-; FAST90A-NEXT: v_accvgpr_read_b32 v23, a9
-; FAST90A-NEXT: v_accvgpr_read_b32 v24, a8
-; FAST90A-NEXT: v_accvgpr_read_b32 v25, a7
-; FAST90A-NEXT: v_accvgpr_read_b32 v26, a6
-; FAST90A-NEXT: v_accvgpr_read_b32 v27, a5
-; FAST90A-NEXT: v_accvgpr_read_b32 v28, a4
-; FAST90A-NEXT: v_accvgpr_read_b32 v29, a3
-; FAST90A-NEXT: v_accvgpr_read_b32 v30, a2
-; FAST90A-NEXT: v_accvgpr_read_b32 v31, a1
-; FAST90A-NEXT: v_accvgpr_read_b32 v32, a0
-; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32
-; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33
-; FAST90A-NEXT: v_accvgpr_write_b32 a2, v32
-; FAST90A-NEXT: v_accvgpr_write_b32 a3, v31
-; FAST90A-NEXT: v_accvgpr_write_b32 a4, v30
-; FAST90A-NEXT: v_accvgpr_write_b32 a5, v29
-; FAST90A-NEXT: v_accvgpr_write_b32 a6, v28
-; FAST90A-NEXT: v_accvgpr_write_b32 a7, v27
-; FAST90A-NEXT: v_accvgpr_write_b32 a8, v26
-; FAST90A-NEXT: v_accvgpr_write_b32 a9, v25
-; FAST90A-NEXT: v_accvgpr_write_b32 a10, v24
-; FAST90A-NEXT: v_accvgpr_write_b32 a11, v23
-; FAST90A-NEXT: v_accvgpr_write_b32 a12, v22
-; FAST90A-NEXT: v_accvgpr_write_b32 a13, v21
-; FAST90A-NEXT: v_accvgpr_write_b32 a14, v20
-; FAST90A-NEXT: v_accvgpr_write_b32 a15, v19
-; FAST90A-NEXT: v_accvgpr_write_b32 a16, v18
-; FAST90A-NEXT: v_accvgpr_write_b32 a17, v17
-; FAST90A-NEXT: v_accvgpr_write_b32 a18, v16
-; FAST90A-NEXT: v_accvgpr_write_b32 a19, v15
-; FAST90A-NEXT: v_accvgpr_write_b32 a20, v14
-; FAST90A-NEXT: v_accvgpr_write_b32 a21, v13
-; FAST90A-NEXT: v_accvgpr_write_b32 a22, v12
-; FAST90A-NEXT: v_accvgpr_write_b32 a23, v11
-; FAST90A-NEXT: v_accvgpr_write_b32 a24, v10
-; FAST90A-NEXT: v_accvgpr_write_b32 a25, v9
-; FAST90A-NEXT: v_accvgpr_write_b32 a26, v8
-; FAST90A-NEXT: v_accvgpr_write_b32 a27, v7
-; FAST90A-NEXT: v_accvgpr_write_b32 a28, v6
-; FAST90A-NEXT: v_accvgpr_write_b32 a29, v5
-; FAST90A-NEXT: v_accvgpr_write_b32 a30, v4
-; FAST90A-NEXT: v_accvgpr_write_b32 a31, v3
+; FAST90A-NEXT: s_nop 1
+; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a65
+; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a64
+; FAST90A-NEXT: v_accvgpr_mov_b32 a31, a61
+; FAST90A-NEXT: v_accvgpr_mov_b32 a30, a60
+; FAST90A-NEXT: v_accvgpr_mov_b32 a29, a59
+; FAST90A-NEXT: v_accvgpr_mov_b32 a28, a58
+; FAST90A-NEXT: v_accvgpr_mov_b32 a27, a57
+; FAST90A-NEXT: v_accvgpr_mov_b32 a26, a56
+; FAST90A-NEXT: v_accvgpr_mov_b32 a25, a55
+; FAST90A-NEXT: v_accvgpr_mov_b32 a24, a54
+; FAST90A-NEXT: v_accvgpr_mov_b32 a23, a53
+; FAST90A-NEXT: v_accvgpr_mov_b32 a22, a52
+; FAST90A-NEXT: v_accvgpr_mov_b32 a21, a51
+; FAST90A-NEXT: v_accvgpr_mov_b32 a20, a50
+; FAST90A-NEXT: v_accvgpr_mov_b32 a19, a49
+; FAST90A-NEXT: v_accvgpr_mov_b32 a18, a48
+; FAST90A-NEXT: v_accvgpr_mov_b32 a17, a47
+; FAST90A-NEXT: v_accvgpr_mov_b32 a16, a46
+; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a45
+; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a44
+; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a43
+; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a42
+; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a41
+; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a40
+; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a39
+; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a38
+; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a37
+; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a36
+; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a35
+; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a34
+; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a33
+; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a32
; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v2, a[0:31]
; FAST90A-NEXT: s_nop 7
@@ -636,42 +606,42 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908: ; %bb.0: ; %bb
; GREEDY908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
; GREEDY908-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY908-NEXT: v_mov_b32_e32 v4, 0
+; GREEDY908-NEXT: v_mov_b32_e32 v12, 0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY908-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s15
+; GREEDY908-NEXT: v_mov_b32_e32 v13, s15
; GREEDY908-NEXT: v_mov_b32_e32 v2, s14
; GREEDY908-NEXT: v_mov_b32_e32 v1, s13
-; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v5
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s12
+; GREEDY908-NEXT: v_accvgpr_write_b32 a33, v13
+; GREEDY908-NEXT: v_mov_b32_e32 v13, s12
; GREEDY908-NEXT: v_accvgpr_write_b32 a32, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a31, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v5
+; GREEDY908-NEXT: v_accvgpr_write_b32 a30, v13
; GREEDY908-NEXT: v_mov_b32_e32 v2, s11
; GREEDY908-NEXT: v_mov_b32_e32 v1, s10
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s9
+; GREEDY908-NEXT: v_mov_b32_e32 v13, s9
; GREEDY908-NEXT: v_accvgpr_write_b32 a29, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a28, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v5
+; GREEDY908-NEXT: v_accvgpr_write_b32 a27, v13
; GREEDY908-NEXT: v_mov_b32_e32 v2, s8
; GREEDY908-NEXT: v_mov_b32_e32 v1, s7
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s6
+; GREEDY908-NEXT: v_mov_b32_e32 v13, s6
; GREEDY908-NEXT: v_accvgpr_write_b32 a26, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a25, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v5
+; GREEDY908-NEXT: v_accvgpr_write_b32 a24, v13
; GREEDY908-NEXT: v_mov_b32_e32 v2, s5
; GREEDY908-NEXT: v_mov_b32_e32 v1, s4
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s3
+; GREEDY908-NEXT: v_mov_b32_e32 v13, s3
; GREEDY908-NEXT: v_accvgpr_write_b32 a23, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a22, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v5
+; GREEDY908-NEXT: v_accvgpr_write_b32 a21, v13
; GREEDY908-NEXT: v_mov_b32_e32 v2, s2
; GREEDY908-NEXT: v_mov_b32_e32 v1, s1
-; GREEDY908-NEXT: v_mov_b32_e32 v5, s0
+; GREEDY908-NEXT: v_mov_b32_e32 v13, s0
; GREEDY908-NEXT: v_accvgpr_write_b32 a20, v2
; GREEDY908-NEXT: v_accvgpr_write_b32 a19, v1
-; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v5
+; GREEDY908-NEXT: v_accvgpr_write_b32 a18, v13
; GREEDY908-NEXT: v_mov_b32_e32 v1, 2.0
; GREEDY908-NEXT: s_nop 1
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
@@ -679,10 +649,10 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: s_nop 7
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a19
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a18
+; GREEDY908-NEXT: v_accvgpr_read_b32 v13, a18
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_accvgpr_write_b32 a1, v2
-; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v3
+; GREEDY908-NEXT: v_accvgpr_write_b32 a0, v13
; GREEDY908-NEXT: s_nop 0
; GREEDY908-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
; GREEDY908-NEXT: s_nop 7
@@ -691,113 +661,167 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a14
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a13
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a12
-; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:48
-; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a11
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a10
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a9
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a8
-; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
-; GREEDY908-NEXT: s_nop 0
-; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a7
-; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a6
-; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a5
-; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a4
-; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
-; GREEDY908-NEXT: s_nop 0
+; GREEDY908-NEXT: v_accvgpr_read_b32 v7, a11
+; GREEDY908-NEXT: v_accvgpr_read_b32 v6, a10
+; GREEDY908-NEXT: v_accvgpr_read_b32 v5, a9
+; GREEDY908-NEXT: v_accvgpr_read_b32 v4, a8
+; GREEDY908-NEXT: v_accvgpr_read_b32 v11, a7
+; GREEDY908-NEXT: v_accvgpr_read_b32 v10, a6
+; GREEDY908-NEXT: v_accvgpr_read_b32 v9, a5
+; GREEDY908-NEXT: v_accvgpr_read_b32 v8, a4
+; GREEDY908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17] offset:48
+; GREEDY908-NEXT: global_store_dwordx4 v12, v[4:7], s[16:17] offset:32
+; GREEDY908-NEXT: global_store_dwordx4 v12, v[8:11], s[16:17] offset:16
; GREEDY908-NEXT: v_accvgpr_read_b32 v3, a3
; GREEDY908-NEXT: v_accvgpr_read_b32 v2, a2
; GREEDY908-NEXT: v_accvgpr_read_b32 v1, a1
; GREEDY908-NEXT: v_accvgpr_read_b32 v0, a0
; GREEDY908-NEXT: s_nop 1
-; GREEDY908-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17]
+; GREEDY908-NEXT: global_store_dwordx4 v12, v[0:3], s[16:17]
; GREEDY908-NEXT: s_endpgm
;
; GREEDY90A-LABEL: test_mfma_f32_16x16x1f32:
; GREEDY90A: ; %bb.0: ; %bb
; GREEDY90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY90A-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY90A-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY90A-NEXT: v_mov_b32_e32 v16, 1.0
+; GREEDY90A-NEXT: v_mov_b32_e32 v17, 2.0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY90A-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a33, s15
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a32, s14
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s13
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s12
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s11
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s10
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s9
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s8
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s7
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s6
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s5
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s4
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s3
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s2
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s1
-; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s0
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a31, s15
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a30, s14
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a29, s13
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a28, s12
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a27, s11
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a26, s10
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a25, s9
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a24, s8
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a23, s7
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a22, s6
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a21, s5
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a20, s4
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a19, s3
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a18, s2
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a17, s1
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a16, s0
; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[18:33], v0, v1, a[18:33]
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[2:17], v0, v1, a[18:33]
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v16, v17, a[16:31]
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[16:31]
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a0, a18
-; GREEDY90A-NEXT: v_accvgpr_mov_b32 a1, a19
-; GREEDY90A-NEXT: s_nop 1
-; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v0, a16
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v1, a17
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v15, a13
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v14, a12
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v13, a11
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v12, a10
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v11, a9
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v10, a8
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v9, a7
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v8, a6
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v7, a5
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v6, a4
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v5, a3
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v4, a2
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v3, a1
+; GREEDY90A-NEXT: v_accvgpr_read_b32 v2, a0
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a1, v1
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a2, v2
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a3, v3
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a4, v4
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a5, v5
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a6, v6
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a7, v7
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a8, v8
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a9, v9
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a10, v10
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a11, v11
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a12, v12
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a13, v13
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a14, v14
+; GREEDY90A-NEXT: v_accvgpr_write_b32 a15, v15
+; GREEDY90A-NEXT: v_mov_b32_e32 v0, 0
+; GREEDY90A-NEXT: s_nop 0
+; GREEDY90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v16, v17, a[0:15]
; GREEDY90A-NEXT: s_nop 7
; GREEDY90A-NEXT: s_nop 2
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY90A-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GREEDY90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GREEDY90A-NEXT: s_endpgm
;
; GREEDY942-LABEL: test_mfma_f32_16x16x1f32:
; GREEDY942: ; %bb.0: ; %bb
; GREEDY942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
-; GREEDY942-NEXT: v_mov_b32_e32 v0, 1.0
-; GREEDY942-NEXT: v_mov_b32_e32 v1, 2.0
-; GREEDY942-NEXT: v_mov_b32_e32 v2, 0
+; GREEDY942-NEXT: v_mov_b32_e32 v16, 1.0
+; GREEDY942-NEXT: v_mov_b32_e32 v17, 2.0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
; GREEDY942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
; GREEDY942-NEXT: s_waitcnt lgkmcnt(0)
-; GREEDY942-NEXT: v_accvgpr_write_b32 a33, s15
-; GREEDY942-NEXT: v_accvgpr_write_b32 a32, s14
-; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s13
-; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s12
-; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s11
-; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s10
-; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s9
-; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s8
-; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s7
-; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s6
-; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s5
-; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s4
-; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s3
-; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s2
-; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s1
-; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s0
+; GREEDY942-NEXT: v_accvgpr_write_b32 a31, s15
+; GREEDY942-NEXT: v_accvgpr_write_b32 a30, s14
+; GREEDY942-NEXT: v_accvgpr_write_b32 a29, s13
+; GREEDY942-NEXT: v_accvgpr_write_b32 a28, s12
+; GREEDY942-NEXT: v_accvgpr_write_b32 a27, s11
+; GREEDY942-NEXT: v_accvgpr_write_b32 a26, s10
+; GREEDY942-NEXT: v_accvgpr_write_b32 a25, s9
+; GREEDY942-NEXT: v_accvgpr_write_b32 a24, s8
+; GREEDY942-NEXT: v_accvgpr_write_b32 a23, s7
+; GREEDY942-NEXT: v_accvgpr_write_b32 a22, s6
+; GREEDY942-NEXT: v_accvgpr_write_b32 a21, s5
+; GREEDY942-NEXT: v_accvgpr_write_b32 a20, s4
+; GREEDY942-NEXT: v_accvgpr_write_b32 a19, s3
+; GREEDY942-NEXT: v_accvgpr_write_b32 a18, s2
+; GREEDY942-NEXT: v_accvgpr_write_b32 a17, s1
+; GREEDY942-NEXT: v_accvgpr_write_b32 a16, s0
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[18:33], v0, v1, a[18:33]
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[2:17], v0, v1, a[18:33]
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[16:31], v16, v17, a[16:31]
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[16:31]
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 0
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a0, a18
-; GREEDY942-NEXT: v_accvgpr_mov_b32 a1, a19
-; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v0, v1, a[0:15]
+; GREEDY942-NEXT: v_accvgpr_read_b32 v0, a16
+; GREEDY942-NEXT: v_accvgpr_read_b32 v1, a17
+; GREEDY942-NEXT: v_accvgpr_read_b32 v15, a13
+; GREEDY942-NEXT: v_accvgpr_read_b32 v14, a12
+; GREEDY942-NEXT: v_accvgpr_read_b32 v13, a11
+; GREEDY942-NEXT: v_accvgpr_read_b32 v12, a10
+; GREEDY942-NEXT: v_accvgpr_read_b32 v11, a9
+; GREEDY942-NEXT: v_accvgpr_read_b32 v10, a8
+; GREEDY942-NEXT: v_accvgpr_read_b32 v9, a7
+; GREEDY942-NEXT: v_accvgpr_read_b32 v8, a6
+; GREEDY942-NEXT: v_accvgpr_read_b32 v7, a5
+; GREEDY942-NEXT: v_accvgpr_read_b32 v6, a4
+; GREEDY942-NEXT: v_accvgpr_read_b32 v5, a3
+; GREEDY942-NEXT: v_accvgpr_read_b32 v4, a2
+; GREEDY942-NEXT: v_accvgpr_read_b32 v3, a1
+; GREEDY942-NEXT: v_accvgpr_read_b32 v2, a0
+; GREEDY942-NEXT: v_accvgpr_write_b32 a0, v0
+; GREEDY942-NEXT: v_accvgpr_write_b32 a1, v1
+; GREEDY942-NEXT: v_accvgpr_write_b32 a2, v2
+; GREEDY942-NEXT: v_accvgpr_write_b32 a3, v3
+; GREEDY942-NEXT: v_accvgpr_write_b32 a4, v4
+; GREEDY942-NEXT: v_accvgpr_write_b32 a5, v5
+; GREEDY942-NEXT: v_accvgpr_write_b32 a6, v6
+; GREEDY942-NEXT: v_accvgpr_write_b32 a7, v7
+; GREEDY942-NEXT: v_accvgpr_write_b32 a8, v8
+; GREEDY942-NEXT: v_accvgpr_write_b32 a9, v9
+; GREEDY942-NEXT: v_accvgpr_write_b32 a10, v10
+; GREEDY942-NEXT: v_accvgpr_write_b32 a11, v11
+; GREEDY942-NEXT: v_accvgpr_write_b32 a12, v12
+; GREEDY942-NEXT: v_accvgpr_write_b32 a13, v13
+; GREEDY942-NEXT: v_accvgpr_write_b32 a14, v14
+; GREEDY942-NEXT: v_accvgpr_write_b32 a15, v15
+; GREEDY942-NEXT: v_mov_b32_e32 v0, 0
+; GREEDY942-NEXT: s_nop 0
+; GREEDY942-NEXT: v_mfma_f32_16x16x1_4b_f32 a[0:15], v16, v17, a[0:15]
; GREEDY942-NEXT: s_nop 7
; GREEDY942-NEXT: s_nop 1
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[12:15], s[16:17] offset:48
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[8:11], s[16:17] offset:32
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[4:7], s[16:17] offset:16
-; GREEDY942-NEXT: global_store_dwordx4 v2, a[0:3], s[16:17]
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
+; GREEDY942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
; GREEDY942-NEXT: s_endpgm
;
; GREEDY90A-GISEL-LABEL: test_mfma_f32_16x16x1f32:
@@ -857,51 +881,53 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32(ptr addrspace(1) %arg) #0 {
; FAST90A-LABEL: test_mfma_f32_16x16x1f32:
; FAST90A: ; %bb.0: ; %bb
; FAST90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; FAST90A-NEXT: v_mov_b32_e32 v1, 1.0
-; FAST90A-NEXT: v_mov_b32_e32 v2, 2.0
-; FAST90A-NEXT: v_mov_b32_e32 v0, 0
+; FAST90A-NEXT: v_mov_b32_e32 v0, 1.0
+; FAST90A-NEXT: v_mov_b32_e32 v1, 2.0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
; FAST90A-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x0
; FAST90A-NEXT: s_waitcnt lgkmcnt(0)
-; FAST90A-NEXT: v_accvgpr_write_b32 a0, s4
-; FAST90A-NEXT: v_accvgpr_write_b32 a1, s5
-; FAST90A-NEXT: v_accvgpr_write_b32 a2, s6
-; FAST90A-NEXT: v_accvgpr_write_b32 a3, s7
-; FAST90A-NEXT: v_accvgpr_write_b32 a4, s8
-; FAST90A-NEXT: v_accvgpr_write_b32 a5, s9
-; FAST90A-NEXT: v_accvgpr_write_b32 a6, s10
-; FAST90A-NEXT: v_accvgpr_write_b32 a7, s11
-; FAST90A-NEXT: v_accvgpr_write_b32 a8, s12
-; FAST90A-NEXT: v_accvgpr_write_b32 a9, s13
-; FAST90A-NEXT: v_accvgpr_write_b32 a10, s14
-; FAST90A-NEXT: v_accvgpr_write_b32 a11, s15
-; FAST90A-NEXT: v_accvgpr_write_b32 a12, s16
-; FAST90A-NEXT: v_accvgpr_write_b32 a13, s17
-; FAST90A-NEXT: v_accvgpr_write_b32 a14, s18
-; FAST90A-NEXT: v_accvgpr_write_b32 a15, s19
+; FAST90A-NEXT: v_accvgpr_write_b32 a47, s19
+; FAST90A-NEXT: v_accvgpr_write_b32 a46, s18
+; FAST90A-NEXT: v_accvgpr_write_b32 a45, s17
+; FAST90A-NEXT: v_accvgpr_write_b32 a44, s16
+; FAST90A-NEXT: v_accvgpr_write_b32 a43, s15
+; FAST90A-NEXT: v_accvgpr_write_b32 a42, s14
+; FAST90A-NEXT: v_accvgpr_write_b32 a41, s13
+; FAST90A-NEXT: v_accvgpr_write_b32 a40, s12
+; FAST90A-NEXT: v_accvgpr_write_b32 a39, s11
+; FAST90A-NEXT: v_accvgpr_write_b32 a38, s10
+; FAST90A-NEXT: v_accvgpr_write_b32 a37, s9
+; FAST90A-NEXT: v_accvgpr_write_b32 a36, s8
+; FAST90A-NEXT: v_accvgpr_write_b32 a35, s7
+; FAST90A-NEXT: v_accvgpr_write_b32 a34, s6
+; FAST90A-NEXT: v_accvgpr_write_b32 a33, s5
+; FAST90A-NEXT: v_accvgpr_write_b32 a32, s4
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v1, v2, a[0:15]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[32:47], v0, v1, a[32:47]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[16:31], v0, v1, a[32:47]
; FAST90A-NEXT: s_nop 7
-; FAST90A-NEXT: s_nop 2
-; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16
-; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17
-; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18
-; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19
-; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20
-; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21
-; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22
-; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23
-; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24
-; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25
-; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26
-; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27
-; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28
+; FAST90A-NEXT: s_nop 1
+; FAST90A-NEXT: v_accvgpr_mov_b32 a1, a33
+; FAST90A-NEXT: v_accvgpr_mov_b32 a0, a32
; FAST90A-NEXT: v_accvgpr_mov_b32 a15, a29
+; FAST90A-NEXT: v_accvgpr_mov_b32 a14, a28
+; FAST90A-NEXT: v_accvgpr_mov_b32 a13, a27
+; FAST90A-NEXT: v_accvgpr_mov_b32 a12, a26
+; FAST90A-NEXT: v_accvgpr_mov_b32 a11, a25
+; FAST90A-NEXT: v_accvgpr_mov_b32 a10, a24
+; FAST90A-NEXT: v_accvgpr_mov_b32 a9, a23
+; FAST90A-NEXT: v_accvgpr_mov_b32 a8, a22
+; FAST90A-NEXT: v_accvgpr_mov_b32 a7, a21
+; FAST90A-NEXT: v_accvgpr_mov_b32 a6, a20
+; FAST90A-NEXT: v_accvgpr_mov_b32 a5, a19
+; FAST90A-NEXT: v_accvgpr_mov_b32 a4, a18
+; FAST90A-NEXT: v_accvgpr_mov_b32 a3, a17
+; FAST90A-NEXT: v_accvgpr_mov_b32 a2, a16
; FAST90A-NEXT: s_nop 1
-; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v1, v2, a[0:15]
+; FAST90A-NEXT: v_mfma_f32_16x16x1f32 a[0:15], v0, v1, a[0:15]
+; FAST90A-NEXT: v_mov_b32_e32 v0, 0
; FAST90A-NEXT: s_nop 7
-; FAST90A-NEXT: s_nop 2
+; FAST90A-NEXT: s_nop 1
; FAST90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
; FAST90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
; FAST90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
More information about the llvm-commits
mailing list