[llvm-branch-commits] [llvm] AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32* (PR #117822)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Tue Nov 26 17:02:24 PST 2024


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117822

>From bbc7178519e6bbb7477bc66bb8e10685d4245381 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Thu, 11 Jul 2024 05:12:42 -0400
Subject: [PATCH] AMDGPU: Allocate different registers for vdst & src in
 v_cvt_scalef32*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For multipass instructions, overlap between VDST and the SRC operands
would result in a hardware race and undefined results.

Co-authored-by: Pravin Jagtap <Pravin.Jagtap at amd.com>
---
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  12 +-
 .../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll     | 214 +++++++++++++-----
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll     | 168 +++++++-------
 .../llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll  |  48 ++--
 .../AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll  | 168 +++++++-------
 5 files changed, 353 insertions(+), 257 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 00caea1f923391..9ef52c0feb7233 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1088,9 +1088,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
   defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
   let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
     defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_F16:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+    let Constraints = "@earlyclobber $vdst" in {
+      defm V_CVT_SCALEF32_SR_PK_FP4_F16:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
+      defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
+      defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+    }
   }
   defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
   defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
@@ -1103,7 +1105,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
   }
 }
 
-let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
   defm V_CVT_SCALEF32_PK32_F32_FP6  : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>;
   defm V_CVT_SCALEF32_PK32_F32_BF6  : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>;
   defm V_CVT_SCALEF32_PK32_F16_FP6  : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6",  VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>;
@@ -1112,7 +1114,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0
   defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>;
 }
 
-let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
   defm V_CVT_SCALEF32_PK32_FP6_F16   : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16",  VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>,  int_amdgcn_cvt_scalef32_pk32_fp6_f16>;
   defm V_CVT_SCALEF32_PK32_BF6_F16   : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16",  VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>,  int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
   defm V_CVT_SCALEF32_PK32_FP6_BF16  : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index 6d627186d25816..f80f2935856e36 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) {
 }
 
 define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scale_pk32_f32_fp6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x float>  @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
   ret <32 x float> %ret
 }
 
 define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scale_pk32_f32_bf6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x float>  @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
   ret <32 x float> %ret
 }
 
 define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
   ret <32 x half> %ret
 }
@@ -897,14 +957,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
@@ -912,11 +972,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
 ; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
   ret <32 x half> %ret
@@ -926,7 +986,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
@@ -958,14 +1025,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
@@ -1000,11 +1067,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
 }
 
 define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
   ret <32 x half> %ret
 }
@@ -1013,14 +1100,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
@@ -1028,11 +1115,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
 ; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
   ret <32 x half> %ret
@@ -1042,7 +1129,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
@@ -1074,14 +1168,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
index 4153bc8f43563b..f9fd7e253b1243 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
@@ -10,24 +10,24 @@ declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float
 define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v18
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v17
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v18
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v18
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v24, 16, v5
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v6
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
@@ -40,10 +40,10 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
@@ -54,9 +54,9 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[20:21], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[20:21], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -83,9 +83,9 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[2:7], v[2:17], s0
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl:
@@ -162,10 +162,10 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[2:7], v[2:17], v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -175,20 +175,20 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src,
 define amdgpu_ps void @test_scalef32_pk32_bf6_f16_vv(<32 x half> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v18
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v17
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[0:5], v[0:15], v16
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[0:15], v16
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v18
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[0:5], v[0:15], v16
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[20:21], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[20:21], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v18
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[0:15], v16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -215,9 +215,9 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl(<32 x half> inreg %src, ptr
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[2:7], v[2:17], s0
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_sl:
@@ -230,10 +230,10 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl(<32 x half> inreg %src, ptr
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[2:7], v[2:17], v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -243,24 +243,24 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl(<32 x half> inreg %src, ptr
 define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v18
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v17
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[0:5], v[0:15], v16
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[0:15], v16
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v18
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v18
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v19, 16, v2
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v3
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v4
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v24, 16, v5
-; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v25, 16, v6
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v21, 16, v4
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v23, 16, v6
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v27, 16, v8
 ; GFX950-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 16, v9
@@ -273,10 +273,10 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
-; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
+; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
@@ -287,9 +287,9 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[0:5], v[0:15], v16
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[20:21], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[20:21], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[0:15], v16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -316,9 +316,9 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[2:7], v[2:17], s0
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl:
@@ -395,10 +395,10 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[2:7], v[2:17], v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -408,20 +408,20 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src,
 define amdgpu_ps void @test_scalef32_pk32_fp6_f16_vv(<32 x half> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v18
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v17
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[0:5], v[0:15], v16
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[0:15], v16
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v18
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[0:5], v[0:15], v16
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[20:21], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[20:21], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v25, v18
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[0:15], v16
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -448,9 +448,9 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[2:7], v[2:17], s0
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_sl:
@@ -463,10 +463,10 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[2:7], v[2:17], v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
index b64ca3fb67f605..1107b46f8f6d38 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll
@@ -11,8 +11,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_0(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 0)
@@ -25,8 +25,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_1(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4 op_sel:[0,0,1,0]
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,0]
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 1)
@@ -39,8 +39,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_2(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4 op_sel:[0,0,0,1]
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,0,1]
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 2)
@@ -53,8 +53,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f16_dst_sel_3(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v5, v2, v3, v4 op_sel:[0,0,1,1]
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f16 v6, v2, v3, v4 op_sel:[0,0,1,1]
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 3)
@@ -67,8 +67,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_0(ptr addrspace(1) %
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 0)
@@ -81,8 +81,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_1(ptr addrspace(1) %
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4 op_sel:[0,0,1,0]
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,0]
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 1)
@@ -95,8 +95,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_2(ptr addrspace(1) %
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4 op_sel:[0,0,0,1]
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,0,1]
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 2)
@@ -109,8 +109,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_bf16_dst_sel_3(ptr addrspace(1) %
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v5, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v5, v2, v3, v4 op_sel:[0,0,1,1]
-; GFX950-NEXT:    global_store_dword v[0:1], v5, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_bf16 v6, v2, v3, v4 op_sel:[0,0,1,1]
+; GFX950-NEXT:    global_store_dword v[0:1], v6, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 3)
@@ -123,8 +123,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_0(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5
-; GFX950-NEXT:    global_store_dword v[0:1], v6, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5
+; GFX950-NEXT:    global_store_dword v[0:1], v7, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 0)
@@ -137,8 +137,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_1(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5 op_sel:[0,0,1,0]
-; GFX950-NEXT:    global_store_dword v[0:1], v6, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,0]
+; GFX950-NEXT:    global_store_dword v[0:1], v7, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 1)
@@ -151,8 +151,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_2(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5 op_sel:[0,0,0,1]
-; GFX950-NEXT:    global_store_dword v[0:1], v6, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,0,1]
+; GFX950-NEXT:    global_store_dword v[0:1], v7, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 2)
@@ -165,8 +165,8 @@ define amdgpu_ps void @test_scalef32_sr_pk_fp4_f32_dst_sel_3(ptr addrspace(1) %o
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    global_load_dword v6, v[0:1], off
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v6, v[2:3], v4, v5 op_sel:[0,0,1,1]
-; GFX950-NEXT:    global_store_dword v[0:1], v6, off
+; GFX950-NEXT:    v_cvt_scalef32_sr_pk_fp4_f32 v7, v[2:3], v4, v5 op_sel:[0,0,1,1]
+; GFX950-NEXT:    global_store_dword v[0:1], v7, off
 ; GFX950-NEXT:    s_endpgm
   %old = load i32, ptr addrspace(1) %out, align 4
   %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f32(i32 %old, <2 x float> %src, i32 %seed, float %scale, i32 3)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
index 3e9ac6cbe3ba6e..0d4598f316c411 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll
@@ -12,9 +12,9 @@ declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i
 define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[0:15], v16, v17
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], v[0:15], v16, v17
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv:
@@ -52,9 +52,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[0:5], v[0:15], v16, v17
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], v[0:15], v16, v17
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -80,10 +80,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[2:7], v[2:17], s16, v18
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[18:23], v[2:17], s16, v24
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_sl:
@@ -160,10 +160,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[2:7], v[2:17], s16, v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_bf16 v[18:23], v[2:17], s16, v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -173,16 +173,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr
 define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_vv(<32 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_f16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[0:15], v16, v17
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], v[0:15], v16, v17
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[0:5], v[0:15], v16, v17
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], v[0:15], v16, v17
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -208,10 +208,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_sl(<32 x half> inreg %src,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[2:7], v[2:17], s16, v18
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[18:23], v[2:17], s16, v24
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f16_sl:
@@ -224,10 +224,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_sl(<32 x half> inreg %src,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[2:7], v[2:17], s16, v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f16 v[18:23], v[2:17], s16, v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -237,9 +237,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f16_sl(<32 x half> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_vv(<32 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_fp6_bf16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[0:15], v16, v17
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], v[0:15], v16, v17
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_vv:
@@ -277,9 +277,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_vv(<32 x bfloat> %src, i32
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0
 ; GFX950-GISEL-NEXT:    s_nop 0
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[0:5], v[0:15], v16, v17
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], v[0:15], v16, v17
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -305,10 +305,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[2:7], v[2:17], s16, v18
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[18:23], v[2:17], s16, v24
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_sl:
@@ -385,10 +385,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[2:7], v[2:17], s16, v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_bf16 v[18:23], v[2:17], s16, v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -398,16 +398,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr
 define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_vv(<32 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_fp6_f16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[0:15], v16, v17
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], v[0:15], v16, v17
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[0:5], v[0:15], v16, v17
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], v[0:15], v16, v17
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[18:19], v[20:23], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[18:19], v[24:25], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -433,10 +433,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_sl(<32 x half> inreg %src,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v15, s13
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s14
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s15
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[2:7], v[2:17], s16, v18
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[18:23], v[2:17], s16, v24
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f16_sl:
@@ -449,10 +449,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_sl(<32 x half> inreg %src,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[2:7], v[2:17], s16, v18
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f16 v[18:23], v[2:17], s16, v24
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[22:23], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f16(<32 x half> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -462,16 +462,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f16_sl(<32 x half> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_vv(<32 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_bf6_f32_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[0:31], v32, v33
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[34:35], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[34:35], v[0:3], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], v[0:31], v32, v33
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[34:35], v[40:41], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[34:35], v[36:39], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f32_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[0:31], v32, v33
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[34:35], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[34:35], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], v[0:31], v32, v33
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[34:35], v[36:39], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[34:35], v[40:41], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -513,10 +513,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_sl(<32 x float> inreg %src,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v31, s29
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v32, s30
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v33, s31
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v34, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[2:7], v[2:33], s32, v34
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v40, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[34:39], v[2:33], s32, v40
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[38:39], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[34:37], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_f32_sl:
@@ -537,10 +537,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_sl(<32 x float> inreg %src,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v34, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[2:7], v[2:33], s32, v34
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v40, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_bf6_f32 v[34:39], v[2:33], s32, v40
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[34:37], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[38:39], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f32(<32 x float> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -550,16 +550,16 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_f32_sl(<32 x float> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f32_vv(<32 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_sr_pk32_fp6_f32_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[0:31], v32, v33
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[34:35], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[34:35], v[0:3], off
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], v[0:31], v32, v33
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[34:35], v[40:41], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[34:35], v[36:39], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f32_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[0:31], v32, v33
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[34:35], v[0:3], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[34:35], v[4:5], off offset:16
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], v[0:31], v32, v33
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[34:35], v[36:39], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[34:35], v[40:41], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i32 %sr, float %scale)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
@@ -601,10 +601,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f32_sl(<32 x float> inreg %src,
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v31, s29
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v32, s30
 ; GFX950-SDAG-NEXT:    v_mov_b32_e32 v33, s31
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v34, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[2:7], v[2:33], s32, v34
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v40, 0x42c80000
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[34:39], v[2:33], s32, v40
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[38:39], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[34:37], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_f32_sl:
@@ -625,10 +625,10 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_f32_sl(<32 x float> inreg %src,
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v34, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[2:7], v[2:33], s32, v34
-; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
-; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v40, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_sr_pk32_fp6_f32 v[34:39], v[2:33], s32, v40
+; GFX950-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[34:37], off
+; GFX950-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[38:39], off offset:16
 ; GFX950-GISEL-NEXT:    s_endpgm
   %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.f32(<32 x float> %src, i32 %sr, float 100.0)
   store <6 x i32> %cvt, ptr addrspace(1) %out, align 8



More information about the llvm-branch-commits mailing list