[llvm-branch-commits] [llvm] AMDGPU: Allocate different registers for vdst & src in v_cvt_scalef32* (PR #117822)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Tue Nov 26 16:55:32 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

For multipass instructions, overlap on VDST and SRC’s
would result in HW race & undefined results.

Co-authored-by: Pravin Jagtap <Pravin.Jagtap@<!-- -->amd.com>

---

Patch is 68.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117822.diff


5 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+7-5) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll (+154-60) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll (+84-84) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll (+24-24) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll (+84-84) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 00caea1f923391..9ef52c0feb7233 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1088,9 +1088,11 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
   defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
   let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
     defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_F16:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
-    defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+    let Constraints = "@earlyclobber $vdst" in {
+      defm V_CVT_SCALEF32_SR_PK_FP4_F16:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
+      defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
+      defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+    }
   }
   defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
   defm V_CVT_SCALEF32_PK_BF16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_bf16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2bf16>>;
@@ -1103,7 +1105,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
   }
 }
 
-let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
   defm V_CVT_SCALEF32_PK32_F32_FP6  : VOP3Inst<"v_cvt_scalef32_pk32_f32_fp6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_fp6>;
   defm V_CVT_SCALEF32_PK32_F32_BF6  : VOP3Inst<"v_cvt_scalef32_pk32_f32_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F32_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f32_bf6>;
   defm V_CVT_SCALEF32_PK32_F16_FP6  : VOP3Inst<"v_cvt_scalef32_pk32_f16_fp6",  VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32F16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_f16_fp6>;
@@ -1112,7 +1114,7 @@ let SubtargetPredicate = HasFP6BF6ConversionScaleInsts, mayRaiseFPException = 0
   defm V_CVT_SCALEF32_PK32_BF16_BF6 : VOP3Inst<"v_cvt_scalef32_pk32_bf16_bf6", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V32BF16_V6I32_F32>, int_amdgcn_cvt_scalef32_pk32_bf16_bf6>;
 }
 
-let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasF16BF16ToFP6BF6ConversionScaleInsts, mayRaiseFPException = 0, Constraints = "@earlyclobber $vdst" in {
   defm V_CVT_SCALEF32_PK32_FP6_F16   : VOP3Inst<"v_cvt_scalef32_pk32_fp6_f16",  VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>,  int_amdgcn_cvt_scalef32_pk32_fp6_f16>;
   defm V_CVT_SCALEF32_PK32_BF6_F16   : VOP3Inst<"v_cvt_scalef32_pk32_bf6_f16",  VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32F16_F32>,  int_amdgcn_cvt_scalef32_pk32_bf6_f16>;
   defm V_CVT_SCALEF32_PK32_FP6_BF16  : VOP3Inst<"v_cvt_scalef32_pk32_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V32BF16_F32>, int_amdgcn_cvt_scalef32_pk32_fp6_bf16>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index 6d627186d25816..f80f2935856e36 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -864,31 +864,91 @@ define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3(i32 %src, float %scale) {
 }
 
 define <32 x float> @test_cvt_scale_pk32_f32_fp6(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scale_pk32_f32_fp6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[32:37], v38
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x float>  @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
   ret <32 x float> %ret
 }
 
 define <32 x float> @test_cvt_scale_pk32_f32_bf6(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scale_pk32_f32_bf6:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v33, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v34, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v35, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v36, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v37, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v38, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[32:37], v38
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x float>  @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
   ret <32 x float> %ret
 }
 
 define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
   ret <32 x half> %ret
 }
@@ -897,14 +957,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl:
@@ -912,11 +972,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) {
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
 ; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[0:5], v6
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
   ret <32 x half> %ret
@@ -926,7 +986,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv:
@@ -958,14 +1025,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl:
@@ -1000,11 +1067,31 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) {
 }
 
 define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv(<6 x i32> %src, float %scale) {
-; GCN-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
+; GFX950-SDAG:       ; %bb.0:
+; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv:
+; GFX950-GISEL:       ; %bb.0:
+; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
   ret <32 x half> %ret
 }
@@ -1013,14 +1100,14 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl:
@@ -1028,11 +1115,11 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) {
 ; GFX950-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-GISEL-NEXT:    s_mov_b32 s4, s16
 ; GFX950-GISEL-NEXT:    s_mov_b32 s5, s17
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v6, 0x42c80000
-; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[0:5], v6
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT:    v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
 ; GFX950-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
   ret <32 x half> %ret
@@ -1042,7 +1129,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v22, v6
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, v5
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, v4
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, v1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv:
@@ -1074,14 +1168,14 @@ define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) {
 ; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
 ; GFX950-SDAG:       ; %bb.0:
 ; GFX950-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v3, s3
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v4, s16
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v5, s17
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v21, s17
 ; GFX950-SDAG-NEXT:    s_mov_b32 s0, 0x42c80000
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[0:5], s0
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
 ; GFX950-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
index 4153bc8f43563b..f9fd7e253b1243 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
@@ -10,24 +10,24 @@ declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float
 define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv:
 ; GFX950-SDAG:       ; %bb.0:
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v19, v18
-; GFX950-SDAG-NEXT:    v_mov_b32_e32 v18, v17
-; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[0:5], v[0:15], v16
-; GFX950-SDAG-NEXT:    global_store_dwordx2 v[18:19], v[4:5], off offset:16
-; GFX950-SDAG-NEXT:    global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v25, v18
+; GFX950-SDAG-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-SDAG-NEXT:    v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16
+; GFX950-SDAG-NEXT:    global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT:    global_store_dwordx4 v[24:25], v[18:21], off
 ; GFX950-SDAG-NEXT:    s_endpgm
 ;
 ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv:
 ; GFX950-GISEL:       ; %bb.0:
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v20, v17
-; GFX950-GISEL-NEXT:    v_mov_b32_e32 v21, v18
+; GFX950-GISEL-NEXT:    v_mov_b32_e32 v24, v17
+; GFX950-GISEL-NEXT:    v_mov_b...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/117822


More information about the llvm-branch-commits mailing list