[llvm] AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. (PR #127464)
Pravin Jagtap via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 08:00:33 PST 2025
https://github.com/pravinjagtap updated https://github.com/llvm/llvm-project/pull/127464
From 824b47b29580fe5d8be3d93c50c6c31deabc9e34 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Mon, 17 Feb 2025 15:15:17 +0530
Subject: [PATCH 1/4] AMDGPU: Allow only VGPRs as wide sources in fp6/4/8
 conversions
All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
any operand slot providing more than 32 bits of data.
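For illustration, the intended assembler behavior (a minimal sketch; the
register numbers are arbitrary, see the gfx950_err.s additions below):

  // OK: the 64-bit src0 lives in a VGPR pair
  v_cvt_scalef32_sr_pk_fp4_f32 v0, v[2:3], v4, v5
  // Rejected: an SGPR pair is no longer a valid wide source
  v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5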
Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af
---
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 13 +++++
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 ++++-
llvm/test/MC/AMDGPU/gfx950_err.s | 63 ++++++++++++++++++++++
3 files changed, 86 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bb78e77a9dc1a..e2d0b28a391fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1818,6 +1818,18 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
1 : RegisterOperand<VGPR_32>);
}
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VSrcReg9ForVT<ValueType VT> {
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+ !eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 256) : VRegSrc_256,
+ !eq(VT.Size, 192) : VRegSrc_192,
+ !eq(VT.Size, 128) : VRegSrc_128,
+ !eq(VT.Size, 96) : VRegSrc_96,
+ !eq(VT.Size, 64) : VRegSrc_64,
+ 1 : VRegSrc_32);
+}
+
// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
RegisterOperand ret =
@@ -2852,6 +2864,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
+def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>;
def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index afafc2ecccfaf..6a6d8bd94826f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,7 +1052,10 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasFP4DstByteSel = 1;
}
-def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ getVOP3SrcForVT<P.Src0VT>.ret);
let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
Int32InputMods: $src1_modifiers, Src1RC64:$src1,
FP32InputMods: $src2_modifiers, Src2RC64:$src2,
@@ -1100,6 +1103,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
let HasExt32BitDPP = 0;
let HasExtVOP3DPP = 0;
let HasExt64BitDPP = 0;
+
+ // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
+ // any operand slots > 32 bits.
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ getVOP3SrcForVT<P.Src0VT>.ret);
}
let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1141,7 +1149,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+ defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOP_I32_V2F32_I32_F32>>;
}
}
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index e0b832d8fe297..099916f48b5e7 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -434,3 +434,66 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 clamp
// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_fp6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_bf6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5
+v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5
From 13e7dde79eabc0f1502a29cb0730505fc401d514 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Mon, 17 Feb 2025 15:57:56 +0530
Subject: [PATCH 2/4] clang-format
---
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 16 ++++++++--------
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 12 ++++++++----
2 files changed, 16 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index e2d0b28a391fb..576f6a2547fb8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1820,14 +1820,14 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
// VGPR only VOP3 src with 9 bit encoding
class getVOP3VSrcReg9ForVT<ValueType VT> {
- RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
- !eq(VT.Size, 512) : VRegSrc_512,
- !eq(VT.Size, 256) : VRegSrc_256,
- !eq(VT.Size, 192) : VRegSrc_192,
- !eq(VT.Size, 128) : VRegSrc_128,
- !eq(VT.Size, 96) : VRegSrc_96,
- !eq(VT.Size, 64) : VRegSrc_64,
- 1 : VRegSrc_32);
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024): VRegSrc_1024,
+ !eq(VT.Size, 512): VRegSrc_512,
+ !eq(VT.Size, 256): VRegSrc_256,
+ !eq(VT.Size, 192): VRegSrc_192,
+ !eq(VT.Size, 128): VRegSrc_128,
+ !eq(VT.Size, 96): VRegSrc_96,
+ !eq(VT.Size, 64): VRegSrc_64,
+ 1: VRegSrc_32);
}
// Src2 of VOP3 DPP instructions cannot be a literal
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 6a6d8bd94826f..93bce75f72deb 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,10 +1052,11 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasFP4DstByteSel = 1;
}
-class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P>
+ : VOP3_Profile<P, VOP3_OPSEL> {
let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
- getVOP3SrcForVT<P.Src0VT>.ret);
+ getVOP3SrcForVT<P.Src0VT>.ret);
let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
Int32InputMods: $src1_modifiers, Src1RC64:$src1,
FP32InputMods: $src2_modifiers, Src2RC64:$src2,
@@ -1107,7 +1108,7 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
// All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
// any operand slots > 32 bits.
let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
- getVOP3SrcForVT<P.Src0VT>.ret);
+ getVOP3SrcForVT<P.Src0VT>.ret);
}
let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1149,7 +1150,10 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
let Constraints = "@earlyclobber $vdst" in {
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
- defm V_CVT_SCALEF32_SR_PK_FP4_F32: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOP_I32_V2F32_I32_F32>>;
+ defm V_CVT_SCALEF32_SR_PK_FP4_F32
+ : VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32",
+ VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<
+ VOP_I32_V2F32_I32_F32>>;
}
}
defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
From ec892c94873046e217b2038fd6fcb1dab22b59ff Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Mon, 17 Feb 2025 20:44:46 +0530
Subject: [PATCH 3/4] Add codegen tests for SGPR-to-VGPR copies.
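These tests pass wide sources as inreg SGPR arguments, so instruction
selection has to copy them into VGPR tuples before emitting the conversion.
A minimal sketch of the pattern being checked (function name hypothetical,
intrinsic signature taken from the tests below):

  define <32 x float> @sgpr_wide_src(<6 x i32> inreg %src, float %scale) {
    ; expected lowering: v_mov_b32/v_mov_b64 copies of s[0:3]/s[16:17] into
    ; a VGPR tuple, then v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], ...
    %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
    ret <32 x float> %ret
  }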
---
.../llvm.amdgcn.cvt.scalef32.pk.gfx950.ll | 1159 +++++++++++++++++
.../AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll | 401 ++++++
2 files changed, 1560 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
index a0ba97d3b639c..1ab27337632b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll
@@ -1282,3 +1282,1162 @@ define i32 @test_cvt_scalef32_fp4_bf16_byte3(<2 x bfloat> %src0, float %scale, i
%ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 3)
ret i32 %ret
}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_f32_vv_inreg_src(<16 x float> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v1
+; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[2:17], v[2:17], v0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v1
+; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[2:17], v[2:17], v0
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float %scale)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_f32_sl_inreg_src(<16 x float> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f32_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], s16
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f32_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_fp6_f32 v[2:7], v[2:17], v[2:17], v18
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_f32_vv_inreg_src(<16 x float> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v2
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v1
+; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[2:17], v[2:17], v0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v19, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, v1
+; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[2:17], v[2:17], v0
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[0:3], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[4:5], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float %scale)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_f32_sl_inreg_src(<16 x float> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-SDAG-NEXT: s_mov_b32 s16, 0x42c80000
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], s16
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f32_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v18, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_2xpk16_bf6_f32 v[2:7], v[2:17], v[2:17], v18
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[2:5], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src, <16 x float> %src, float 100.0)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte0_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,1,0]
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte0_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte0_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,0,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte1_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[0,1,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte2_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,0,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_fp8_byte3_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_fp8_byte3_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_fp8 v1, s0, v0 op_sel:[1,1,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 true)
+ ret <2 x half> %ret
+}
+
+define float @test_cvt_scalef32_f32_fp8_byte0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 0)
+ ret float %ret
+}
+
+define float @test_cvt_scalef32_f32_fp8_byte1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 1)
+ ret float %ret
+}
+
+define float @test_cvt_scalef32_f32_fp8_byte2_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 2)
+ ret float %ret
+}
+
+define float @test_cvt_scalef32_f32_fp8_byte3_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_fp8_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_fp8 v0, s0, v0 op_sel:[1,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 %src, float %scale, i32 3)
+ ret float %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte0_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte0_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_lo_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_lo_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,1,0]
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte0_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte0_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,0,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 0, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte1_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[0,1,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 1, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte2_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,0,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 2, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_f16_bf8_byte3_dst_hi_inreg_src(i32 inreg %src, float %scale, <2 x half> %old) {
+; GCN-LABEL: test_cvt_scalef32_f16_bf8_byte3_dst_hi_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f16_bf8 v1, s0, v0 op_sel:[1,1,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> %old, i32 %src, float %scale, i32 3, i1 true)
+ ret <2 x half> %ret
+}
+
+define float @test_cvt_scalef32_f32_bf8_byte0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 0)
+ ret float %ret
+}
+
+define float @test_cvt_scalef32_f32_bf8_byte1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 1)
+ ret float %ret
+}
+
+define float @test_cvt_scalef32_f32_bf8_byte2_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 2)
+ ret float %ret
+}
+
+define float @test_cvt_scalef32_f32_bf8_byte3_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_f32_bf8_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_f32_bf8 v0, s0, v0 op_sel:[1,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call float @llvm.amdgcn.cvt.scalef32.f32.bf8(i32 %src, float %scale, i32 3)
+ ret float %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word0_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v3, v0, v1, v2
+; GCN-NEXT: v_mov_b32_e32 v0, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_fp8_f32_word1_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_fp8_f32_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_cvt_scalef32_pk_fp8_f32 v3, v0, v1, v2 op_sel:[0,0,0,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 true)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word0_inreg_src(<2 x i16> inreg %old, float %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v3, v0, v1, v2
+; GCN-NEXT: v_mov_b32_e32 v0, v3
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 false)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_bf8_f32_word1_inreg_src(<2 x i16> %old, float inreg %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf8_f32_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf8_f32 v0, s0, v1, v2 op_sel:[0,0,0,1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f32(<2 x i16> %old, float %src0, float %src1, float %scale, i1 true)
+ ret <2 x i16> %ret
+}
+
+define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 false)
+ ret <2 x float> %ret
+}
+
+define <2 x float> @test_cvt_scalef32_pk_f32_fp8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f32_fp8_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp8 v[0:1], s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp8(i32 %src, float %scale, i1 true)
+ ret <2 x float> %ret
+}
+
+define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 false)
+ ret <2 x float> %ret
+}
+
+define <2 x float> @test_cvt_scalef32_pk_f32_bf8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f32_bf8_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_bf8 v[0:1], s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.bf8(i32 %src, float %scale, i1 true)
+ ret <2 x float> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word0_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_fp8_f16_word1_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_fp8_f16_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp8_f16 v0, s0, v1 op_sel:[0,0,1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word0_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_fp8_bf16_word1_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_fp8_bf16_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp8_bf16 v0, s0, v1 op_sel:[0,0,1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word0_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 false)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_bf8_f16_word1_inreg_src(<2 x i16> %old, <2 x half> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf8_f16_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf8_f16 v0, s0, v1 op_sel:[0,0,1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> %old, <2 x half> %src, float %scale, i1 true)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word0_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 false)
+ ret <2 x i16> %ret
+}
+
+define <2 x i16> @test_cvt_scalef32_pk_bf8_bf16_word1_inreg_src(<2 x i16> %old, <2 x bfloat> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf8_bf16_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf8_bf16 v0, s0, v1 op_sel:[0,0,1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> %old, <2 x bfloat> %src, float %scale, i1 true)
+ ret <2 x i16> %ret
+}
+
+define <2 x float> @test_cvt_scale_f32_fp4_byte0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f32_fp4_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 0)
+ ret <2 x float> %ret
+}
+
+define <2 x float> @test_cvt_scale_f32_fp4_byte1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f32_fp4_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 1)
+ ret <2 x float> %ret
+}
+
+define <2 x float> @test_cvt_scale_f32_fp4_byte2_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f32_fp4_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 2)
+ ret <2 x float> %ret
+}
+
+define <2 x float> @test_cvt_scale_f32_fp4_byte3_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f32_fp4_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f32_fp4 v[0:1], s0, v0 op_sel:[1,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x float> @llvm.amdgcn.cvt.scalef32.pk.f32.fp4(i32 %src, float %scale, i32 3)
+ ret <2 x float> %ret
+}
+
+define i32 @test_cvt_scale_fp4_f32_byte0_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scale_fp4_f32_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 0)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scale_fp4_f32_byte1_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scale_fp4_f32_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 1)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scale_fp4_f32_byte2_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scale_fp4_f32_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,0,1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 2)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scale_fp4_f32_byte3_inreg_src(i32 %old, float inreg %src0, float %src1, float %scale) {
+; GCN-LABEL: test_cvt_scale_fp4_f32_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f32 v0, s0, v1, v2 op_sel:[0,0,1,1]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f32(i32 %old, float %src0, float %src1, float %scale, i32 3)
+ ret i32 %ret
+}
+
+define <2 x half> @test_cvt_scale_f16_fp4_byte0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f16_fp4_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 0)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scale_f16_fp4_byte1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f16_fp4_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 1)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scale_f16_fp4_byte2_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f16_fp4_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 2)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scale_f16_fp4_byte3_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_f16_fp4_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp4 v0, s0, v0 op_sel:[1,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp4(i32 %src, float %scale, i32 3)
+ ret <2 x half> %ret
+}
+
+define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_bf16_fp4_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 0)
+ ret <2 x bfloat> %ret
+}
+
+define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_bf16_fp4_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[0,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 1)
+ ret <2 x bfloat> %ret
+}
+
+define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte2_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_bf16_fp4_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 2)
+ ret <2 x bfloat> %ret
+}
+
+define <2 x bfloat> @test_cvt_scale_bf16_fp4_byte3_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scale_bf16_fp4_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp4 v0, s0, v0 op_sel:[1,1,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp4(i32 %src, float %scale, i32 3)
+ ret <2 x bfloat> %ret
+}
+
+define <32 x float> @test_cvt_scale_pk32_f32_fp6_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_fp6_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v39, s17
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], v32
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_fp6_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_fp6 v[0:31], v[34:39], v32
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.fp6(<6 x i32> %src, float %scale)
+ ret <32 x float> %ret
+}
+
+define <32 x float> @test_cvt_scale_pk32_f32_bf6_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GFX950-SDAG-LABEL: test_cvt_scale_pk32_f32_bf6_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v34, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v35, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v36, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v37, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v38, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v39, s17
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scale_pk32_f32_bf6_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v32, v0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f32_bf6 v[0:31], v[34:39], v32
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x float> @llvm.amdgcn.cvt.scalef32.pk32.f32.bf6(<6 x i32> %src, float %scale)
+ ret <32 x float> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[18:23], v16
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float %scale)
+ ret <32 x half> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src(<6 x i32> inreg %src) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
+; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], s0
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_fp6_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_fp6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.fp6(<6 x i32> %src, float 100.0)
+ ret <32 x half> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v16, v0
+; GCN-NEXT: v_mov_b32_e32 v18, s0
+; GCN-NEXT: v_mov_b32_e32 v19, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s2
+; GCN-NEXT: v_mov_b32_e32 v21, s3
+; GCN-NEXT: v_mov_b32_e32 v22, s16
+; GCN-NEXT: v_mov_b32_e32 v23, s17
+; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[18:23], v16
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float %scale)
+ ret <32 x bfloat> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl_inreg_src(<6 x i32> inreg %src) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v18, s2
+; GCN-NEXT: v_mov_b32_e32 v19, s3
+; GCN-NEXT: v_mov_b32_e32 v20, s16
+; GCN-NEXT: v_mov_b32_e32 v21, s17
+; GCN-NEXT: s_mov_b32 s0, 0x42c80000
+; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float 100.0)
+ ret <32 x bfloat> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v23, s17
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, v0
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[18:23], v16
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float %scale)
+ ret <32 x half> %ret
+}
+
+define <32 x half> @test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src(<6 x i32> inreg %src) {
+; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17
+; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], s0
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_f16_bf6_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, s16
+; GFX950-GISEL-NEXT: s_mov_b32 s5, s17
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[20:21], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[18:19], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v22, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_f16_bf6 v[0:15], v[16:21], v22
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %src, float 100.0)
+ ret <32 x half> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv_inreg_src(<6 x i32> inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v16, v0
+; GCN-NEXT: v_mov_b32_e32 v18, s0
+; GCN-NEXT: v_mov_b32_e32 v19, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s2
+; GCN-NEXT: v_mov_b32_e32 v21, s3
+; GCN-NEXT: v_mov_b32_e32 v22, s16
+; GCN-NEXT: v_mov_b32_e32 v23, s17
+; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[18:23], v16
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float %scale)
+ ret <32 x bfloat> %ret
+}
+
+define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl_inreg_src(<6 x i32> inreg %src) {
+; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v18, s2
+; GCN-NEXT: v_mov_b32_e32 v19, s3
+; GCN-NEXT: v_mov_b32_e32 v20, s16
+; GCN-NEXT: v_mov_b32_e32 v21, s17
+; GCN-NEXT: s_mov_b32 s0, 0x42c80000
+; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float 100.0)
+ ret <32 x bfloat> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp8 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_fp8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_fp8_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_fp8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.fp8(i32 %src, float %scale, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_bf8 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 false)
+ ret <2 x half> %ret
+}
+
+define <2 x half> @test_cvt_scalef32_pk_f16_bf8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_f16_bf8_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_f16_bf8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x half> @llvm.amdgcn.cvt.scalef32.pk.f16.bf8(i32 %src, float %scale, i1 true)
+ ret <2 x half> %ret
+}
+
+define <2 x bfloat> @test_cvt_scalef32_pk_bf16_fp8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 false)
+ ret <2 x bfloat> %ret
+}
+
+define <2 x bfloat> @test_cvt_scalef32_pk_bf16_fp8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf16_fp8_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_fp8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.fp8(i32 %src, float %scale, i1 true)
+ ret <2 x bfloat> %ret
+}
+
+define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word0_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, s0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 false)
+ ret <2 x bfloat> %ret
+}
+
+define <2 x bfloat> @test_cvt_scalef32_pk_bf16_bf8_word1_inreg_src(i32 inreg %src, float %scale) {
+; GCN-LABEL: test_cvt_scalef32_pk_bf16_bf8_word1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_bf16_bf8 v0, s0, v0 op_sel:[1,0,0]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call <2 x bfloat> @llvm.amdgcn.cvt.scalef32.pk.bf16.bf8(i32 %src, float %scale, i1 true)
+ ret <2 x bfloat> %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_f16_byte0_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 0)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_f16_byte1_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,0]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 1)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_f16_byte2_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,0,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 2)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_f16_byte3_inreg_src(<2 x half> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_f16_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_f16 v1, s0, v0 op_sel:[0,0,1,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 3)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte0_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte0_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 0)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte1_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte1_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,0]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 1)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte2_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte2_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,0,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 2)
+ ret i32 %ret
+}
+
+define i32 @test_cvt_scalef32_fp4_bf16_byte3_inreg_src(<2 x bfloat> inreg %src0, float %scale, i32 %old) {
+; GCN-LABEL: test_cvt_scalef32_fp4_bf16_byte3_inreg_src:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_scalef32_pk_fp4_bf16 v1, s0, v0 op_sel:[0,0,1,1]
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.bf16(i32 %old, <2 x bfloat> %src0, float %scale, i32 3)
+ ret i32 %ret
+}
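
For readers following the fp4 byte-select tests above, here is a minimal IR sketch (function name hypothetical; the intrinsic signature is taken from the calls in the tests) of how the trailing immediate picks the destination byte:

define i32 @sketch_fp4_byte2(i32 %old, <2 x half> %src0, float %scale) {
  ; The final i32 argument (0-3) selects which byte of %old receives the
  ; packed FP4 result; it is lowered to the op_sel bits seen in the checks.
  %ret = tail call i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32 %old, <2 x half> %src0, float %scale, i32 2)
  ret i32 %ret
}
declare i32 @llvm.amdgcn.cvt.scalef32.pk.fp4.f16(i32, <2 x half>, float, i32)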
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
index 517c87193598d..4e5b85344197a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll
@@ -294,3 +294,404 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl(<32 x half> inreg %src, ptr
store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
ret void
}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv_inreg_src(<32 x bfloat> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl_inreg_src(<32 x bfloat> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_bf16_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_f16_vv_inreg_src(<32 x half> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_bf6_f16_sl_inreg_src(<32 x half> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_bf6_f16_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_f16_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_f16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float 100.0)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv_inreg_src(<32 x bfloat> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl_inreg_src(<32 x bfloat> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_bf16_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_f16_vv_inreg_src(<32 x half> inreg %src, float %scale, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_vv_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_vv_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v1
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v0
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float %scale)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk32_fp6_f16_sl_inreg_src(<32 x half> inreg %src, ptr addrspace(1) %out) {
+; GFX950-SDAG-LABEL: test_scalef32_pk32_fp6_f16_sl_inreg_src:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, s2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, s3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, s4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, s5
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v8, s6
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v9, s7
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v10, s8
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v11, s9
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v12, s10
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v13, s11
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v14, s12
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v15, s13
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s14
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s15
+; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000
+; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], s0
+; GFX950-SDAG-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-SDAG-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_f16_sl_inreg_src:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000
+; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_f16 v[18:23], v[2:17], v24
+; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off
+; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16
+; GFX950-GISEL-NEXT: s_endpgm
+ %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.f16(<32 x half> %src, float 100.0)
+ store <6 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
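As a usage sketch of the constraint these tests exercise (function name hypothetical; intrinsic signature as used throughout the tests above), the 192-bit packed-FP6 source must end up in a VGPR tuple even when the incoming value lives in SGPRs:

define <32 x bfloat> @sketch_bf16_from_fp6(<6 x i32> %packed, float %scale) {
  ; %packed occupies 192 bits, so after this patch it is materialized into
  ; a v[N:N+5] VGPR tuple before v_cvt_scalef32_pk32_bf16_fp6 executes.
  %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %packed, float %scale)
  ret <32 x bfloat> %ret
}
declare <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32>, float)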
>From b9a5e3ba58e7d1f1e60fae3b212cec96e6387f38 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap <Pravin.Jagtap at amd.com>
Date: Thu, 20 Feb 2025 21:26:46 +0530
Subject: [PATCH 4/4] Unified with downstream changes
---
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 26 +++++++++++-----------
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 2 +-
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4 ++--
3 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 576f6a2547fb8..39d772d6816e3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1803,9 +1803,21 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
1 : VSrc_b32);
}
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VRegSrcForVT<ValueType VT> {
+ RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+ !eq(VT.Size, 512) : VRegSrc_512,
+ !eq(VT.Size, 256) : VRegSrc_256,
+ !eq(VT.Size, 192) : VRegSrc_192,
+ !eq(VT.Size, 128) : VRegSrc_128,
+ !eq(VT.Size, 96) : VRegSrc_96,
+ !eq(VT.Size, 64) : VRegSrc_64,
+ 1 : VRegSrc_32);
+}
+
// Returns the vreg register class to use for sources of VOP3 instructions for the
// given VT.
-class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
+class getVOP3VRegForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
RegisterOperand ret =
!cond(!eq(VT.Size, 128) : RegisterOperand<VReg_128>,
!eq(VT.Size, 96) : RegisterOperand<VReg_96>,
@@ -1818,18 +1830,6 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
1 : RegisterOperand<VGPR_32>);
}
-// VGPR only VOP3 src with 9 bit encoding
-class getVOP3VSrcReg9ForVT<ValueType VT> {
- RegisterOperand ret = !cond(!eq(VT.Size, 1024): VRegSrc_1024,
- !eq(VT.Size, 512): VRegSrc_512,
- !eq(VT.Size, 256): VRegSrc_256,
- !eq(VT.Size, 192): VRegSrc_192,
- !eq(VT.Size, 128): VRegSrc_128,
- !eq(VT.Size, 96): VRegSrc_96,
- !eq(VT.Size, 64): VRegSrc_64,
- 1: VRegSrc_32);
-}
-
// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
RegisterOperand ret =
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 900c91731aa1b..82cb3507e8160 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -423,7 +423,7 @@ def VOP_MADMK_F32 : VOP_MADMK <f32>;
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
// Src2 must accept the same operand types as vdst, namely VGPRs only
- let Src2RC64 = getVOP3VRegSrcForVT<Src2VT, IsTrue16, !not(IsRealTrue16)>.ret;
+ let Src2RC64 = getVOP3VRegForVT<Src2VT, IsTrue16, !not(IsRealTrue16)>.ret;
let Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 93bce75f72deb..1447804871809 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1055,7 +1055,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P>
: VOP3_Profile<P, VOP3_OPSEL> {
- let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT<P.Src0VT>.ret,
getVOP3SrcForVT<P.Src0VT>.ret);
let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
Int32InputMods: $src1_modifiers, Src1RC64:$src1,
@@ -1107,7 +1107,7 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
// All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
// any operand slots > 32 bit.
- let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+ let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VRegSrcForVT<P.Src0VT>.ret,
getVOP3SrcForVT<P.Src0VT>.ret);
}
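Note the restriction is deliberately one-sided: only the wide data operand is limited to VGPRs, while the 32-bit scale may stay scalar, as in this sketch (function name hypothetical; intrinsic as used in the tests):

define <32 x half> @sketch_scale_stays_scalar(<6 x i32> %packed) {
  ; The 32-bit scale can still be an SGPR or an inline constant; only the
  ; >32-bit source operand is restricted to VGPRs by this change.
  %ret = tail call <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32> %packed, float 100.0)
  ret <32 x half> %ret
}
declare <32 x half> @llvm.amdgcn.cvt.scalef32.pk32.f16.bf6(<6 x i32>, float)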