[llvm-branch-commits] [clang] [llvm] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions (PR #151758)
Stanislav Mekhanoshin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Aug 1 13:15:58 PDT 2025
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151758
>From 159959bda89857ab52cf656b1e3fca19bf662e79 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Fri, 1 Aug 2025 12:38:04 -0700
Subject: [PATCH] [AMDGPU] gfx1250 v_cvt_scalef32_pk8_* instructions
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 9 +
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 94 ++++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 14 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 6 +
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 23 +
.../AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll | 403 ++++++++++++++++++
llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 54 +++
llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 54 +++
.../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 54 +++
10 files changed, 718 insertions(+), 2 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0c4a485d60936..e117e993fc572 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -716,6 +716,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, "V8yUiUiIUi", "nc", "gfx
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16, "V2UiV8yf", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16, "V2UiV8yf", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f16, "V2UiV8hf", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f16, "V2UiV8hf", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp8_f32, "V2UiV8ff", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_bf8_f32, "V2UiV8ff", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f32, "UiV8ff", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_f16, "UiV8hf", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16, "UiV8yf", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 2fd816cebd365..150c6ce0b76ee 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -674,6 +674,100 @@ void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
*outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
}
+// CHECK-LABEL: @test_cvt_scalef32_pk(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRCBF8_ADDR:%.*]] = alloca <8 x bfloat>, align 16, addrspace(5)
+// CHECK-NEXT: [[SRCH8_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
+// CHECK-NEXT: [[SRCF8_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5)
+// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRCBF16_ADDR:%.*]] = alloca <16 x bfloat>, align 32, addrspace(5)
+// CHECK-NEXT: [[SRCH16_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5)
+// CHECK-NEXT: [[SRCF16_ADDR:%.*]] = alloca <16 x float>, align 64, addrspace(5)
+// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[OUT2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT2_ADDR]] to ptr
+// CHECK-NEXT: [[SRCBF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF8_ADDR]] to ptr
+// CHECK-NEXT: [[SRCH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH8_ADDR]] to ptr
+// CHECK-NEXT: [[SRCF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF8_ADDR]] to ptr
+// CHECK-NEXT: [[OUT3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT3_ADDR]] to ptr
+// CHECK-NEXT: [[SRCBF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCBF16_ADDR]] to ptr
+// CHECK-NEXT: [[SRCH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCH16_ADDR]] to ptr
+// CHECK-NEXT: [[SRCF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRCF16_ADDR]] to ptr
+// CHECK-NEXT: [[OUT1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT1_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT2:%.*]], ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x bfloat> [[SRCBF8:%.*]], ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store <8 x half> [[SRCH8:%.*]], ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store <8 x float> [[SRCF8:%.*]], ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store ptr addrspace(1) [[OUT3:%.*]], ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <16 x bfloat> [[SRCBF16:%.*]], ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <16 x half> [[SRCH16:%.*]], ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: store <16 x float> [[SRCF16:%.*]], ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: store ptr addrspace(1) [[OUT1:%.*]], ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> [[TMP0]], float [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(1) [[TMP3]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> [[TMP4]], float [[TMP5]])
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> [[TMP8]], float [[TMP9]])
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP10]], ptr addrspace(1) [[TMP11]], align 8
+// CHECK-NEXT: [[TMP12:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> [[TMP12]], float [[TMP13]])
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8
+// CHECK-NEXT: [[TMP16:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> [[TMP16]], float [[TMP17]])
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP18]], ptr addrspace(1) [[TMP19]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> [[TMP20]], float [[TMP21]])
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8
+// CHECK-NEXT: [[TMP24:%.*]] = load <8 x float>, ptr [[SRCF8_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> [[TMP24]], float [[TMP25]])
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP26]], ptr addrspace(1) [[TMP27]], align 4
+// CHECK-NEXT: [[TMP28:%.*]] = load <8 x half>, ptr [[SRCH8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP29:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> [[TMP28]], float [[TMP29]])
+// CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP30]], ptr addrspace(1) [[TMP31]], align 4
+// CHECK-NEXT: [[TMP32:%.*]] = load <8 x bfloat>, ptr [[SRCBF8_ADDR_ASCAST]], align 16
+// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP34:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> [[TMP32]], float [[TMP33]])
+// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store i32 [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
+ global uint3 *out3, bfloat16 srcbf16, half16 srch16, float16 srcf16,
+ global uint *out1, float scale)
+{
+ *out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_bf16(srcbf8, scale);
+ *out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_bf16(srcbf8, scale);
+ *out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f16(srch8, scale);
+ *out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f16(srch8, scale);
+ *out2 = __builtin_amdgcn_cvt_scalef32_pk8_fp8_f32(srcf8, scale);
+ *out2 = __builtin_amdgcn_cvt_scalef32_pk8_bf8_f32(srcf8, scale);
+ *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f32(srcf8, scale);
+ *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_f16(srch8, scale);
+ *out1 = __builtin_amdgcn_cvt_scalef32_pk8_fp4_bf16(srcbf8, scale);
+}
+
// CHECK-LABEL: @test_sat_pk4_i4_i8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index eabdf521bb6e8..e85f9864cb1ce 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -662,10 +662,17 @@ class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMTy
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
+def int_amdgcn_cvt_scalef32_pk8_fp8_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_fp8_bf16">;
+def int_amdgcn_cvt_scalef32_pk8_bf8_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_bf8_bf16">;
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_bf6_bf16">;
-def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
-def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
+def int_amdgcn_cvt_scalef32_pk8_fp8_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_fp8_f16">;
+def int_amdgcn_cvt_scalef32_pk8_bf8_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_bf8_f16">;
+def int_amdgcn_cvt_scalef32_pk8_fp8_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_fp8_f32">;
+def int_amdgcn_cvt_scalef32_pk8_bf8_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_v2i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_bf8_f32">;
+def int_amdgcn_cvt_scalef32_pk8_fp4_f32 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_pk8_fp4_f32">;
+def int_amdgcn_cvt_scalef32_pk8_fp4_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_pk8_fp4_f16">;
+def int_amdgcn_cvt_scalef32_pk8_fp4_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_pk8_fp4_bf16">;
def int_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_sr_pk32_bf6_bf16">;
def int_amdgcn_cvt_scalef32_sr_pk32_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_bf6_f16">;
@@ -674,6 +681,9 @@ def int_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm
def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_sr_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_sr_pk32_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v6i32_ty, llvm_v32f32_ty, "cvt_scalef32_sr_pk32_fp6_f32">;
+def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
+def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
+
class AMDGPUCvtScaleFP4FP8BF8ToF1632Intrinsic<LLVMType DstTy, string name> : DefaultAttrsIntrinsic<
[DstTy],
[llvm_i32_ty, // src
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5aa0ebfcce0e8..0894e26a9a42d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4603,6 +4603,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_bf8_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_pk8_fp4_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a3e20baa9e298..38b609ca47f90 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2955,6 +2955,12 @@ def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>;
def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>;
def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>;
def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>;
+def VOP_V2I32_V8BF16_F32 : VOPProfile<[v2i32, v8bf16, f32, untyped]>;
+def VOP_V2I32_V8F16_F32 : VOPProfile<[v2i32, v8f16, f32, untyped]>;
+def VOP_V2I32_V8F32_F32 : VOPProfile<[v2i32, v8f32, f32, untyped]>;
+def VOP_I32_V8F32_F32 : VOPProfile<[i32, v8f32, f32, untyped]>;
+def VOP_I32_V8F16_F32 : VOPProfile<[i32, v8f16, f32, untyped]>;
+def VOP_I32_V8BF16_F32 : VOPProfile<[i32, v8bf16, f32, untyped]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 19ce7f58f312c..f1ed9380f8449 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1778,6 +1778,20 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>;
} // End ReadsModeReg = 0
+ let Constraints = "@earlyclobber $vdst" in {
+ let WaveSizePredicate = isWave32 in {
+ defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_bf16>;
+ defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_bf16>;
+ defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f16>;
+ defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f16>;
+ defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp8_f32>;
+ defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Inst<"v_cvt_scalef32_pk8_bf8_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V2I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_bf8_f32>;
+ defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F32_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f32>;
+ defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_f16>;
+ defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_F32>, int_amdgcn_cvt_scalef32_pk8_fp4_bf16>;
+ } // End WaveSizePredicate = isWave32
+ } // End Constraints = "@earlyclobber $vdst"
+
let True16Predicate = UseRealTrue16Insts in {
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f16, V_CVT_SR_FP8_F16_t16_e64, f16>;
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f16, V_CVT_SR_BF8_F16_t16_e64, f16>;
@@ -2198,6 +2212,15 @@ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>;
defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>;
defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>;
defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>;
+defm V_CVT_SCALEF32_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x2b0>;
+defm V_CVT_SCALEF32_PK8_FP4_F16 : VOP3Only_Real_Base_gfx1250<0x2b3>;
+defm V_CVT_SCALEF32_PK8_FP8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b4>;
+defm V_CVT_SCALEF32_PK8_BF8_BF16 : VOP3Only_Real_Base_gfx1250<0x2b5>;
+defm V_CVT_SCALEF32_PK8_FP4_BF16 : VOP3Only_Real_Base_gfx1250<0x2b8>;
+defm V_CVT_SCALEF32_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x2c3>;
+defm V_CVT_SCALEF32_PK8_FP8_F16 : VOP3Only_Real_Base_gfx1250<0x2c4>;
+defm V_CVT_SCALEF32_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x2c5>;
+defm V_CVT_SCALEF32_PK8_BF8_F16 : VOP3Only_Real_Base_gfx1250<0x2c6>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;
defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll
new file mode 100644
index 0000000000000..cd0b081bf6f10
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll
@@ -0,0 +1,403 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-GISEL %s
+
+declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> %src, float %scale)
+declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %src, float %scale)
+declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %src, float %scale)
+declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %src, float %scale)
+declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %src, float %scale)
+declare <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %src, float %scale)
+declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %src, float %scale)
+declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %src, float %scale)
+declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float %scale)
+
+define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4
+; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4
+; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> %src, float %scale)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_bf16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.bf16(<8 x bfloat> %src, float 100.0)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4
+; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4
+; GFX1250-GISEL-NEXT: global_store_b64 v[6:7], v[8:9], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %src, float %scale)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_bf16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.bf16(<8 x bfloat> %src, float 100.0)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[8:9], v[0:3], v4
+; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[0:3], v4
+; GFX1250-GISEL-NEXT: global_store_b64 v[8:9], v[6:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %src, float %scale)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f16(<8 x half> %src, float 100.0)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[8:9], v[0:3], v4
+; GFX1250-SDAG-NEXT: global_store_b64 v[6:7], v[8:9], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[0:3], v4
+; GFX1250-GISEL-NEXT: global_store_b64 v[8:9], v[6:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %src, float %scale)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[2:5], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f16(<8 x half> %src, float 100.0)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[12:13], v[0:7], v8
+; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[0:7], v8
+; GFX1250-GISEL-NEXT: global_store_b64 v[12:13], v[10:11], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %src, float %scale)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[2:9], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[2:9], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.bf8.f32(<8 x float> %src, float 100.0)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[12:13], v[0:7], v8
+; GFX1250-SDAG-NEXT: global_store_b64 v[10:11], v[12:13], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[0:7], v8
+; GFX1250-GISEL-NEXT: global_store_b64 v[12:13], v[10:11], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %src, float %scale)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[2:9], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[10:11], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[2:9], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[10:11], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <2 x i32> @llvm.amdgcn.cvt.scalef32.pk8.fp8.f32(<8 x float> %src, float 100.0)
+ store <2 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8
+; GFX1250-SDAG-NEXT: global_store_b32 v[10:11], v9, off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8
+; GFX1250-GISEL-NEXT: global_store_b32 v[12:13], v9, off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %src, float %scale)
+ store i32 %cvt, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f32 v10, v[2:9], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v10, off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f32 v10, v[2:9], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v10, off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f32(<8 x float> %src, float 100.0)
+ store i32 %cvt, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4
+; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v5, off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4
+; GFX1250-GISEL-NEXT: global_store_b32 v[8:9], v5, off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %src, float %scale)
+ store i32 %cvt, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_f16 v6, v[2:5], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_f16 v6, v[2:5], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.f16(<8 x half> %src, float 100.0)
+ store i32 %cvt, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4
+; GFX1250-SDAG-NEXT: global_store_b32 v[6:7], v5, off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4
+; GFX1250-GISEL-NEXT: global_store_b32 v[6:7], v5, off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float %scale)
+ store i32 %cvt, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v6, v[2:5], 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b32 v[0:1], v6, off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_pk8_fp4_bf16 v6, v[2:5], 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b32 v[0:1], v6, off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float 100.0)
+ store i32 %cvt, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
index 887d4847c29a3..9a63afc2c3fed 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
@@ -886,3 +886,57 @@ v_permlane_idx_gen_b32 v5, v1, exec_hi
v_permlane_idx_gen_b32 v5, v1, exec_lo
// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00]
+
+v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xb4,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xb5,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xc4,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xc4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xc6,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xc6,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8
+// GFX1250: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xc3,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xc3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8
+// GFX1250: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xc5,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 100.0
+// GFX1250: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xc5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8
+// GFX1250: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0xb3,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
index c1d23beba3f7f..7f1185906d3f9 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
@@ -886,3 +886,57 @@ v_permlane_idx_gen_b32 v5, v1, exec_hi
v_permlane_idx_gen_b32 v5, v1, exec_lo
// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00]
+
+v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xb4,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xb5,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xc4,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xc4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xc6,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xc6,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8
+// GFX1250: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xc3,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xc3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8
+// GFX1250: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xc5,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 100.0
+// GFX1250: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xc5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8
+// GFX1250: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0xb3,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8
+// GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 100.0
+// GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
index e4968fef71935..53b795844b5d2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
@@ -937,3 +937,57 @@
0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00]
+
+0x0a,0x00,0xb4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xb4,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_fp8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xb4,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xb5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xb5,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_bf8_bf16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xb5,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xc4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xc4,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xc4,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_fp8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xc4,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xc6,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xc6,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xc6,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_bf8_f16 v[10:11], v[20:23], v8 ; encoding: [0x0a,0x00,0xc6,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xc3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xc3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xc3,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xc3,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xc5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xc5,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xc5,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[20:27], v8 ; encoding: [0x0a,0x00,0xc5,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xb0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], 0x42c80000 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xb0,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_fp4_f32 v10, v[20:27], v8 ; encoding: [0x0a,0x00,0xb0,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xb3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb3,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xb3,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_fp4_f16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0xb3,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xb8,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42
+# GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], 0x42c80000 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0xff,0x01,0x00,0x00,0x00,0xc8,0x42]
+
+0x0a,0x00,0xb8,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scalef32_pk8_fp4_bf16 v10, v[20:23], v8 ; encoding: [0x0a,0x00,0xb8,0xd6,0x14,0x11,0x02,0x00]
More information about the llvm-branch-commits
mailing list