[clang] [llvm] [AMDGPU] Add v_cvt_scale_pk8_* gfx1250 instructions (PR #151616)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 31 18:05:57 PDT 2025
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/151616
>From ab3017a04a86f64329c37bf3ba22dc17a7630f3e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Thu, 31 Jul 2025 17:14:33 -0700
Subject: [PATCH] [AMDGPU] Add v_cvt_scale_pk8_* gfx1250 instructions
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 9 +
clang/lib/Sema/SemaAMDGPU.cpp | 10 ++
.../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 111 ++++++++++++
.../builtins-amdgcn-error-gfx1250-param.cl | 40 +++++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 23 ++-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 +
.../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 6 +
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 10 ++
.../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 2 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 11 ++
llvm/lib/Target/AMDGPU/VOP3Instructions.td | 39 +++++
llvm/lib/Target/AMDGPU/VOPInstructions.td | 14 ++
.../AMDGPU/llvm.amdgcn.cvt.scale.pk.ll | 164 ++++++++++++++++++
llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s | 81 +++++++++
llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s | 81 +++++++++
llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s | 40 +++++
.../Disassembler/AMDGPU/gfx1250_dasm_vop3.txt | 84 +++++++++
17 files changed, 730 insertions(+), 4 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 1879a9da753e5..bb3953ea1253d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -707,6 +707,15 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f16, "sV2h", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_bf8_f16, "sV2h", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f16, "ihiUiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_bf8_f16, "ihiUiIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_fp8, "V8hV2UiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp8, "V8yV2UiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_bf8, "V8hV2UiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_bf8, "V8yV2UiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f16_fp4, "V8hUiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_bf16_fp4, "V8yUiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_bf8, "V8fV2UiUiIUi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scale_pk8_f32_fp4, "V8fUiUiIUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp
index c23c98aa3aaeb..8580de2f0c03c 100644
--- a/clang/lib/Sema/SemaAMDGPU.cpp
+++ b/clang/lib/Sema/SemaAMDGPU.cpp
@@ -84,6 +84,16 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID,
return checkMovDPPFunctionCall(TheCall, 2, 1);
case AMDGPU::BI__builtin_amdgcn_update_dpp: {
return checkMovDPPFunctionCall(TheCall, 6, 2);
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_fp8:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_fp8:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_bf8:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_bf8:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f16_fp4:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_bf16_fp4:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp8:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_bf8:
+ case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk8_f32_fp4:
+ return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 7);
}
default:
return false;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 67cb742ea32ef..51ab970655b4a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -7,8 +7,20 @@
typedef unsigned int uint;
typedef unsigned short int ushort;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
+typedef unsigned int __attribute__((ext_vector_type(3))) uint3;
+typedef unsigned int __attribute__((ext_vector_type(4))) uint4;
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
+typedef __bf16 __attribute__((ext_vector_type(8))) bfloat8;
+typedef __bf16 __attribute__((ext_vector_type(16))) bfloat16;
+typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
typedef half __attribute__((ext_vector_type(2))) half2;
+typedef half __attribute__((ext_vector_type(8))) half8;
+typedef half __attribute__((ext_vector_type(16))) half16;
+typedef half __attribute__((ext_vector_type(32))) half32;
+typedef float __attribute__((ext_vector_type(8))) float8;
+typedef float __attribute__((ext_vector_type(16))) float16;
+typedef float __attribute__((ext_vector_type(32))) float32;
+typedef short __attribute__((ext_vector_type(2))) short2;
// CHECK-LABEL: @test_setprio_inc_wg(
// CHECK-NEXT: entry:
@@ -563,6 +575,105 @@ void test_cvt_sr_fp8_f16(global int* out, half a, short sr, int old)
*out = __builtin_amdgcn_cvt_sr_fp8_f16(a, sr, old, 3);
}
+// CHECK-LABEL: @test_cvt_scale_pk(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUTH8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OUTY8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// CHECK-NEXT: [[OUTF32_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OUTF8_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OUTH16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OUTY16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OUTF16_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC3_ADDR:%.*]] = alloca <3 x i32>, align 16, addrspace(5)
+// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[OUTH8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH8_ADDR]] to ptr
+// CHECK-NEXT: [[OUTY8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY8_ADDR]] to ptr
+// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
+// CHECK-NEXT: [[OUTF32_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF32_ADDR]] to ptr
+// CHECK-NEXT: [[OUTF8_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF8_ADDR]] to ptr
+// CHECK-NEXT: [[OUTH16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTH16_ADDR]] to ptr
+// CHECK-NEXT: [[OUTY16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTY16_ADDR]] to ptr
+// CHECK-NEXT: [[OUTF16_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUTF16_ADDR]] to ptr
+// CHECK-NEXT: [[SRC3_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC3_ADDR]] to ptr
+// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
+// CHECK-NEXT: [[SCALE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SCALE_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUTH8:%.*]], ptr [[OUTH8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUTY8:%.*]], ptr [[OUTY8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <2 x i32> [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUTF32:%.*]], ptr [[OUTF32_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUTF8:%.*]], ptr [[OUTF8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUTH16:%.*]], ptr [[OUTH16_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUTY16:%.*]], ptr [[OUTY16_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[OUTF16:%.*]], ptr [[OUTF16_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[SRC3:%.*]], ptr [[SRC3_ADDR_ASCAST]], align 16
+// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: store i32 [[SCALE:%.*]], ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> [[TMP0]], i32 [[TMP1]], i32 4)
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x half> [[TMP2]], ptr addrspace(1) [[TMP3]], align 16
+// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> [[TMP4]], i32 [[TMP5]], i32 5)
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x bfloat> [[TMP6]], ptr addrspace(1) [[TMP7]], align 16
+// CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> [[TMP8]], i32 [[TMP9]], i32 6)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 16
+// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> [[TMP12]], i32 [[TMP13]], i32 7)
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x bfloat> [[TMP14]], ptr addrspace(1) [[TMP15]], align 16
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 [[TMP16]], i32 [[TMP17]], i32 1)
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUTH8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x half> [[TMP18]], ptr addrspace(1) [[TMP19]], align 16
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 [[TMP20]], i32 [[TMP21]], i32 2)
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUTY8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x bfloat> [[TMP22]], ptr addrspace(1) [[TMP23]], align 16
+// CHECK-NEXT: [[TMP24:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> [[TMP24]], i32 [[TMP25]], i32 5)
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x float> [[TMP26]], ptr addrspace(1) [[TMP27]], align 32
+// CHECK-NEXT: [[TMP28:%.*]] = load <2 x i32>, ptr [[SRC2_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> [[TMP28]], i32 [[TMP29]], i32 6)
+// CHECK-NEXT: [[TMP31:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x float> [[TMP30]], ptr addrspace(1) [[TMP31]], align 32
+// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP34:%.*]] = call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 [[TMP32]], i32 [[TMP33]], i32 7)
+// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr [[OUTF8_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <8 x float> [[TMP34]], ptr addrspace(1) [[TMP35]], align 32
+// CHECK-NEXT: ret void
+//
+void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
+ global float32 *outf32, global float8 *outf8,
+ global half16 *outh16, global bfloat16 *outy16,
+ global float16 *outf16, uint3 src3,
+ uint src1, uint scale)
+{
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 4);
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 5);
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 6);
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 7);
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 1);
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 2);
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 5);
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 6);
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 7);
+}
+
// CHECK-LABEL: @test_sat_pk4_i4_i8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 32473808208f8..83c63f1465a8b 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,7 +1,21 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
+typedef unsigned int uint;
+typedef unsigned short int ushort;
typedef int v2i __attribute__((ext_vector_type(2)));
+typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
+typedef unsigned int __attribute__((ext_vector_type(3))) uint3;
+typedef __bf16 __attribute__((ext_vector_type(8))) bfloat8;
+typedef __bf16 __attribute__((ext_vector_type(16))) bfloat16;
+typedef __bf16 __attribute__((ext_vector_type(32))) bfloat32;
+typedef half __attribute__((ext_vector_type(8))) half8;
+typedef half __attribute__((ext_vector_type(16))) half16;
+typedef half __attribute__((ext_vector_type(32))) half32;
+typedef float __attribute__((ext_vector_type(8))) float8;
+typedef float __attribute__((ext_vector_type(16))) float16;
+typedef float __attribute__((ext_vector_type(32))) float32;
+
typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));
@@ -29,6 +43,32 @@ void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
}
+void test_cvt_scale_pk(global half8 *outh8, global bfloat8 *outy8, uint2 src2,
+ global float32 *outf32, global half16 *outh16, global bfloat16 *outy16,
+ global float16 *outf16, uint3 src3,
+ global float8 *outf8, uint src1, uint scale, uint scale_sel)
+{
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_fp8' must be a constant integer}}
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_fp8' must be a constant integer}}
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_bf8' must be a constant integer}}
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_bf8' must be a constant integer}}
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f16_fp4' must be a constant integer}}
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_bf16_fp4' must be a constant integer}}
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp8' must be a constant integer}}
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_bf8' must be a constant integer}}
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, scale_sel); // expected-error {{'__builtin_amdgcn_cvt_scale_pk8_f32_fp4' must be a constant integer}}
+
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outh8 = __builtin_amdgcn_cvt_scale_pk8_f16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outy8 = __builtin_amdgcn_cvt_scale_pk8_bf16_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_bf8(src2, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+ *outf8 = __builtin_amdgcn_cvt_scale_pk8_f32_fp4(src1, scale, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global v4i* b128gaddr, int *b32faddr, v2i* b64faddr, v4i *b128faddr,
global int* b32out, global v2i* b64out, global v4i* b128out, int cpol)
{
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a58e26c7d2224..7265a76294c4c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -633,18 +633,33 @@ def int_amdgcn_cvt_sr_bf8_f16 : DefaultAttrsIntrinsic<
[IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
>, ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f16">;
-class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
- [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+// llvm.amdgcn.cvt.scale.pk32.f16.bf6 v32f16 vdst, v6i32 src0, i32 scale_sel [0..7]
+class AMDGPUCvtScaleIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
+ [DstTy], [Src0Ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
-class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
- [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+class AMDGPUCvtScaleF32Intrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
+ [DstTy], [Src0Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
class AMDGPUCvtScaleF32SRIntrinsic<LLVMType DstTy, LLVMType Src0Ty, string name> : DefaultAttrsIntrinsic<
[DstTy], [Src0Ty, llvm_i32_ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>, ClangBuiltin<"__builtin_amdgcn_"#name>;
+def int_amdgcn_cvt_scale_pk8_f16_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_v2i32_ty, "cvt_scale_pk8_f16_fp8">;
+def int_amdgcn_cvt_scale_pk8_bf16_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_v2i32_ty, "cvt_scale_pk8_bf16_fp8">;
+def int_amdgcn_cvt_scale_pk8_f16_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_v2i32_ty, "cvt_scale_pk8_f16_bf8">;
+def int_amdgcn_cvt_scale_pk8_bf16_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_v2i32_ty, "cvt_scale_pk8_bf16_bf8">;
+def int_amdgcn_cvt_scale_pk8_f16_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f16_ty, llvm_i32_ty, "cvt_scale_pk8_f16_fp4">;
+def int_amdgcn_cvt_scale_pk8_bf16_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8bf16_ty, llvm_i32_ty, "cvt_scale_pk8_bf16_fp4">;
+def int_amdgcn_cvt_scale_pk8_f32_fp8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_fp8">;
+def int_amdgcn_cvt_scale_pk8_f32_bf8 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_v2i32_ty, "cvt_scale_pk8_f32_bf8">;
+def int_amdgcn_cvt_scale_pk8_f32_fp4 : AMDGPUCvtScaleIntrinsic<llvm_v8f32_ty, llvm_i32_ty, "cvt_scale_pk8_f32_fp4">;
+
+class AMDGPUCvtScaleF32ToFP6BF6Intrinsic<LLVMType DstTy, LLVMType Src0Ty, LLVMType Src1Ty, string name> : DefaultAttrsIntrinsic<
+ [DstTy], [Src0Ty, Src1Ty, llvm_float_ty], [IntrNoMem, IntrSpeculatable]
+>, ClangBuiltin<"__builtin_amdgcn_"#name>;
+
def int_amdgcn_cvt_scalef32_pk32_fp6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_fp6_f16">;
def int_amdgcn_cvt_scalef32_pk32_bf6_f16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32f16_ty, "cvt_scalef32_pk32_bf6_f16">;
def int_amdgcn_cvt_scalef32_pk32_fp6_bf16 : AMDGPUCvtScaleF32Intrinsic<llvm_v6i32_ty, llvm_v32bf16_ty, "cvt_scalef32_pk32_fp6_bf16">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0f1d7edad12f7..c8e45d47c3660 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4582,6 +4582,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_pk_bf8_f16:
case Intrinsic::amdgcn_cvt_sr_fp8_f16:
case Intrinsic::amdgcn_cvt_sr_bf8_f16:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f16_bf8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_bf16_bf8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f16_fp4:
+ case Intrinsic::amdgcn_cvt_scale_pk8_bf16_fp4:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f32_bf8:
+ case Intrinsic::amdgcn_cvt_scale_pk8_f32_fp4:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 700701d503853..a83caa0db8a69 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -180,6 +180,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
ImmTyMatrixBFMT,
ImmTyMatrixAReuse,
ImmTyMatrixBReuse,
+ ImmTyScaleSel,
ImmTyByteSel,
};
@@ -1184,6 +1185,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
case ImmTyMatrixBFMT: OS << "ImmTyMatrixBFMT"; break;
case ImmTyMatrixAReuse: OS << "ImmTyMatrixAReuse"; break;
case ImmTyMatrixBReuse: OS << "ImmTyMatrixBReuse"; break;
+ case ImmTyScaleSel: OS << "ScaleSel" ; break;
case ImmTyByteSel: OS << "ByteSel" ; break;
}
// clang-format on
@@ -9366,6 +9368,10 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}
}
+ if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::scale_sel))
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyScaleSel);
+
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClamp);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 15088ac25863f..42c4d8b8a9717 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1793,4 +1793,14 @@ void AMDGPUInstPrinter::printBitOp3(const MCInst *MI, unsigned OpNo,
O << formatHex(static_cast<uint64_t>(Imm));
}
+void AMDGPUInstPrinter::printScaleSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ uint8_t Imm = MI->getOperand(OpNo).getImm();
+ if (!Imm)
+ return;
+
+ O << " scale_sel:" << formatDec(Imm);
+}
+
#include "AMDGPUGenAsmWriter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index e0b7aa5799e62..f6739b14926e1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -173,6 +173,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
const MCSubtargetInfo &STI, raw_ostream &O,
StringRef Prefix, bool PrintInHex, bool AlwaysPrint);
+ void printScaleSel(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printBitOp3(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index c5931fcd5d909..a3e20baa9e298 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1313,6 +1313,10 @@ def MatrixBFMT : CustomOperand<i32, 1, "MatrixBFMT">;
def MatrixAReuse : NamedBitOperand<"matrix_a_reuse">;
def MatrixBReuse : NamedBitOperand<"matrix_b_reuse">;
+def ScaleSel : NamedIntOperand<"scale_sel"> {
+ let Validator = "isUInt<3>";
+}
+
class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_KIMM"#vt.Size;
@@ -2944,6 +2948,13 @@ def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
def VOP_F16_F32_I32 : VOPProfile<[f16, f32, i32, untyped]>;
def VOP_I32_BF16_I32_F32 : VOPProfile<[i32, bf16, i32, f32]>;
def VOP_I32_F16_I32_F32 : VOPProfile<[i32, f16, i32, f32]>;
+def VOP_V8F16_V2I32_I32 : VOPProfile<[v8f16, v2i32, i32, untyped]>;
+def VOP_V8BF16_V2I32_I32 : VOPProfile<[v8bf16, v2i32, i32, untyped]>;
+def VOP_V8F16_I32_I32 : VOPProfile<[v8f16, i32, i32, untyped]>;
+def VOP_V8BF16_I32_I32 : VOPProfile<[v8bf16, i32, i32, untyped]>;
+def VOP_V16F32_V3I32_I32 : VOPProfile<[v16f32, v3i32, i32, untyped]>;
+def VOP_V8F32_V2I32_I32 : VOPProfile<[v8f32, v2i32, i32, untyped]>;
+def VOP_V8F32_I32_I32 : VOPProfile<[v8f32, i32, i32, untyped]>;
def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 7e922abd695c0..1ffe39dc5cba5 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1675,6 +1675,23 @@ let SubtargetPredicate = HasBF16ConversionInsts in {
(V_CVT_PK_BF16_F32_e64 $src0_modifiers, $src0, 0, (f32 (IMPLICIT_DEF)))>;
}
+class VOP3_CVT_SCALE_PK_F16_F864_Profile<VOPProfile P> : VOP3_CVT_SCALEF32_PK_F864_Profile<P> {
+ let Src0RC64 = getVOP3VRegSrcForVT<Src0VT>.ret;
+ let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasModifiers, HasSrc2Mods,
+ HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
+ (ins ScaleSel:$scale_sel));
+ let Asm64 = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
+ HasOpSel, HasOMod, IsVOP3P, HasNeg, HasSrc0Mods, HasSrc1Mods,
+ HasSrc2Mods, DstVT>.ret # "$scale_sel";
+}
+
+multiclass VOP3CvtScaleSelInst<string OpName, VOPProfile P, SDPatternOperator node> {
+ def _e64 : VOP3InstBase<OpName, VOP3_CVT_SCALE_PK_F16_F864_Profile<P>> {
+ let Pattern = [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0)), i32:$src1, i32:$scale_sel))];
+ }
+}
+
let Src0RC64 = VSrc_NoInline_v2f16 in {
def VOP3_CVT_PK_F8_F16_Profile : VOP3_Profile<VOP_I16_V2F16>;
def VOP3_CVT_PK_F8_F16_True16_Profile : VOP3_Profile_True16<VOP3_CVT_PK_F8_F16_Profile>;
@@ -1712,6 +1729,19 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SR_BF8_F16 : VOP3Inst_t16_with_profiles<"v_cvt_sr_bf8_f16", VOP3_CVT_SR_F8_F16_Profile,
VOP3_CVT_SR_F8_F16_True16_Profile, VOP3_CVT_SR_F8_F16_Fake16_Profile>;
}
+
+ let Constraints = "@earlyclobber $vdst" in {
+ defm V_CVT_SCALE_PK8_F16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp8>;
+ defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp8>;
+ defm V_CVT_SCALE_PK8_F16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_bf8", VOP_V8F16_V2I32_I32, int_amdgcn_cvt_scale_pk8_f16_bf8>;
+ defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_bf8", VOP_V8BF16_V2I32_I32, int_amdgcn_cvt_scale_pk8_bf16_bf8>;
+ defm V_CVT_SCALE_PK8_F32_FP8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp8>;
+ defm V_CVT_SCALE_PK8_F32_BF8 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_bf8", VOP_V8F32_V2I32_I32, int_amdgcn_cvt_scale_pk8_f32_bf8>;
+ } // End Constraints = "@earlyclobber $vdst"
+
+ defm V_CVT_SCALE_PK8_F16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f16_fp4", VOP_V8F16_I32_I32, int_amdgcn_cvt_scale_pk8_f16_fp4>;
+ defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_bf16_fp4", VOP_V8BF16_I32_I32, int_amdgcn_cvt_scale_pk8_bf16_fp4>;
+ defm V_CVT_SCALE_PK8_F32_FP4 : VOP3CvtScaleSelInst<"v_cvt_scale_pk8_f32_fp4", VOP_V8F32_I32_I32, int_amdgcn_cvt_scale_pk8_f32_fp4>;
} // End ReadsModeReg = 0
let True16Predicate = UseRealTrue16Insts in {
@@ -2120,6 +2150,15 @@ let AssemblerPredicate = isGFX11Plus in {
defm V_LSHL_ADD_U64 : VOP3Only_Realtriple_gfx1250<0x252>;
defm V_ASHR_PK_I8_I32 : VOP3Only_Realtriple_gfx1250<0x290>;
defm V_ASHR_PK_U8_I32 : VOP3Only_Realtriple_gfx1250<0x291>;
+defm V_CVT_SCALE_PK8_F16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x29f>;
+defm V_CVT_SCALE_PK8_BF16_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a0>;
+defm V_CVT_SCALE_PK8_F32_FP4 : VOP3Only_ScaleSel_Real_gfx1250<0x2a1>;
+defm V_CVT_SCALE_PK8_F16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a8>;
+defm V_CVT_SCALE_PK8_BF16_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2a9>;
+defm V_CVT_SCALE_PK8_F32_FP8 : VOP3Only_ScaleSel_Real_gfx1250<0x2aa>;
+defm V_CVT_SCALE_PK8_F16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ab>;
+defm V_CVT_SCALE_PK8_BF16_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ac>;
+defm V_CVT_SCALE_PK8_F32_BF8 : VOP3Only_ScaleSel_Real_gfx1250<0x2ad>;
defm V_CVT_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36d>;
defm V_CVT_SR_PK_BF16_F32 : VOP3Only_Realtriple_gfx1250<0x36e>;
defm V_CVT_PK_F16_F32 : VOP3Only_Realtriple_gfx1250<0x36f>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 0858b0475eb07..f027ab05c546c 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -414,6 +414,13 @@ class VOP3a_BITOP3_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> {
let Inst{14} = !if(p.HasOpSel, src0_modifiers{3}, 0);
}
+class VOP3a_ScaleSel_gfx1250<bits<10> op, VOPProfile p> : VOP3e_gfx11_gfx12<op, p> {
+ bits<3> scale_sel;
+
+ let Inst{13-11} = scale_sel;
+ let Inst{14} = 0;
+}
+
class VOP3Interp_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<6> attr;
bits<2> attrchan;
@@ -2010,6 +2017,13 @@ multiclass VOP3_BITOP3_Real_Base<GFXGen Gen, bits<10> op, string asmName> {
}
}
+multiclass VOP3Only_ScaleSel_Real_gfx1250<bits<10> op> {
+ defvar ps = !cast<VOP_Pseudo>(NAME#"_e64");
+ def _e64_gfx1250 :
+ VOP3_Real_Gen<ps, GFX1250Gen>,
+ VOP3a_ScaleSel_gfx1250<op, ps.Pfl>;
+}
+
multiclass VOP3Only_Realtriple_t16_gfx11_gfx12_not_gfx1250<bits<10> op, string asmName, string opName = NAME,
string pseudo_mnemonic = "", bit isSingle = 0> :
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll
new file mode 100644
index 0000000000000..4309cfbe1b124
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
+declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
+declare <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
+declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
+declare <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 %src, i32 %scale, i32 %scale_sel)
+declare <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 %src, i32 %scale, i32 %scale_sel)
+declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
+declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %src, i32 %scale, i32 %scale_sel)
+declare <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %src, i32 %scale, i32 %scale_sel)
+
+define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_fp8_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
+; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1
+; GFX1250-SDAG-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_fp8_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1
+; GFX1250-GISEL-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp8(<2 x i32> %src, i32 %scale, i32 1)
+ store <8 x half> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_bf8_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
+; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2
+; GFX1250-SDAG-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_bf8_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2
+; GFX1250-GISEL-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.bf8(<2 x i32> %src, i32 %scale, i32 0)
+ store <8 x half> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp8_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
+; GFX1250-NEXT: v_cvt_scale_pk8_bf16_fp8 v[4:7], v[0:1], v2 scale_sel:1
+; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp8(<2 x i32> %src, i32 %scale, i32 1)
+ store <8 x bfloat> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_bf16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_cvt_scale_pk8_bf16_bf8_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
+; GFX1250-NEXT: v_cvt_scale_pk8_bf16_bf8 v[4:7], v[0:1], v2 scale_sel:2
+; GFX1250-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.bf8(<2 x i32> %src, i32 %scale, i32 2)
+ store <8 x bfloat> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_f16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_cvt_scale_pk8_f16_fp4_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scale_pk8_f16_fp4 v[4:7], v0, v1 scale_sel:3
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <8 x half> @llvm.amdgcn.cvt.scale.pk8.f16.fp4(i32 %src, i32 %scale, i32 3)
+ store <8 x half> %cvt, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp4_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scale_pk8_bf16_fp4 v[4:7], v0, v1 scale_sel:4
+; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <8 x bfloat> @llvm.amdgcn.cvt.scale.pk8.bf16.fp4(i32 %src, i32 %scale, i32 4)
+ store <8 x bfloat> %cvt, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp8_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3
+; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:7
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp8_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4
+; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:7
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp8(<2 x i32> %src, i32 %scale, i32 7)
+ store <8 x float> %cvt, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_bf8_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3
+; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_bf8_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4
+; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.bf8(<2 x i32> %src, i32 %scale, i32 0)
+ store <8 x float> %cvt, ptr addrspace(1) %out, align 16
+ ret void
+}
+
+define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp4_vv:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX1250-SDAG-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp4_vv:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX1250-GISEL-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <8 x float> @llvm.amdgcn.cvt.scale.pk8.f32.fp4(i32 %src, i32 %scale, i32 1)
+ store <8 x float> %cvt, ptr addrspace(1) %out, align 32
+ ret void
+}
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
index d73e214f1bedf..1f40a3249ff94 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s
@@ -685,3 +685,84 @@ v_cvt_sr_bf8_f32 v10, s2, v5
v_cvt_sr_bf8_f32 v5, -|v255|, v4
// GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5
+// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6
+// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7
+// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1
+// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8
+// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00
+// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2
+// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8
+// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00
+// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3
+// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6
+// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7
+// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8
+// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00
+// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1
+// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
index 33b003b4377c8..03f642d8ef33b 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s
@@ -685,3 +685,84 @@ v_cvt_sr_bf8_f32 v10, s2, v5
v_cvt_sr_bf8_f32 v5, -|v255|, v4
// GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5
+// GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6
+// GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7
+// GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1
+// GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8
+// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00
+// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2
+// GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8
+// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00
+// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3
+// GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6
+// GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8
+// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00
+// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7
+// GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8
+// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00]
+
+v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00
+// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1
+// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s
index 301cfdd217b9f..c5bd00c004a43 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_err.s
@@ -117,7 +117,47 @@ v_cvt_sr_fp8_f16 v1, v2, v3 mul:2
// GFX125X-ERR-NEXT:{{^}}v_cvt_sr_fp8_f16 v1, v2, v3 mul:2
// GFX125X-ERR-NEXT:{{^}} ^
+v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid scale_sel value.
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:8
+// GFX125X-ERR-NEXT:{{^}} ^
+
v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:4
// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid byte_sel value.
// GFX125X-ERR-NEXT:{{^}}v_cvt_sr_bf8_f16 v1, v2, v3 byte_sel:4
// GFX125X-ERR-NEXT:{{^}} ^
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], s[20:21], v8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f16_fp8 v[10:13], s[20:21], v8
+// GFX125X-ERR-NEXT:{{^}} ^
+
+v_cvt_scale_pk8_f16_fp8 v[10:13], 1, v8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f16_fp8 v[10:13], 1, v8
+// GFX125X-ERR-NEXT:{{^}} ^
+
+v_cvt_scale_pk8_bf16_fp8 v[10:13], s[20:21], v8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_bf16_fp8 v[10:13], s[20:21], v8
+// GFX125X-ERR-NEXT:{{^}} ^
+
+v_cvt_scale_pk8_f32_fp8 v[10:17], s[20:21], v8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp8 v[10:17], s[20:21], v8
+// GFX125X-ERR-NEXT:{{^}} ^
+
+v_cvt_scale_pk8_f16_fp4 v[10:13], s20, v8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f16_fp4 v[10:13], s20, v8
+// GFX125X-ERR-NEXT:{{^}} ^
+
+v_cvt_scale_pk8_bf16_fp4 v[10:13], s20, v8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_bf16_fp4 v[10:13], s20, v8
+// GFX125X-ERR-NEXT:{{^}} ^
+
+v_cvt_scale_pk8_f32_fp4 v[10:17], s20, v8
+// GFX125X-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+// GFX125X-ERR-NEXT:{{^}}v_cvt_scale_pk8_f32_fp4 v[10:17], s20, v8
+// GFX125X-ERR-NEXT:{{^}} ^
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
index cf6a999d645be..ce8cfcbc1e987 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt
@@ -736,3 +736,87 @@
0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20
# GFX1250: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20]
+
+0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xac,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_bf8 v[10:13], v[20:21], v8 scale_sel:1 ; encoding: [0x0a,0x08,0xac,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa9,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xab,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f16_bf8 v[10:13], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xab,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 ; encoding: [0x0a,0x00,0xa8,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f16_fp8 v[10:13], v[20:21], v8 scale_sel:5 ; encoding: [0x0a,0x28,0xa8,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_fp8 v[10:13], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xa9,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0xa0,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_bf16_fp4 v[10:13], v20, v8 scale_sel:2 ; encoding: [0x0a,0x10,0xa0,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, 0xcf00 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 ; encoding: [0x0a,0x00,0x9f,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f16_fp4 v[10:13], v20, v8 scale_sel:3 ; encoding: [0x0a,0x18,0x9f,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xad,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f32_bf8 v[10:17], v[20:21], v8 scale_sel:7 ; encoding: [0x0a,0x38,0xad,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], 0xcf00 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 ; encoding: [0x0a,0x00,0xaa,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f32_fp8 v[10:17], v[20:21], v8 scale_sel:6 ; encoding: [0x0a,0x30,0xaa,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0x11,0x02,0x00]
+
+0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00
+# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00 ; encoding: [0x0a,0x00,0xa1,0xd6,0x14,0xff,0x01,0x00,0x00,0xcf,0x00,0x00]
+
+0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00
+# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00]
More information about the llvm-commits
mailing list