[llvm-branch-commits] [clang] [llvm] [AMDGPU] v_cvt_scalef32_sr_pk16_* gfx1250 instructions (PR #151810)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Sat Aug 2 02:14:14 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
---
Patch is 34.66 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/151810.diff
10 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+6)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl (+42)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+6)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+3)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+12)
- (added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll (+232)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3-fake16.s (+36)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3.s (+36)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3.txt (+36)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 9125315310306..ced758c814105 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -746,6 +746,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_bf8_f32, "V2UiV8fUif", "nc",
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32, "UiV8fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16, "UiV8hUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16, "UiV8yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16, "V3UiV16yUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16, "V3UiV16hUif", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32, "V3UiV16fUif", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_fp8_f32_e5m3, "iffiIb", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e50ab77f48c79..4ff0571239e71 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -929,6 +929,42 @@ void test_cvt_scalef32_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float
// CHECK-NEXT: [[TMP43:%.*]] = call i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> [[TMP40]], i32 [[TMP41]], float [[TMP42]])
// CHECK-NEXT: [[TMP44:%.*]] = load ptr addrspace(1), ptr [[OUT1_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP43]], ptr addrspace(1) [[TMP44]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP46:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP47:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP48:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> [[TMP45]], i32 [[TMP46]], float [[TMP47]])
+// CHECK-NEXT: [[TMP49:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP48]], ptr addrspace(1) [[TMP49]], align 16
+// CHECK-NEXT: [[TMP50:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP52:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP53:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> [[TMP50]], i32 [[TMP51]], float [[TMP52]])
+// CHECK-NEXT: [[TMP54:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP53]], ptr addrspace(1) [[TMP54]], align 16
+// CHECK-NEXT: [[TMP55:%.*]] = load <16 x bfloat>, ptr [[SRCBF16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP57:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP58:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> [[TMP55]], i32 [[TMP56]], float [[TMP57]])
+// CHECK-NEXT: [[TMP59:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP58]], ptr addrspace(1) [[TMP59]], align 16
+// CHECK-NEXT: [[TMP60:%.*]] = load <16 x half>, ptr [[SRCH16_ADDR_ASCAST]], align 32
+// CHECK-NEXT: [[TMP61:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP62:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP63:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> [[TMP60]], i32 [[TMP61]], float [[TMP62]])
+// CHECK-NEXT: [[TMP64:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP63]], ptr addrspace(1) [[TMP64]], align 16
+// CHECK-NEXT: [[TMP65:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP67:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP68:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> [[TMP65]], i32 [[TMP66]], float [[TMP67]])
+// CHECK-NEXT: [[TMP69:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP68]], ptr addrspace(1) [[TMP69]], align 16
+// CHECK-NEXT: [[TMP70:%.*]] = load <16 x float>, ptr [[SRCF16_ADDR_ASCAST]], align 64
+// CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[SR_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP72:%.*]] = load float, ptr [[SCALE_ADDR_ASCAST]], align 4
+// CHECK-NEXT: [[TMP73:%.*]] = call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> [[TMP70]], i32 [[TMP71]], float [[TMP72]])
+// CHECK-NEXT: [[TMP74:%.*]] = load ptr addrspace(1), ptr [[OUT3_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store <3 x i32> [[TMP73]], ptr addrspace(1) [[TMP74]], align 16
// CHECK-NEXT: ret void
//
void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, float8 srcf8,
@@ -944,6 +980,12 @@ void test_cvt_scalef32_sr_pk(global uint2 *out2, bfloat8 srcbf8, half8 srch8, fl
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f32(srcf8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_f16(srch8, sr, scale);
*out1 = __builtin_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16(srcbf8, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16(srcbf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f16(srch16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_bf6_f32(srcf16, sr, scale);
+ *out3 = __builtin_amdgcn_cvt_scalef32_sr_pk16_fp6_f32(srcf16, sr, scale);
}
// CHECK-LABEL: @test_sat_pk4_i4_i8(
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index af06fe7a09d7e..469bdb409aaff 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -701,6 +701,12 @@ def int_amdgcn_cvt_scalef32_sr_pk8_bf8_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f32_ty, "cvt_scalef32_sr_pk8_fp4_f32">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8f16_ty, "cvt_scalef32_sr_pk8_fp4_f16">;
def int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_i32_ty, llvm_v8bf16_ty, "cvt_scalef32_sr_pk8_fp4_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_fp6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f32_ty, "cvt_scalef32_sr_pk16_bf6_f32">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_fp6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16f16_ty, "cvt_scalef32_sr_pk16_bf6_f16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_fp6_bf16">;
+def int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16 : AMDGPUCvtScaleF32SRIntrinsic<llvm_v3i32_ty, llvm_v16bf16_ty, "cvt_scalef32_sr_pk16_bf6_bf16">;
def int_amdgcn_cvt_scalef32_2xpk16_fp6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_fp6_f32">;
def int_amdgcn_cvt_scalef32_2xpk16_bf6_f32 : AMDGPUCvtScaleF32ToFP6BF6Intrinsic<llvm_v6i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, "cvt_scalef32_2xpk16_bf6_f32">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index ddb1e1081da8a..d11e5a3c4e3cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4633,6 +4633,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f32:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_f16:
case Intrinsic::amdgcn_cvt_scalef32_sr_pk8_fp4_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f32:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_f16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_fp6_bf16:
+ case Intrinsic::amdgcn_cvt_scalef32_sr_pk16_bf6_bf16:
case Intrinsic::amdgcn_sat_pk4_i4_i8:
case Intrinsic::amdgcn_sat_pk4_u4_u8:
case Intrinsic::amdgcn_fmed3:
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 350a31885e629..4698a5805ee0c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2973,6 +2973,9 @@ def VOP_I32_F32_I32_F32 : VOPProfile<[i32, f32, i32, f32]>;
def VOP_V6I32_V32BF16_I32_F32 : VOPProfile<[v6i32, v32bf16, i32, f32]>;
def VOP_V6I32_V32F16_I32_F32 : VOPProfile<[v6i32, v32f16, i32, f32]>;
def VOP_V6I32_V32F32_I32_F32 : VOPProfile<[v6i32, v32f32, i32, f32]>;
+def VOP_V3I32_V16F16_I32_F32 : VOPProfile<[v3i32, v16f16, i32, f32]>;
+def VOP_V3I32_V16BF16_I32_F32 : VOPProfile<[v3i32, v16bf16, i32, f32]>;
+def VOP_V3I32_V16F32_I32_F32 : VOPProfile<[v3i32, v16f32, i32, f32]>;
def VOP_V2I32_V8BF16_I32_F32 : VOPProfile<[v2i32, v8bf16, i32, f32]>;
def VOP_V2I32_V8F16_I32_F32 : VOPProfile<[v2i32, v8f16, i32, f32]>;
def VOP_V2I32_V8F32_I32_F32 : VOPProfile<[v2i32, v8f32, i32, f32]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 100bb2de9abc7..f4b6af647ca1a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1820,6 +1820,12 @@ let SubtargetPredicate = isGFX1250Plus in {
defm V_CVT_SCALEF32_SR_PK8_FP4_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_f16>;
defm V_CVT_SCALEF32_SR_PK8_FP4_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk8_fp4_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_I32_V8BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk8_fp4_bf16>;
} // End WaveSizePredicate = isWave32
+ defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_bf6_f32>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_bf16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16BF16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_bf16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f16", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F16_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f16>;
+ defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Inst<"v_cvt_scalef32_sr_pk16_fp6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V3I32_V16F32_I32_F32>, int_amdgcn_cvt_scalef32_sr_pk16_fp6_f32>;
} // End Constraints = "@earlyclobber $vdst"
let True16Predicate = UseRealTrue16Insts in {
@@ -2272,6 +2278,12 @@ defm V_CVT_SCALEF32_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2cf>;
defm V_CVT_SCALEF32_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d0>;
defm V_CVT_SCALEF32_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d1>;
defm V_CVT_SCALEF32_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d2>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F32 : VOP3Only_Real_Base_gfx1250<0x2d3>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F32 : VOP3Only_Real_Base_gfx1250<0x2d4>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_F16 : VOP3Only_Real_Base_gfx1250<0x2d5>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_F16 : VOP3Only_Real_Base_gfx1250<0x2d6>;
+defm V_CVT_SCALEF32_SR_PK16_FP6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d7>;
+defm V_CVT_SCALEF32_SR_PK16_BF6_BF16 : VOP3Only_Real_Base_gfx1250<0x2d8>;
defm V_CVT_SCALEF32_SR_PK8_FP4_F32 : VOP3Only_Real_Base_gfx1250<0x297>;
defm V_CVT_SCALEF32_SR_PK8_FP8_F32 : VOP3Only_Real_Base_gfx1250<0x298>;
defm V_CVT_SCALEF32_SR_PK8_BF8_F32 : VOP3Only_Real_Base_gfx1250<0x299>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
new file mode 100644
index 0000000000000..c4395182d6719
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
@@ -0,0 +1,232 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f32(<16 x float> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i32 %sr, float %scale)
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_cvt_scalef32_sr_pk16_bf6_f16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-GISEL-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.bf6.f16(<16 x half> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_sl:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v9, s7
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_bf16 v[10:12], v[2:9], s8, 0x42c80000
+; GFX1250-NEXT: global_store_b96 v[0:1], v[10:12], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.bf16(<16 x bfloat> %src, i32 %sr, float 100.0)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f16_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: v_cvt_scalef32_sr_pk16_fp6_f16 v[12:14], v[0:7], v8, v9
+; GFX1250-NEXT: global_store_b96 v[10:11], v[12:14], off
+; GFX1250-NEXT: s_endpgm
+ %cvt = tail call <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f16(<16 x half> %src, i32 %sr, float %scale)
+ store <3 x i32> %cvt, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
+; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f16_sl:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v2...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/151810
More information about the llvm-branch-commits
mailing list