[llvm-branch-commits] [clang] [llvm] AMDGPU: Builtins & Codegen support for: v_cvt_scalef32_[f16|f32]_[bf8|fp8] (PR #117739)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Nov 26 08:59:34 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
OPSEL[1:0] collectively decide which byte to read
from src input.
Builtin takes additional imm argument which
represents index (with valid values:[0:3]) of src
byte read. Out of bounds checks will added in next
patch.
OPSEL ASM Syntax: opsel:[x,y,z]
where,
opsel[x] = Inst{11} = src0_modifier{2}
opsel[y] = Inst{12} = src1_modifier{2}
opsel[z] = Inst{14} = src0_modifier{3}
Note: Inst{13} i.e. OPSEL[2] is ignored in
asm syntax and opsel[z] is meaningless
for v_cvt_scalef32_f32_{fp|bf}8
Co-authored-by: Pravin Jagtap <Pravin.Jagtap@<!-- -->amd.com>
---
Patch is 55.22 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/117739.diff
13 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+4)
- (modified) clang/test/CodeGenOpenCL/amdgpu-features.cl (+1-1)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl (+6-1)
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl (+235)
- (modified) clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl (+9-1)
- (modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+26)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+30)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+16)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+1)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+62-2)
- (modified) llvm/lib/TargetParser/TargetParser.cpp (+2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll (+268-2)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 49304d12d6d70d..829a936537575b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -570,6 +570,10 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16, "V6UiV32hf", "nc", "f
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16, "V6UiV32hf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16, "V6UiV32yf", "nc", "f16bf16-to-fp6bf6-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_fp8, "V2hV2hifIiIb", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f16_bf8, "V2hV2hifIiIb", "nc", "bf8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_fp8, "fifIi", "nc", "fp8-cvt-scale-insts")
+TARGET_BUILTIN(__builtin_amdgcn_cvt_scalef32_f32_bf8, "fifIi", "nc", "bf8-cvt-scale-insts")
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index f739872685e780..a036acb150926f 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
index 5b75ee417e545b..cc21c119ec14de 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl
@@ -12,9 +12,14 @@
typedef unsigned int uint;
typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
+typedef half __attribute__((ext_vector_type(2))) half2;
-void test(global uint* out, global uint2* out_v2u32, uint a, uint b) {
+void test(global uint* out, global uint2* out_v2u32, uint a, uint b, global half2* out_v2f16, global float* out_f32, float scale) {
*out = __builtin_amdgcn_prng_b32(a); // expected-error{{'__builtin_amdgcn_prng_b32' needs target feature prng-inst}}
*out_v2u32 = __builtin_amdgcn_permlane16_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane16_swap' needs target feature permlane16-swap}}
*out_v2u32 = __builtin_amdgcn_permlane32_swap(a, b, false, false); // expected-error{{'__builtin_amdgcn_permlane32_swap' needs target feature permlane32-swap}}
+ *out_v2f16 = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out_v2f16, a, scale, 0, false); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f16_fp8' needs target feature fp8-cvt-scale-insts}}
+ *out_f32 = __builtin_amdgcn_cvt_scalef32_f32_fp8(a, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f32_fp8' needs target feature fp8-cvt-scale-insts}}
+ *out_v2f16 = __builtin_amdgcn_cvt_scalef32_f16_bf8(*out_v2f16, a, scale, 0, false); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f16_bf8' needs target feature bf8-cvt-scale-insts}}
+ *out_f32 = __builtin_amdgcn_cvt_scalef32_f32_bf8(a, scale, 0); // expected-error{{'__builtin_amdgcn_cvt_scalef32_f32_bf8' needs target feature bf8-cvt-scale-insts}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index d2125e90bc2c89..57e2568a813920 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -2,6 +2,8 @@
// RUN: %clang_cc1 -cl-std=CL1.2 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -emit-llvm -o - %s | FileCheck %s
// REQUIRES: amdgpu-registered-target
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
typedef unsigned int uint;
typedef unsigned int __attribute__((ext_vector_type(2))) uint2;
typedef unsigned int __attribute__((ext_vector_type(6))) uint6;
@@ -10,6 +12,7 @@ typedef half __attribute__((ext_vector_type(32))) half32;
typedef short __attribute__((ext_vector_type(2))) short2;
typedef __bf16 __attribute__((ext_vector_type(2))) bfloat2;
typedef float __attribute__((ext_vector_type(16))) float16;
+typedef half __attribute__((ext_vector_type(2))) half2;
// CHECK-LABEL: @test_prng_b32(
// CHECK-NEXT: entry:
@@ -262,3 +265,235 @@ void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2
void builtins_amdgcn_dl_dot2c(global float *out, float fC, bfloat2 v2ssA, bfloat2 v2ssB) {
*out = __builtin_amdgcn_fdot2c_f32_bf16(v2ssA, v2ssB, fC, false);
}
+
+// CHECK-LABEL: @test_cvt_scalef32_f16_fp8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
+// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP19]], i32 [[TMP20]], float [[TMP21]], i32 3, i1 false)
+// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP22]], ptr addrspace(1) [[TMP23]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP24]], align 4
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP28:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP25]], i32 [[TMP26]], float [[TMP27]], i32 0, i1 true)
+// CHECK-NEXT: [[TMP29:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP28]], ptr addrspace(1) [[TMP29]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP31:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP30]], align 4
+// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP33:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP34:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP31]], i32 [[TMP32]], float [[TMP33]], i32 1, i1 true)
+// CHECK-NEXT: [[TMP35:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP34]], ptr addrspace(1) [[TMP35]], align 4
+// CHECK-NEXT: [[TMP36:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP37:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP36]], align 4
+// CHECK-NEXT: [[TMP38:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP39:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP40:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP37]], i32 [[TMP38]], float [[TMP39]], i32 2, i1 true)
+// CHECK-NEXT: [[TMP41:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP40]], ptr addrspace(1) [[TMP41]], align 4
+// CHECK-NEXT: [[TMP42:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP43:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP42]], align 4
+// CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP45:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP46:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.fp8(<2 x half> [[TMP43]], i32 [[TMP44]], float [[TMP45]], i32 3, i1 true)
+// CHECK-NEXT: [[TMP47:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP46]], ptr addrspace(1) [[TMP47]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_f16_fp8(global half2* out, uint src, float scale)
+{
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, false);
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, false);
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, false);
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, false);
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 0, true);
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 1, true);
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 2, true);
+ *out = __builtin_amdgcn_cvt_scalef32_f16_fp8(*out, src, scale, 3, true);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_f32_fp8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP0]], float [[TMP1]], i32 0)
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store float [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP4]], float [[TMP5]], i32 1)
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store float [[TMP6]], ptr addrspace(1) [[TMP7]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP8]], float [[TMP9]], i32 2)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store float [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = call float @llvm.amdgcn.cvt.scalef32.f32.fp8(i32 [[TMP12]], float [[TMP13]], i32 3)
+// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store float [[TMP14]], ptr addrspace(1) [[TMP15]], align 4
+// CHECK-NEXT: ret void
+//
+void test_cvt_scalef32_f32_fp8(global float* out, uint src, float scale)
+{
+ *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 0);
+ *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 1);
+ *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 2);
+ *out = __builtin_amdgcn_cvt_scalef32_f32_fp8(src, scale, 3);
+}
+
+// CHECK-LABEL: @test_cvt_scalef32_f16_bf8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store i32 [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP1]], i32 [[TMP2]], float [[TMP3]], i32 0, i1 false)
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP7:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP10:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP7]], i32 [[TMP8]], float [[TMP9]], i32 1, i1 false)
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP13:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP12]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16.bf8(<2 x half> [[TMP13]], i32 [[TMP14]], float [[TMP15]], i32 2, i1 false)
+// CHECK-NEXT: [[TMP17:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store <2 x half> [[TMP16]], ptr addrspace(1) [[TMP17]], align 4
+// CHECK-NEXT: [[TMP18:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: [[TMP19:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP18]], align 4
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(5) [[SRC_ADDR]], align 4
+// CHECK-NEXT: [[TMP21:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = call <2 x half> @llvm.amdgcn.cvt.scalef32.f16....
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/117739
More information about the llvm-branch-commits
mailing list