[llvm-branch-commits] [llvm] AMDGPU: Add support for V_CVT_PK_F16_F32 instruction for gfx950 (PR #118300)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Dec 2 06:30:33 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
@llvm/pr-subscribers-clang-codegen
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
Co-authored-by: Shilei Tian <shilei.tian@<!-- -->amd.com>
---
Patch is 27.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118300.diff
8 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+11-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+7-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h (+3)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+4)
- (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+18)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll (+313)
- (modified) llvm/test/MC/AMDGPU/gfx950_asm_features.s (+24)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt (+18)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5978f5b0bbae5f..ebe6f0965deb97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -432,6 +432,12 @@ def FeatureAshrPkInsts : SubtargetFeature<"ashr-pk-insts",
"Has Arithmetic Shift Pack instructions"
>;
+def FeatureCvtPkF16F32Inst : SubtargetFeature<"cvt-pk-f16-f32-inst",
+ "HasCvtPkF16F32Inst",
+ "true",
+ "Has cvt_pk_f16_f32 instruction"
+>;
+
def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
"GFX950Insts",
"true",
@@ -445,8 +451,9 @@ def FeatureGFX950Insts : SubtargetFeature<"gfx950-insts",
FeatureFP6BF6ConversionScaleInsts,
FeatureF16BF16ToFP6BF6ConversionScaleInsts,
FeatureF32ToF16BF16ConversionSRInsts,
+ FeatureCvtPkF16F32Inst,
FeatureMinimum3Maximum3F32,
- FeatureMinimum3Maximum3PKF16
+ FeatureMinimum3Maximum3PKF16,
]
>;
@@ -2510,6 +2517,9 @@ def HasFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasFP6BF6ConversionSca
def HasF16BF16ToFP6BF6ConversionScaleInsts : Predicate<"Subtarget->hasF16BF16ToFP6BF6ConversionScaleInsts()">,
AssemblerPredicate<(all_of FeatureF16BF16ToFP6BF6ConversionScaleInsts)>;
+def HasCvtPkF16F32Inst : Predicate<"Subtarget->hasCvtPkF16F32Inst()">,
+ AssemblerPredicate<(all_of FeatureCvtPkF16F32Inst)>;
+
def HasF32ToF16BF16ConversionSRInsts : Predicate<"Subtarget->hasF32ToF16BF16ConversionSRInsts()">,
AssemblerPredicate<(all_of FeatureF32ToF16BF16ConversionSRInsts)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9bf1f281c32a09..2e66f7525b9ccf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1040,10 +1040,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
}
- getActionDefinitionsBuilder(G_FPTRUNC)
- .legalFor({{S32, S64}, {S16, S32}})
- .scalarize(0)
- .lower();
+ auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
+ if (ST.hasCvtPkF16F32Inst())
+ FPTruncActions.legalFor(
+ {{S32, S64}, {S16, S32}, {V2S16, V2S32}, {V2S16, V2S64}});
+ else
+ FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
+ FPTruncActions.scalarize(0).lower();
getActionDefinitionsBuilder(G_FPEXT)
.legalFor({{S64, S32}, {S32, S16}})
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index c5c951b58b8d6d..7701fef5365841 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -55,6 +55,7 @@ class AMDGPUSubtarget {
bool HasFP4ConversionScaleInsts = false;
bool HasFP6BF6ConversionScaleInsts = false;
bool HasF16BF16ToFP6BF6ConversionScaleInsts = false;
+ bool HasCvtPkF16F32Inst = false;
bool HasF32ToF16BF16ConversionSRInsts = false;
bool EnableRealTrue16Insts = false;
bool HasBF16ConversionInsts = false;
@@ -191,6 +192,8 @@ class AMDGPUSubtarget {
bool hasF16BF16ToFP6BF6ConversionScaleInsts() const { return HasF16BF16ToFP6BF6ConversionScaleInsts; }
+ bool hasCvtPkF16F32Inst() const { return HasCvtPkF16F32Inst; }
+
bool hasF32ToF16BF16ConversionSRInsts() const {
return HasF32ToF16BF16ConversionSRInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a212a9218ca0db..70230b5abc5171 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -902,6 +902,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Legal);
}
+ if (Subtarget->hasCvtPkF16F32Inst()) {
+ setOperationAction(ISD::FP_ROUND, MVT::v2f16, Legal);
+ }
+
setTargetDAGCombine({ISD::ADD,
ISD::UADDO_CARRY,
ISD::SUB,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 1160975f3302a9..33050b718a484e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1143,6 +1143,21 @@ let SubtargetPredicate = HasGFX950Insts, mayRaiseFPException = 0 in {
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3Inst<"v_cvt_scalef32_2xpk16_bf6_f32", VOP3_CVT_SCALEF32_PK_F864_Profile<VOP_V6I32_V16F32_V16F32_F32>, int_amdgcn_cvt_scalef32_2xpk16_bf6_f32>;
}
+let SubtargetPredicate = HasCvtPkF16F32Inst in {
+ let ReadsModeReg = 0 in {
+ defm V_CVT_PK_F16_F32 : VOP3Inst<"v_cvt_pk_f16_f32", VOP3_Profile<VOP_V2F16_F32_F32>>;
+ }
+
+ def : GCNPat<(v2f16 (fpround v2f32:$src)),
+ (V_CVT_PK_F16_F32_e64 0, (EXTRACT_SUBREG VReg_64:$src, sub0), 0, (EXTRACT_SUBREG VReg_64:$src, sub1))>;
+ def : GCNPat<(v2f16 (fpround v2f64:$src)),
+ (V_CVT_PK_F16_F32_e64 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub0_sub1)),
+ 0, (V_CVT_F32_F64_e64 0, (EXTRACT_SUBREG VReg_128:$src, sub2_sub3)))>;
+ def : GCNPat<(v2f16 (build_vector (f16 (fpround (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+ (f16 (fpround (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
+ (V_CVT_PK_F16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
+}
+
class Cvt_Scale_FP4FP8BF8ToF16F32_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType DstTy> : GCNPat<
(DstTy (node i32:$src0, f32:$src1, timm:$index)),
(inst (SrcAndDstSelToOpSelXForm_0_0 $index), $src0, (SrcAndDstSelToOpSelXForm_1_0 $index), $src1)
@@ -2270,6 +2285,9 @@ defm V_CVT_SR_BF16_F32: VOP3OpSel_Real_gfx9 <0x2a7>;
defm V_ASHR_PK_I8_I32 : VOP3OpSel_Real_gfx9 <0x265>;
defm V_ASHR_PK_U8_I32 : VOP3OpSel_Real_gfx9 <0x266>;
+let OtherPredicates = [HasCvtPkF16F32Inst] in {
+defm V_CVT_PK_F16_F32 : VOP3_Real_gfx9<0x267, "v_cvt_pk_f16_f32">;
+}
defm V_CVT_SCALEF32_2XPK16_FP6_F32 : VOP3_Real_gfx9<0x252, "v_cvt_scalef32_2xpk16_fp6_f32">;
defm V_CVT_SCALEF32_2XPK16_BF6_F32 : VOP3_Real_gfx9<0x253, "v_cvt_scalef32_2xpk16_bf6_f32">;
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 0005f1179a5d28..59767eb9811e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -5,6 +5,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL %s
@@ -99,6 +101,36 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fptrunc_f32_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX950-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_f32_to_f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -234,6 +266,38 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fptrunc_f64_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX950-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX950-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
; GFX11-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -382,6 +446,37 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX950-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX950-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
+; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -543,6 +638,42 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX950-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX950-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
+; GFX950-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v2
+; GFX950-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
+; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v2
+; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -685,6 +816,36 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX950-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -815,6 +976,36 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX950-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -945,6 +1136,36 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX950-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX950-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GFX950-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX950-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
+; GFX950-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
+; GFX950-GISEL: ; %bb.0: ; %entry
+; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX950-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX950-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX950-GISEL-NEXT: s_endpgm
+;
; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
@@ -1076,6 +1297,36 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
+; GFX950-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
+; GFX950-SDAG: ; %bb.0: ; %entry
+; GFX950-SDAG-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GFX950-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX950-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX950-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX950-SDAG-NEXT: s...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/118300
More information about the llvm-branch-commits
mailing list