[llvm] 7680951 - [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 7.
Ivan Kosarev via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 15 02:45:59 PDT 2023
Author: Ivan Kosarev
Date: 2023-06-15T10:40:58+01:00
New Revision: 7680951ac8a29b60e8b0c190fc003a2a38334b85
URL: https://github.com/llvm/llvm-project/commit/7680951ac8a29b60e8b0c190fc003a2a38334b85
DIFF: https://github.com/llvm/llvm-project/commit/7680951ac8a29b60e8b0c190fc003a2a38334b85.diff
LOG: [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 7.
Reviewed By: Joe_Nash
Differential Revision: https://reviews.llvm.org/D152808
Added:
Modified:
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
llvm/test/CodeGen/AMDGPU/mad-mix.ll
llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
llvm/test/CodeGen/AMDGPU/select.f16.ll
llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 41da34cdb39fb..8ddfc8fd42181 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -1,13 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %src1, half %src2) #0 {
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -51,6 +60,16 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %src1, half %src2) #0 {
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v3, 0x3c00
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -99,6 +118,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src1, half %src2, half %lo) #0 {
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -147,6 +175,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
}
define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 {
+; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
; SDAG-GFX9: ; %bb.0:
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -172,6 +209,16 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
; GISEL-GFX9: ; %bb.0:
; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -202,6 +249,15 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
}
define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 {
+; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; SDAG-GFX11: ; %bb.0:
+; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
; SDAG-GFX9: ; %bb.0:
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -227,6 +283,16 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
+; GISEL-GFX11: ; %bb.0:
+; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
+; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
; GISEL-GFX9: ; %bb.0:
; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -270,6 +336,16 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 {
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -316,6 +392,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half %src0, half %src1, half %src2) #0 {
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -369,6 +452,16 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
}
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use(half %src0, half %src1, half %src2) #0 {
+; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX11-NEXT: v_fma_mixhi_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -445,5 +538,3 @@ declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone speculatable }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 1d7b1ed7a10a9..6077d779c0152 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -1,15 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL-CI %s
define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1100-LABEL: mixlo_simple:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: mixlo_simple:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -49,6 +58,13 @@ define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -97,6 +113,13 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -142,6 +165,13 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -197,6 +227,15 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,6 +288,16 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src
; operation only clobbers relevant lane.
define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_v2f32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_v2f32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -343,6 +392,17 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half
}
define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_v3f32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: v_mov_b32_e32 v0, v6
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v3f32:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -481,6 +541,19 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
}
define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_v4f32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v4f32:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -655,6 +728,16 @@ define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half
; FIXME (DAG): Fold clamp
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -768,6 +851,20 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; FIXME (GIsel): V_PK_MAX clamp could be folded into mixlo
define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -843,6 +940,19 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -934,6 +1044,19 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
}
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1115,6 +1238,16 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; a build_vector to select the mixhi. Issue is more specifically with how insert_vector_elt is being
; legalized (bitwise ops instead of shuffle/build_vector for instance).
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1170,6 +1303,20 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v4, v3
+; GISEL-GFX1100-NEXT: v_max_f16_e64 v3, v3, v3 clamp
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v3
+; GISEL-GFX1100-NEXT: v_and_or_b32 v0, 0xffff0000, v4, v0
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1249,6 +1396,16 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
}
define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1304,6 +1461,20 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v1 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v4, v0, v1, v2 op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v4, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v3
+; GISEL-GFX1100-NEXT: v_and_or_b32 v0, 0xffff, v4, v0
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1390,6 +1561,19 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; FIXME (DAG): Should be able to use mixlo/mixhi
define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v3
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v1
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1449,6 +1633,19 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_clamp_precvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v3
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v1, v0
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_v2f32_clamp_precvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1512,6 +1709,21 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
; FIXME (DAG): Handling undef 4th component
define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v2
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1589,6 +1801,21 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v3f32_clamp_precvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v2, v0
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_v3f32_clamp_precvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1664,6 +1891,25 @@ define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %sr
}
define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v6
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v7
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v0, v3
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, v2
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1762,6 +2008,25 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_precvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v3, v6
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-GFX1100-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_pack_b32_f16 v0, v3, v0
+; GISEL-GFX1100-NEXT: v_pack_b32_f16 v1, v2, v1
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_precvt:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1880,5 +2145,3 @@ declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
attributes #1 = { nounwind readnone speculatable }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index b3b8807fea05f..62b0db7745b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -1,15 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,SDAG-GFX1100 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,SDAG-GFX900 %s
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,SDAG-GFX906 %s
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,SDAG-CI %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1100,GISEL-GFX1100 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX900,GISEL-GFX900 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX906,GISEL-GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,GISEL-VI %s
; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=CI,GISEL-CI %s
define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -53,6 +62,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2
}
define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_int:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -102,6 +118,13 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_int(i32 %src0, i32 %src1, i32 %src
}
define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16hi_f16hi_f16hi_elt:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -148,6 +171,16 @@ define float @v_mad_mix_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %
}
define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_v2f32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -246,6 +279,16 @@ define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x hal
}
define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_v2f32_shuffle:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_v2f32_shuffle:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -322,6 +365,13 @@ define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x half> %src0, <2 x half> %src1,
}
define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -375,6 +425,13 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s
}
define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -419,6 +476,13 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s
}
define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,6 +528,13 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half
}
define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -504,6 +575,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2)
}
define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, -v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -545,6 +623,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr
}
define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, |v2| op_sel_hi:[1,1,0]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -586,6 +671,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr
}
define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, -|v2| op_sel_hi:[1,1,0]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -632,6 +724,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float
; inline immediate.
define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: s_mov_b32 s0, 1.0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -660,6 +761,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 1.0
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -688,6 +798,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
}
define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0.15915494
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -716,6 +835,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0
; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 0.15915494
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -751,6 +879,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0
; f32 1/2pi = 0x3e22f983
define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0x3e230000
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -779,6 +916,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1)
; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 0x3e230000
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -819,6 +965,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1)
define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
+; SDAG-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; SDAG-GFX1100-NEXT: s_mov_b32 s0, 0x367c0000
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -847,6 +1002,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GISEL-GFX1100-NEXT: v_mov_b32_e32 v2, 0x367c0000
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
; GISEL-GFX900: ; %bb.0:
; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -886,6 +1050,17 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
}
define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
+; GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: s_mov_b32 s0, 1.0
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX1100-NEXT: v_mov_b32_e32 v0, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -976,6 +1151,17 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
}
define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
+; GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: s_mov_b32 s0, 0x3e230000
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX1100-NEXT: v_mov_b32_e32 v0, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1071,6 +1257,17 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
}
define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
+; GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: s_mov_b32 s0, 0.15915494
+; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1100-NEXT: v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GFX1100-NEXT: v_mov_b32_e32 v0, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1164,6 +1361,13 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
}
define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1212,6 +1416,13 @@ define float @v_mad_mix_clamp_f32_f16hi_f16hi_f16hi_elt(<2 x half> %src0, <2 x h
}
define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
+; GFX1100-LABEL: no_mix_simple:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: no_mix_simple:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1240,6 +1451,13 @@ define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
}
define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
+; GFX1100-LABEL: no_mix_simple_fabs:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_f32 v0, |v0|, v1, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: no_mix_simple_fabs:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1273,6 +1491,13 @@ define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
; v_mad_mix_f32 flushes.
define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %src1, half %src2) #1 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1320,6 +1545,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr
}
define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, float %src2) #1 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1363,6 +1595,18 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl
}
define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, half %src2) #1 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX1100-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1418,6 +1662,17 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0,
}
define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half %src1, float %src2) #1 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX1100-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX1100-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1468,6 +1723,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half
}
define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1512,6 +1774,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, hal
}
define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src1, float %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,0]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1553,6 +1822,13 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src
}
define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1612,6 +1888,16 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1
; Make sure we don't fold pre-cvt fneg if we already have a fabs
define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1100-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1666,6 +1952,13 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
}
define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1715,6 +2008,13 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
}
define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1775,6 +2075,13 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
}
define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1835,6 +2142,13 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
}
define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half %src1, half %src2) #0 {
+; GFX1100-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX1100-NEXT: v_fma_mix_f32 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
; GFX900-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index 95bc04bbe4e68..88d5d0ff9da8b 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -1,9 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SAFE %s
; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-SAFE %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE %s
; RUN: llc -march=amdgcn -mcpu=hawaii -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=CI,CI-NSZ %s
; RUN: llc -march=amdgcn -mcpu=fiji -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=VI,VI-NSZ %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ %s
define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_fabs_fabs_f16:
@@ -27,6 +29,16 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_add_f16_e64 v0, |v0|, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fabs_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fabs.y = call half @llvm.fabs.f16(half %y)
@@ -61,6 +73,17 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h
; VI-NEXT: v_add_f16_e64 v0, |v0|, v4
; VI-NEXT: v_add_f16_e64 v1, |v1|, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_multi_use_lhs_fabs_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e64 v1, |v1|, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fabs.y = call half @llvm.fabs.f16(half %y)
@@ -97,6 +120,17 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half
; VI-NEXT: v_add_f16_e64 v0, |v0|, v3
; VI-NEXT: v_mov_b32_e32 v1, v4
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fabs.y = call half @llvm.fabs.f16(half %y)
@@ -133,6 +167,17 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h
; VI-NEXT: v_add_f16_e64 v0, |v0|, v3
; VI-NEXT: v_add_f16_e64 v1, |v2|, v4
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_multi_use_rhs_fabs_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e64 v1, |v2|, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fabs.y = call half @llvm.fabs.f16(half %y)
@@ -167,6 +212,17 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fabs_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%select = select i1 %cmp, half %fabs.x, half %y
@@ -196,6 +252,17 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fabs_negk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs = call half @llvm.fabs.f16(half %x)
%select = select i1 %cmp, half %fabs, half -1.0
@@ -224,6 +291,17 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_add_f16_e64 v0, |v0|, v1
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fabs_negk_negk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
+; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, half -2.0, half -1.0
%fabs = call half @llvm.fabs.f16(half %select)
@@ -251,6 +329,17 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_posk_posk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0x4000
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, half 2.0, half 1.0
%add = fadd half %select, %x
@@ -279,6 +368,17 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_negk_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs = call half @llvm.fabs.f16(half %x)
%select = select i1 %cmp, half -1.0, half %fabs
@@ -309,6 +409,17 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_negliteralk_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs = call half @llvm.fabs.f16(half %x)
%select = select i1 %cmp, half -1024.0, half %fabs
@@ -337,6 +448,16 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_add_f16_e64 v0, |v0|, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fabs_posk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs = call half @llvm.fabs.f16(half %x)
%select = select i1 %cmp, half %fabs, half 1.0
@@ -365,6 +486,16 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_add_f16_e64 v0, |v0|, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_posk_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs = call half @llvm.fabs.f16(half %x)
%select = select i1 %cmp, half 1.0, half %fabs
@@ -394,6 +525,16 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fneg_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%fneg.y = fneg half %y
@@ -428,6 +569,17 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: v_sub_f16_e32 v1, v4, v1
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_sub_f16_e32 v1, v4, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%fneg.y = fneg half %y
@@ -464,6 +616,17 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: v_mov_b32_e32 v1, v4
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%fneg.y = fneg half %y
@@ -500,6 +663,17 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: v_sub_f16_e32 v1, v4, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_sub_f16_e32 v1, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%fneg.y = fneg half %y
@@ -534,6 +708,17 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fneg_var_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%select = select i1 %cmp, half %fneg.x, half %y
@@ -562,6 +747,16 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fneg_negk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%select = select i1 %cmp, half %fneg.x, half -1.0
@@ -591,6 +786,16 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fneg_inv2pi_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%select = select i1 %cmp, half %fneg.x, half 0xH3118
@@ -620,6 +825,16 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fneg_neginv2pi_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%select = select i1 %cmp, half %fneg.x, half 0xHB118
@@ -647,6 +862,17 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_negk_negk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, half -2.0, half -1.0
%add = fadd half %select, %x
@@ -675,6 +901,17 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_negliteralk_negliteralk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0xe800
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, half -2048.0, half -4096.0
%add = fadd half %select, %x
@@ -701,6 +938,17 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_sub_f16_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fneg_negk_negk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
+; GFX11-NEXT: v_sub_f16_e32 v0, v1, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, half -2.0, half -1.0
%fneg.x = fneg half %select
@@ -729,6 +977,16 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_negk_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%select = select i1 %cmp, half -1.0, half %fneg.x
@@ -757,6 +1015,16 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fneg_posk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%select = select i1 %cmp, half %fneg.x, half 1.0
@@ -785,6 +1053,16 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_posk_fneg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fneg half %x
%select = select i1 %cmp, half 1.0, half %fneg.x
@@ -816,6 +1094,18 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_negfabs_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fneg.fabs.x = fsub half -0.000000e+00, %fabs.x
@@ -849,6 +1139,18 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fabs_negfabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: v_or_b32_e32 v2, 0x8000, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fabs.y = call half @llvm.fabs.f16(half %y)
@@ -882,6 +1184,18 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_neg_fabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fsub half -0.000000e+00, %x
%fabs.y = call half @llvm.fabs.f16(half %y)
@@ -914,6 +1228,18 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_add_f16_e32 v0, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_fabs_neg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_add_f16_e32 v0, v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fneg.y = fsub half -0.000000e+00, %y
@@ -945,6 +1271,17 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_neg_negfabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fneg.x = fsub half -0.000000e+00, %x
%fabs.y = call half @llvm.fabs.f16(half %y)
@@ -977,6 +1314,17 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) {
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: add_select_negfabs_neg_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fneg.fabs.x = fsub half -0.000000e+00, %fabs.x
@@ -1008,6 +1356,17 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_select_negfabs_posk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fneg.fabs.x = fsub half -0.000000e+00, %fabs.x
@@ -1038,6 +1397,17 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_select_posk_negfabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fneg.fabs.x = fsub half -0.000000e+00, %fabs.x
@@ -1068,6 +1438,17 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_select_negfabs_negk_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fneg.fabs.x = fsub half -0.000000e+00, %fabs.x
@@ -1098,6 +1479,17 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) {
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; VI-NEXT: v_mul_f16_e32 v0, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: mul_select_negk_negfabs_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fabs.x = call half @llvm.fabs.f16(half %x)
%fneg.fabs.x = fsub half -0.000000e+00, %fabs.x
@@ -1131,6 +1523,17 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-SAFE-LABEL: select_fneg_posk_src_add_f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_add_f16_e32 v1, 4.0, v1
+; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
; CI-NSZ-LABEL: select_fneg_posk_src_add_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1149,6 +1552,16 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-LABEL: select_fneg_posk_src_add_f16:
+; GFX11-NSZ: ; %bb.0:
+; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, -4.0, v1
+; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%add = fadd half %x, 4.0
%fneg = fneg half %add
@@ -1177,6 +1590,17 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-SAFE-LABEL: select_fneg_posk_src_sub_f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1
+; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1195,6 +1619,16 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_f16:
+; GFX11-NSZ: ; %bb.0:
+; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1
+; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%add = fsub half %x, 4.0
%fneg = fneg half %add
@@ -1221,6 +1655,16 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) {
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: select_fneg_posk_src_mul_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%mul = fmul half %x, 4.0
%fneg = fneg half %mul
@@ -1251,6 +1695,17 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-SAFE-LABEL: select_fneg_posk_src_fma_f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1
+; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1271,6 +1726,16 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_f16:
+; GFX11-NSZ: ; %bb.0:
+; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2
+; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fma = call half @llvm.fma.f16(half %x, half 4.0, half %z)
%fneg = fneg half %fma
@@ -1302,6 +1767,17 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
+; GFX11-SAFE-LABEL: select_fneg_posk_src_fmad_f16:
+; GFX11-SAFE: ; %bb.0:
+; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1
+; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2
+; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1323,6 +1799,16 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
+; GFX11-NSZ: ; %bb.0:
+; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2
+; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo
+; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%fmad = call half @llvm.fmuladd.f16(half %x, half 4.0, half %z)
%fneg = fneg half %fmad
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index be01fa8ab1e63..396aad42c9bd2 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
+; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
define amdgpu_kernel void @select_f16(
; SI-LABEL: select_f16:
@@ -79,6 +80,44 @@ define amdgpu_kernel void @select_f16(
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s14, -1
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s14
+; GFX11-NEXT: s_mov_b32 s19, s15
+; GFX11-NEXT: s_mov_b32 s22, s14
+; GFX11-NEXT: s_mov_b32 s23, s15
+; GFX11-NEXT: s_mov_b32 s26, s14
+; GFX11-NEXT: s_mov_b32 s27, s15
+; GFX11-NEXT: s_mov_b32 s2, s14
+; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s16, s6
+; GFX11-NEXT: s_mov_b32 s17, s7
+; GFX11-NEXT: s_mov_b32 s20, s8
+; GFX11-NEXT: s_mov_b32 s21, s9
+; GFX11-NEXT: s_mov_b32 s24, s10
+; GFX11-NEXT: s_mov_b32 s25, s11
+; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s4
+; GFX11-NEXT: s_mov_b32 s13, s5
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
+; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
@@ -161,6 +200,38 @@ define amdgpu_kernel void @select_f16_imm_a(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_f16_imm_a:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b,
ptr addrspace(1) %c,
@@ -241,6 +312,38 @@ define amdgpu_kernel void @select_f16_imm_b(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_f16_imm_b:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %c,
@@ -322,6 +425,38 @@ define amdgpu_kernel void @select_f16_imm_c(
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_f16_imm_c:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
@@ -403,6 +538,38 @@ define amdgpu_kernel void @select_f16_imm_d(
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_f16_imm_d:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
@@ -515,6 +682,52 @@ define amdgpu_kernel void @select_v2f16(
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_v2f16:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s14, -1
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, s14
+; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_mov_b32 s22, s14
+; GFX11-NEXT: s_mov_b32 s23, s15
+; GFX11-NEXT: s_mov_b32 s18, s14
+; GFX11-NEXT: s_mov_b32 s19, s15
+; GFX11-NEXT: s_mov_b32 s26, s14
+; GFX11-NEXT: s_mov_b32 s27, s15
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s20, s8
+; GFX11-NEXT: s_mov_b32 s21, s9
+; GFX11-NEXT: s_mov_b32 s16, s6
+; GFX11-NEXT: s_mov_b32 s17, s7
+; GFX11-NEXT: s_mov_b32 s24, s10
+; GFX11-NEXT: s_mov_b32 s25, s11
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0
+; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0
+; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0
+; GFX11-NEXT: s_mov_b32 s12, s4
+; GFX11-NEXT: s_mov_b32 s13, s5
+; GFX11-NEXT: s_waitcnt vmcnt(3)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[12:15], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
@@ -616,6 +829,47 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_v2f16_imm_a:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
+; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b,
ptr addrspace(1) %c,
@@ -715,6 +969,47 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_v2f16_imm_b:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
+; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %c,
@@ -816,6 +1111,47 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_v2f16_imm_c:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
+; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
@@ -917,6 +1253,47 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
+;
+; GFX11-LABEL: select_v2f16_imm_d:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s10, -1
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s10
+; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_mov_b32 s14, s10
+; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_mov_b32 s22, s10
+; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s16, s4
+; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s2
+; GFX11-NEXT: s_mov_b32 s13, s3
+; GFX11-NEXT: s_mov_b32 s20, s6
+; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
+; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
+; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
+; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_waitcnt vmcnt(2)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
+; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 226facf7ba3a6..b71bef7d66800 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
@@ -42,6 +43,29 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uint_to_fp_i64_to_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s4, s4, 32
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = uitofp i64 %in to half
store half %result, ptr addrspace(1) %out
ret void
@@ -98,6 +122,31 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uint_to_fp_i64_to_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v3, 32, v3
+; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%out.gep = getelementptr half, ptr addrspace(1) %out, i32 %tid
@@ -143,6 +192,27 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uint_to_fp_i64_to_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s4, s4, 32
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = uitofp i64 %in to float
store float %result, ptr addrspace(1) %out
ret void
@@ -197,6 +267,29 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uint_to_fp_i64_to_f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v3, 32, v3
+; GFX11-NEXT: v_lshlrev_b64 v[1:2], v3, v[1:2]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, 32, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
@@ -258,6 +351,34 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clz_i32_u32 s2, s7
+; GFX11-NEXT: s_clz_i32_u32 s3, s5
+; GFX11-NEXT: s_min_u32 s8, s2, 32
+; GFX11-NEXT: s_min_u32 s9, s3, 32
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
+; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s4, s4, 1
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_or_b32 s3, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
+; GFX11-NEXT: s_sub_i32 s2, 32, s8
+; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = uitofp <2 x i64> %in to <2 x float>
store <2 x float> %result, ptr addrspace(1) %out
ret void
@@ -367,6 +488,56 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_ldexp_f32 v2, v4, v12
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
+; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8
+; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6
+; GFX11-NEXT: v_min_u32_e32 v9, 32, v9
+; GFX11-NEXT: v_min_u32_e32 v10, 32, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_u32_e32 v11, 32, v11
+; GFX11-NEXT: v_min_u32_e32 v12, 32, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4]
+; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
+; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10
+; GFX11-NEXT: v_min_u32_e32 v3, 1, v3
+; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
+; GFX11-NEXT: v_min_u32_e32 v7, 1, v7
+; GFX11-NEXT: v_min_u32_e32 v5, 1, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v8, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12
+; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_u32_e32 v6, v2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 4, v0
+; GFX11-NEXT: v_ldexp_f32 v3, v3, v9
+; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
+; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
+; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
+; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
%out.gep = getelementptr <4 x float>, ptr addrspace(1) %out, i32 %tid
@@ -435,6 +606,39 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: s_uint_to_fp_v2i64_to_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clz_i32_u32 s2, s7
+; GFX11-NEXT: s_clz_i32_u32 s3, s5
+; GFX11-NEXT: s_min_u32 s8, s2, 32
+; GFX11-NEXT: s_min_u32 s9, s3, 32
+; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
+; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s4, s4, 1
+; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_or_b32 s3, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3
+; GFX11-NEXT: s_sub_i32 s2, 32, s8
+; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%result = uitofp <2 x i64> %in to <2 x half>
store <2 x half> %result, ptr addrspace(1) %out
ret void
@@ -558,6 +762,65 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
+; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_clz_i32_u32_e32 v11, v8
+; GFX11-NEXT: v_clz_i32_u32_e32 v12, v6
+; GFX11-NEXT: v_min_u32_e32 v9, 32, v9
+; GFX11-NEXT: v_min_u32_e32 v10, 32, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_u32_e32 v11, 32, v11
+; GFX11-NEXT: v_min_u32_e32 v12, 32, v12
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[3:4], v9, v[3:4]
+; GFX11-NEXT: v_lshlrev_b64 v[1:2], v10, v[1:2]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b64 v[7:8], v11, v[7:8]
+; GFX11-NEXT: v_lshlrev_b64 v[5:6], v12, v[5:6]
+; GFX11-NEXT: v_sub_nc_u32_e32 v9, 32, v9
+; GFX11-NEXT: v_sub_nc_u32_e32 v10, 32, v10
+; GFX11-NEXT: v_min_u32_e32 v3, 1, v3
+; GFX11-NEXT: v_min_u32_e32 v1, 1, v1
+; GFX11-NEXT: v_min_u32_e32 v7, 1, v7
+; GFX11-NEXT: v_min_u32_e32 v5, 1, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v11, 32, v11
+; GFX11-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX11-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_or_b32_e32 v2, v8, v7
+; GFX11-NEXT: v_or_b32_e32 v4, v6, v5
+; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v12
+; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_ldexp_f32 v3, v3, v9
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_ldexp_f32 v2, v2, v11
+; GFX11-NEXT: v_ldexp_f32 v4, v4, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
%out.gep = getelementptr <4 x half>, ptr addrspace(1) %out, i32 %tid
More information about the llvm-commits mailing list