[llvm] enable true16 for more codegen test (PR #131206)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 13 13:22:25 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/131206
From 28dc7ae297f0294c75885825ac78a06d4a950130 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 13 Mar 2025 16:05:04 -0400
Subject: [PATCH] enable true16 for more codegen test
---
llvm/test/CodeGen/AMDGPU/abs_i16.ll | 62 +-
llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 66 +-
llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 205 +-
.../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 446 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 167 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 1092 +++--
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 851 ++--
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 457 +-
llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 278 +-
llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 278 +-
llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 2694 +++++++----
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 4158 +++++++++++++----
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 182 +-
llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 190 +-
llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 177 +-
llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll | 1486 ++++--
llvm/test/CodeGen/AMDGPU/imm16.ll | 782 +++-
llvm/test/CodeGen/AMDGPU/immv216.ll | 2234 ++++++++-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 1284 +++--
19 files changed, 12781 insertions(+), 4308 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/abs_i16.ll b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
index 0ae2b4f549919..7633ba0eb4f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/abs_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/abs_i16.ll
@@ -4,8 +4,10 @@
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
define i16 @abs_i16(i16 %arg) {
; GFX6-LABEL: abs_i16:
@@ -45,25 +47,45 @@ define i16 @abs_i16(i16 %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: abs_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_i16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: abs_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: abs_i16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_max_i16 v0, v0, v1
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: abs_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_max_i16 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: abs_i16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_sub_nc_u16 v0.h, 0, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v0.h
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: abs_i16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_sub_nc_u16 v1, 0, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_max_i16 v0, v0, v1
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index f94ec392ee55c..f6d3be1ee17e0 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: Need to handle non-uniform case for function below (load without gep).
; FIXME: VI or should be unnecessary
@@ -753,26 +754,49 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX10-NEXT: global_store_dwordx4 v1, v[0:3], s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_test_add_v2i16_zext_to_v2i64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX11-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_test_add_v2i16_zext_to_v2i64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2
+; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_test_add_v2i16_zext_to_v2i64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_pk_add_u16 v0, v1, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, 0, v0, 16
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 98832aaa3bc25..b27ad26cf97b9 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-FAKE16 %s
define amdgpu_kernel void @br_cc_f16(
; SI-LABEL: br_cc_f16:
@@ -60,32 +61,62 @@ define amdgpu_kernel void @br_cc_f16(
; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: br_cc_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
-; GFX11-NEXT: s_cbranch_vccnz .LBB0_2
-; GFX11-NEXT: ; %bb.1: ; %one
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
-; GFX11-NEXT: .LBB0_2: ; %two
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: br_cc_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v2.l, v2.h
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB0_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+; GFX11-TRUE16-NEXT: .LBB0_2: ; %two
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: br_cc_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[8:9], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s3
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
+; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB0_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %one
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
+; GFX11-FAKE16-NEXT: .LBB0_2: ; %two
+; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
@@ -151,25 +182,47 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: br_cc_f16_imm_a:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0
-; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
-; GFX11-NEXT: ; %bb.1: ; %one
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
-; GFX11-NEXT: .LBB1_2: ; %two
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: br_cc_f16_imm_a:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l
+; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB1_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %one
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-TRUE16-NEXT: .LBB1_2: ; %two
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: br_cc_f16_imm_a:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s3
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0
+; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB1_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %one
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-FAKE16-NEXT: .LBB1_2: ; %two
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
@@ -235,25 +288,47 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: br_cc_f16_imm_b:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0
-; GFX11-NEXT: s_cbranch_vccz .LBB2_2
-; GFX11-NEXT: ; %bb.1: ; %two
-; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
-; GFX11-NEXT: .LBB2_2: ; %one
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: br_cc_f16_imm_b:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s3
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l
+; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_2
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %two
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-TRUE16-NEXT: .LBB2_2: ; %one
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: br_cc_f16_imm_b:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s3
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0
+; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_2
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %two
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3800
+; GFX11-FAKE16-NEXT: .LBB2_2: ; %one
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 1c9a9d16d2be7..14ddf7daad1c6 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; SI-LABEL: extract_vector_elt_v2f16:
@@ -215,19 +216,34 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: extract_vector_elt_v3f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: extract_vector_elt_v3f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: extract_vector_elt_v3f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:2
+; GFX11-FAKE16-NEXT: s_endpgm
%p0 = extractelement <3 x half> %foo, i32 0
%p1 = extractelement <3 x half> %foo, i32 2
%out1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -268,20 +284,35 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s4, s6, 4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: dynamic_extract_vector_elt_v3f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s6, 4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: dynamic_extract_vector_elt_v3f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s6, 4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT: s_endpgm
%p0 = extractelement <3 x half> %foo, i32 %idx
%out1 = getelementptr half, ptr addrspace(1) %out, i32 1
store half %p0, ptr addrspace(1) %out
@@ -635,48 +666,91 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: flat_store_short v[5:6], v0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v4, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v0, s[2:3]
-; GFX11-NEXT: s_cmp_eq_u32 s4, 1
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_cmp_eq_u32 s4, 3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 4
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 5
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 6
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT: s_cmp_eq_u32 s4, 7
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_extractelement_v8f16_dynamic_sgpr:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 4, v4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 1
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v5.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v5.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_extractelement_v8f16_dynamic_sgpr:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 4, v4
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 1
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 1, v4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -852,83 +926,161 @@ define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: flat_store_short v[9:10], v0
; VI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v8, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 5, v8
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
-; GFX11-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16
-; GFX11-NEXT: s_cmp_eq_u32 s4, 1
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 2
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v1
-; GFX11-NEXT: s_cmp_eq_u32 s4, 3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 4
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 5
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 6
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 1, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX11-NEXT: s_cmp_eq_u32 s4, 7
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v4
-; GFX11-NEXT: s_cmp_eq_u32 s4, 9
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v5
-; GFX11-NEXT: s_cmp_eq_u32 s4, 11
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 12
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX11-NEXT: s_cmp_eq_u32 s4, 13
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s4, 14
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7
-; GFX11-NEXT: s_cmp_eq_u32 s4, 15
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
-; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_extractelement_v16f16_dynamic_sgpr:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 5, v8
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 1
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 2
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v9.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 6
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 8
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 11
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v5.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v6.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 14
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v7.l, s2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_extractelement_v16f16_dynamic_sgpr:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v8, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 5, v8
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v4, s[2:3] offset:16
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 1
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 6
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 1, v8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 8
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 12
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 14
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc_lo
+; GFX11-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 65266a4e39294..365588eaec3ac 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; DAGCombiner will transform:
; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
@@ -44,17 +45,30 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fabs_free_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fabs_free_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fabs_free_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%bc= bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
store half %fabs, ptr addrspace(1) %out
@@ -97,17 +111,30 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fabs_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fabs_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fabs_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in)
store half %fabs, ptr addrspace(1) %out
ret void
@@ -262,18 +289,33 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fabs_fold_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mul_f16_e64 v1, |s2|, s3
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fabs_fold_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, v0.h
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fabs_fold_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v1, |s2|, s3
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in0)
%fmul = fmul half %fabs, %in1
store half %fmul, ptr addrspace(1) %out
@@ -624,24 +666,43 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_extract_fabs_fold_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_add_f16_e64 v1, |v1|, 2.0
-; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_extract_fabs_fold_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, |v0.l|, 4.0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.h, |v1.l|, 2.0
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_extract_fabs_fold_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, |v0|, 4.0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v1, |v1|, 2.0
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v1, off dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i32 %tid
%val = load <2 x half>, ptr addrspace(1) %gep.in
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 19deaf4a5535e..620273a360439 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
declare half @llvm.copysign.f16(half, half) #0
declare float @llvm.copysign.f32(float, float) #0
@@ -59,19 +60,34 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_copysign_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_copysign_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_copysign_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%out = call half @llvm.copysign.f16(half %mag, half %sign)
store half %out, ptr addrspace(1) %arg_out
ret void
@@ -113,17 +129,30 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_test_copysign_f16_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_test_copysign_f16_0:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_test_copysign_f16_0:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half 0.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
@@ -165,17 +194,30 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_test_copysign_f16_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_test_copysign_f16_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_test_copysign_f16_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half 1.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
@@ -217,17 +259,30 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_test_copysign_f16_10.0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_test_copysign_f16_10.0:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_test_copysign_f16_10.0:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half 10.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
@@ -269,17 +324,30 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_test_copysign_f16_neg1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_test_copysign_f16_neg1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_test_copysign_f16_neg1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half -1.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
@@ -321,17 +389,30 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_test_copysign_f16_neg10:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_test_copysign_f16_neg10:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_test_copysign_f16_neg10:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%result = call half @llvm.copysign.f16(half %mag, half -10.0)
store half %result, ptr addrspace(1) %out, align 4
ret void
@@ -680,11 +761,17 @@ define half @v_test_copysign_f16_0(half %mag) {
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_test_copysign_f16_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_test_copysign_f16_0:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_test_copysign_f16_0:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 0.0)
ret half %result
}
@@ -709,11 +796,17 @@ define half @v_test_copysign_f16_1(half %mag) {
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_test_copysign_f16_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_test_copysign_f16_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_test_copysign_f16_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 1.0)
ret half %result
}
@@ -738,11 +831,17 @@ define half @v_test_copysign_f16_10(half %mag) {
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_test_copysign_f16_10:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_test_copysign_f16_10:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_test_copysign_f16_10:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 10.0)
ret half %result
}
@@ -767,11 +866,17 @@ define half @v_test_copysign_f16_neg1(half %mag) {
; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_test_copysign_f16_neg1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_test_copysign_f16_neg1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_test_copysign_f16_neg1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half -1.0)
ret half %result
}
@@ -796,11 +901,17 @@ define half @v_test_copysign_f16_neg10(half %mag) {
; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_test_copysign_f16_neg10:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_test_copysign_f16_neg10:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_test_copysign_f16_neg10:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half -10.0)
ret half %result
}
@@ -875,25 +986,45 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
-; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[4:5]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load half, ptr addrspace(1) %arg_mag_gep
@@ -976,25 +1107,45 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v1, s[2:3]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1
-; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v1, s[4:5]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
+; GFX11-TRUE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[4:5]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v1
+; GFX11-FAKE16-NEXT: global_store_b64 v0, v[2:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load half, ptr addrspace(1) %arg_mag_gep
@@ -1075,25 +1226,45 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[6:7]
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v1, s[6:7]
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load float, ptr addrspace(1) %arg_mag_gep
@@ -1176,25 +1347,45 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v1, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 3, v1
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[6:7]
+; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v0
+; GFX11-TRUE16-NEXT: global_store_b64 v3, v[1:2], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v2, v1, s[6:7]
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
+; GFX11-FAKE16-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr double, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load double, ptr addrspace(1) %arg_mag_gep
@@ -1277,25 +1468,46 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; GFX9-NEXT: global_store_short v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[6:7]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[6:7]
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load half, ptr addrspace(1) %arg_mag_gep
@@ -1375,25 +1587,45 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v0, s[6:7]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b64 s[6:7], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[6:7]
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr half, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load half, ptr addrspace(1) %arg_mag
@@ -1478,25 +1710,46 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; GFX9-NEXT: global_store_short v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX11-NEXT: global_load_u16 v0, v0, s[4:5]
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v1, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[4:5]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v1, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[4:5]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11-FAKE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%arg_mag_gep = getelementptr float, ptr addrspace(1) %arg_mag, i32 %tid
%mag = load float, ptr addrspace(1) %arg_mag_gep
@@ -1671,64 +1924,124 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x34
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s5, s3, 0x1ff
-; GFX11-NEXT: s_lshr_b32 s6, s3, 8
-; GFX11-NEXT: s_or_b32 s2, s5, s2
-; GFX11-NEXT: s_and_b32 s5, s6, 0xffe
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
-; GFX11-NEXT: v_readfirstlane_b32 s3, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_readfirstlane_b32 s6, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_or_b32 s3, s5, s3
-; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshr_b32 s7, s5, s6
-; GFX11-NEXT: s_lshl_b32 s6, s7, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_cmp_lg_u32 s6, s5
-; GFX11-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-NEXT: s_addk_i32 s2, 0xfc10
-; GFX11-NEXT: s_or_b32 s5, s7, s5
-; GFX11-NEXT: s_lshl_b32 s6, s2, 12
-; GFX11-NEXT: s_or_b32 s6, s3, s6
-; GFX11-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-NEXT: s_cselect_b32 s5, s5, s6
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s6, s5, 7
-; GFX11-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-NEXT: s_cselect_b32 s6, 1, 0
-; GFX11-NEXT: s_lshr_b32 s5, s5, 2
-; GFX11-NEXT: s_or_b32 s6, s6, s7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s5, s5, s6
-; GFX11-NEXT: s_cmp_lt_i32 s2, 31
-; GFX11-NEXT: s_movk_i32 s6, 0x7e00
-; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-NEXT: s_cselect_b32 s2, s3, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-TRUE16-NEXT: s_lshr_b32 s6, s3, 8
+; GFX11-TRUE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-TRUE16-NEXT: s_and_b32 s5, s6, 0xffe
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-TRUE16-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s4
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s3, 0x1000
+; GFX11-TRUE16-NEXT: s_lshr_b32 s7, s5, s6
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-TRUE16-NEXT: s_or_b32 s5, s7, s5
+; GFX11-TRUE16-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s3, s6
+; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_and_b32 s6, s5, 7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-TRUE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-TRUE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, s6
+; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-TRUE16-NEXT: s_movk_i32 s6, 0x7e00
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s6, 0x7c00
+; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, s3, s5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s3, 0x1ff
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s3, 8
+; GFX11-FAKE16-NEXT: s_or_b32 s2, s5, s2
+; GFX11-FAKE16-NEXT: s_and_b32 s5, s6, 0xffe
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-FAKE16-NEXT: s_bfe_u32 s2, s3, 0xb0014
+; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s6, v1
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s3
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s3, 0x1000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s7, s5, s6
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s7, s6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s6, s5
+; GFX11-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
+; GFX11-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
+; GFX11-FAKE16-NEXT: s_or_b32 s5, s7, s5
+; GFX11-FAKE16-NEXT: s_lshl_b32 s6, s2, 12
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s3, s6
+; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 1
+; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_and_b32 s6, s5, 7
+; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-FAKE16-NEXT: s_cselect_b32 s7, 1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s5, 2
+; GFX11-FAKE16-NEXT: s_or_b32 s6, s6, s7
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_add_i32 s5, s5, s6
+; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s2, 31
+; GFX11-FAKE16-NEXT: s_movk_i32 s6, 0x7e00
+; GFX11-FAKE16-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s6, 0x7c00
+; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, s3, s5
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%mag.trunc = fptrunc double %mag to half
%result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
store half %result, ptr addrspace(1) %arg_out
@@ -1799,24 +2112,44 @@ define amdgpu_kernel void @s_copysign_v2f16(ptr addrspace(1) %arg_out, <2 x half
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_copysign_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_copysign_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_copysign_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s3
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
store <2 x half> %out, ptr addrspace(1) %arg_out
ret void
@@ -1904,28 +2237,56 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_copysign_v3f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v3, v2, s[4:5] offset:4
-; GFX11-NEXT: global_store_b32 v3, v0, s[4:5]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_copysign_v3f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v3, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5] offset:4
+; GFX11-TRUE16-NEXT: global_store_b32 v4, v1, s[4:5]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_copysign_v3f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_store_b16 v3, v2, s[4:5] offset:4
+; GFX11-FAKE16-NEXT: global_store_b32 v3, v0, s[4:5]
+; GFX11-FAKE16-NEXT: s_endpgm
%out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
store <3 x half> %out, ptr addrspace(1) %arg_out
ret void
@@ -2028,31 +2389,62 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_copysign_v4f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
-; GFX11-NEXT: s_lshr_b32 s6, s1, 16
-; GFX11-NEXT: s_lshr_b32 s0, s0, 16
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
-; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_copysign_v4f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s2
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, s1
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s3
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, s0
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, s2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v2, v3
+; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v4, v5
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fff, v6, v7
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4
+; GFX11-TRUE16-NEXT: global_store_b64 v5, v[0:1], s[4:5]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_copysign_v4f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s3
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, s2
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s1, v0
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
+; GFX11-FAKE16-NEXT: s_lshr_b32 s6, s1, 16
+; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16
+; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, s6, v2
+; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v1, v2, 16, v0
+; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v3, 16, v4
+; GFX11-FAKE16-NEXT: global_store_b64 v5, v[0:1], s[4:5]
+; GFX11-FAKE16-NEXT: s_endpgm
%out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
store <4 x half> %out, ptr addrspace(1) %arg_out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 2345f188b8857..3c820ca445a10 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -3,7 +3,8 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
; Make sure fdiv is promoted to f32.
@@ -129,39 +130,75 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fdiv_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v3
-; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fdiv_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.l
+; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v4, v6 op_sel_hi:[1,0,1]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.h, v1.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fdiv_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_fmac_f32_e32 v4, v5, v3
+; GFX11-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_div_fixup_f16 v1, v3, v2, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) #0 {
@@ -249,18 +286,31 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rcp_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rcp_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rcp_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -343,18 +393,31 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rcp_f16_abs:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rcp_f16_abs:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, |v0.l|
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rcp_f16_abs:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e64 v1, |v1|
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -440,18 +503,31 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: reciprocal_f16_rounded:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: reciprocal_f16_rounded:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: reciprocal_f16_rounded:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -521,18 +597,31 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rcp_f16_afn:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rcp_f16_afn:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rcp_f16_afn:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -615,18 +704,31 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rcp_f16_neg:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rcp_f16_neg:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, -v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rcp_f16_neg:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -712,18 +814,31 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rsq_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rsq_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rsq_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rsq_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -813,20 +928,35 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rsq_f16_neg:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rsq_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rsq_f16_neg:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rsq_f16_neg:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -921,20 +1051,35 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
; GFX10-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rsq_f16_multi_use:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rsq_f16_e32 v2, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rsq_f16_multi_use:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.h, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rsq_f16_multi_use:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v2, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1025,20 +1170,35 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rsq_f16_missing_contract0:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rsq_f16_missing_contract0:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rsq_f16_missing_contract0:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1128,20 +1288,35 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_rsq_f16_missing_contract1:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_rsq_f16_missing_contract1:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_rsq_f16_missing_contract1:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1231,20 +1406,35 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_rcp_f16_e64 v0.l, -v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1337,24 +1527,43 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fdiv_f16_afn:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fdiv_f16_afn:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fdiv_f16_afn:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1448,24 +1657,43 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX10-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fdiv_f16_unsafe:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fdiv_f16_unsafe:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[4:5] glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.h, v0.h
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fdiv_f16_unsafe:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -1528,16 +1756,27 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: div_afn_2_x_pat_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: div_afn_2_x_pat_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.5, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: div_afn_2_x_pat_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0.5, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%x = load half, ptr addrspace(1) poison
%rcp = fdiv afn half %x, 2.0
store half %rcp, ptr addrspace(1) %out, align 4
@@ -1593,16 +1832,27 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: div_afn_k_x_pat_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: div_afn_k_x_pat_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x2e66, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: div_afn_k_x_pat_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%x = load half, ptr addrspace(1) poison
%rcp = fdiv afn half %x, 10.0
store half %rcp, ptr addrspace(1) %out, align 4
@@ -1658,16 +1908,27 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX10-NEXT: global_store_short v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11-LABEL: div_afn_neg_k_x_pat_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: div_afn_neg_k_x_pat_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v[0:1], off
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0xae66, v0.l
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: div_afn_neg_k_x_pat_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: global_load_u16 v0, v[0:1], off
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0xae66, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%x = load half, ptr addrspace(1) poison
%rcp = fdiv afn half %x, -10.0
store half %rcp, ptr addrspace(1) %out, align 4
@@ -1719,13 +1980,21 @@ define half @v_fdiv_f16_arcp(half %x, half %y) {
; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fdiv_f16_arcp:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fdiv_f16_arcp:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fdiv_f16_arcp:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp half %x, %y
ret half %fdiv
}
@@ -1763,13 +2032,21 @@ define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fdiv_f16_afn_nsz:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_fdiv_f16_afn_nsz:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_fdiv_f16_afn_nsz:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv afn nsz half %x, %y
ret half %fdiv
}
@@ -1924,16 +2201,27 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_rsq_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_rsq_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_rsq_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_rsq_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_rsq_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX9-IEEE-LABEL: v_rsq_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2125,16 +2413,27 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-NEXT: v_pack_b32_f16 v0, -v0, -v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_neg_rsq_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_rsq_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_rsq_f16_e32 v1, v1
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_pack_b32_f16 v0, -v0, -v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_neg_rsq_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.h, v1.l
+; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, -v0.l, -v0.h
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: v_neg_rsq_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, -v0, -v1
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index 49d156788f66c..313ee346874b4 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -7,8 +7,10 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-SDAG-TRUE16
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-SDAG-FAKE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-GISEL-TRUE16
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12-GISEL-FAKE16
declare half @llvm.fma.f16(half, half, half)
declare half @llvm.maxnum.f16(half, half)
@@ -52,15 +54,47 @@ define half @test_fma(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_fma:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_fma:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_fma:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_fma:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fma:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half %y, half %z)
ret half %r
}
@@ -103,15 +137,45 @@ define half @test_fmac(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_fmac:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_fmac_f16_e32 v0, v1, v2
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_fmac:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_fmac:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_fmac:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l
+; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fmac:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2
+; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %y, half %z, half %x)
ret half %r
}
@@ -162,15 +226,45 @@ define half @test_fmaak(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_fmaak:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_fmaak:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
+; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_fmaak:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_fmaak:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200
+; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fmaak:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200
+; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half %y, half 0xH4200)
ret half %r
}
@@ -223,15 +317,47 @@ define half @test_fmamk(half %x, half %y, half %z) {
; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: test_fmamk:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_fmamk:
+; GFX12-SDAG-TRUE16: ; %bb.0:
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v2.l
+; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_fmamk:
+; GFX12-SDAG-FAKE16: ; %bb.0:
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_fmamk:
+; GFX12-GISEL-TRUE16: ; %bb.0:
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 0x4200, v0.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_fmamk:
+; GFX12-GISEL-FAKE16: ; %bb.0:
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2
+; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
%r = call half @llvm.fma.f16(half %x, half 0xH4200, half %z)
ret half %r
}
@@ -340,42 +466,79 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-SDAG-LABEL: test_D139469_f16:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX12-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX12-SDAG-NEXT: v_min_num_f16_e32 v0, v2, v1
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_D139469_f16:
-; GFX12-GISEL: ; %bb.0: ; %bb
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX12-GISEL-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
-; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_D139469_f16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX12-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_min_num_f16_e32 v0.l, v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_D139469_f16:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v2, v1
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_D139469_f16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e
+; GFX12-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_D139469_f16:
+; GFX12-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
+; GFX12-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1
+; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%i = fmul contract half %arg, 0xH291E
%i1 = fcmp olt half %i, 0xH0000
@@ -525,55 +688,103 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-SDAG-LABEL: test_D139469_v2f16:
-; GFX12-SDAG: ; %bb.0: ; %bb
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x211e
-; GFX12-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
-; GFX12-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_pk_min_num_f16 v0, v1, v0
-; GFX12-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
-; GFX12-SDAG-NEXT: s_wait_alu 0xfffd
-; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: test_D139469_v2f16:
-; GFX12-GISEL: ; %bb.0: ; %bb
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e
-; GFX12-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX12-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
-; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
-; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX12-GISEL-NEXT: s_or_b32 s0, s1, s2
-; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
-; GFX12-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX12-SDAG-TRUE16-LABEL: test_D139469_v2f16:
+; GFX12-SDAG-TRUE16: ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-TRUE16-NEXT: v_pk_min_num_f16 v0, v1, v0
+; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l
+; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-FAKE16-LABEL: test_D139469_v2f16:
+; GFX12-SDAG-FAKE16: ; %bb.0: ; %bb
+; GFX12-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e
+; GFX12-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1]
+; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0]
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT: v_pk_min_num_f16 v0, v1, v0
+; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX12-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-TRUE16-LABEL: test_D139469_v2f16:
+; GFX12-GISEL-TRUE16: ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX12-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v2.h
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l
+; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v0.h
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2
+; GFX12-GISEL-TRUE16-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-FAKE16-LABEL: test_D139469_v2f16:
+; GFX12-GISEL-FAKE16: ; %bb.0: ; %bb
+; GFX12-GISEL-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e
+; GFX12-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1
+; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1
+; GFX12-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3
+; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX12-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2
+; GFX12-GISEL-FAKE16-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX12-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31]
bb:
%i = fmul contract <2 x half> %arg, <half 0xH291E, half 0xH291E>
%i1 = fcmp olt <2 x half> %i, <half 0xH0000, half 0xH0000>
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index e874ee56f594c..7c49b5f1e169e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -8,8 +8,10 @@
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
; GFX9-SAFE-LABEL: test_fmax_legacy_ugt_f16:
@@ -58,18 +60,31 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_f16:
-; GFX11-NNAN: ; %bb.0:
-; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NNAN-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-TRUE16-LABEL: test_fmax_legacy_ugt_f16:
+; GFX11-NNAN-TRUE16: ; %bb.0:
+; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-FAKE16-LABEL: test_fmax_legacy_ugt_f16:
+; GFX11-NNAN-FAKE16: ; %bb.0:
+; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ugt half %a, %b
%val = select i1 %cmp, half %a, half %b
ret half %val
@@ -146,18 +161,30 @@ define <2 x half> @test_fmax_legacy_ugt_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v1, v1, v3
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v2f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v2f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v3.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v2f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v2f16:
; GFX11-NNAN: ; %bb.0:
@@ -256,20 +283,35 @@ define <3 x half> @test_fmax_legacy_ugt_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v2, v2, v5
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v3f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v3f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v1.l, v3.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v5.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v3f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v3f16:
; GFX11-NNAN: ; %bb.0:
@@ -392,26 +434,45 @@ define <4 x half> @test_fmax_legacy_ugt_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v3, v3, v7
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v4f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v4f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1.l, v3.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v0.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v5.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v7.l, v6.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s1
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v6.l, v7.l, s2
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v4f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v7, v6
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v4f16:
; GFX11-NNAN: ; %bb.0:
@@ -612,40 +673,69 @@ define <8 x half> @test_fmax_legacy_ugt_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; SI-NNAN-NEXT: v_max_f32_e32 v7, v7, v15
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmax_legacy_ugt_v8f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7
-; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmax_legacy_ugt_v8f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9.l, v8.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s0, v11.l, v10.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s1, v13.l, v12.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s3, v0.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s2, v15.l, v14.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s4, v1.l, v5.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s5, v2.l, v6.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nle_f16_e64 s6, v3.l, v7.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.l, v13.l, s1
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v14.l, v15.l, s2
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v11.l, s0
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v9.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmax_legacy_ugt_v8f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v11, v10
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v13, v12
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v15, v14
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v9, v8
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v2, v6
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v4
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v1, v5
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v7
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmax_legacy_ugt_v8f16:
; GFX11-NNAN: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 0723290bdf734..3e96ac03494a4 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -8,8 +8,10 @@
; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s
-; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE-FAKE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-TRUE16 %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN,GFX11-NNAN-FAKE16 %s
define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
@@ -59,18 +61,31 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmin_legacy_ule_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16:
-; GFX11-NNAN: ; %bb.0:
-; GFX11-NNAN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NNAN-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NNAN-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-TRUE16-LABEL: test_fmin_legacy_ule_f16:
+; GFX11-NNAN-TRUE16: ; %bb.0:
+; GFX11-NNAN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NNAN-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-FAKE16-LABEL: test_fmin_legacy_ule_f16:
+; GFX11-NNAN-FAKE16: ; %bb.0:
+; GFX11-NNAN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-NNAN-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = fcmp ule half %a, %b
%val = select i1 %cmp, half %a, half %b
ret half %val
@@ -147,18 +162,30 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v1, v1, v3
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v2f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, s0
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v3.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v2f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16:
; GFX11-NNAN: ; %bb.0:
@@ -257,20 +284,35 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v2, v2, v5
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v3f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v1.l, v3.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v5.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s1
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v3f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
; GFX11-NNAN: ; %bb.0:
@@ -393,26 +435,45 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v3, v3, v7
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
-; GFX11-SAFE-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v4f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1.l, v3.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v0.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v5.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v7.l, v6.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v5.l, s1
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v6.l, v7.l, s2
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v4f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
; GFX11-NNAN: ; %bb.0:
@@ -613,40 +674,69 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
; SI-NNAN-NEXT: v_min_f32_e32 v7, v7, v15
; SI-NNAN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v10, 16, v7
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v11, 16, v3
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v12, 16, v6
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v13, 16, v2
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v14, 16, v5
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v15, 16, v1
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v8, 16, v4
-; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v9, 16, v0
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7
-; GFX11-SAFE-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
-; GFX11-SAFE-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: test_fmin_legacy_ule_v8f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v5
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v3
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9.l, v8.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s0, v11.l, v10.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, v13.l, v12.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s3, v0.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s2, v15.l, v14.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s4, v1.l, v5.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s5, v2.l, v6.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s6, v3.l, v7.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v12.l, v13.l, s1
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v14.l, v15.l, s2
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v11.l, s0
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v9.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, s3
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s5
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: test_fmin_legacy_ule_v8f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v7
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v11, v10
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v13, v12
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v13, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v15, v14
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v9, v8
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v2, v6
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v4
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v2, v11, v2, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v1, v5
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v7
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v1, v12, v1, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
; GFX11-NNAN: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index a753e38b04abf..0a8562341b060 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -10,10 +10,14 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-STRICT %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DENORM,GFX10-DENORM-CONTRACT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-STRICT %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM,GFX11-DENORM-CONTRACT %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FLUSH-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-STRICT-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-STRICT-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-CONTRACT-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DENORM-CONTRACT-FAKE16 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
@@ -90,36 +94,95 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX10-DENORM-NEXT: global_store_short v0, v3, s[8:9]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmuladd_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmuladd_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x2
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v3
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_clause 0x2
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_clause 0x2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_clause 0x2
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_clause 0x2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
%r0 = load half, ptr addrspace(1) %in1
%r1 = load half, ptr addrspace(1) %in2
@@ -216,53 +279,101 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[8:9]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmul_fadd_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: s_clause 0x2
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7]
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7]
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmul_fadd_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmul_fadd_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x2
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v3
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmul_fadd_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_clause 0x2
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmul_fadd_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_clause 0x2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v1, v3
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmul_fadd_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_clause 0x2
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmul_fadd_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_clause 0x2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
%r0 = load half, ptr addrspace(1) %in1
%r1 = load half, ptr addrspace(1) %in2
@@ -344,36 +455,95 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
; GFX10-DENORM-NEXT: global_store_short v0, v3, s[8:9]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmul_fadd_contract_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x2
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmul_fadd_contract_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x2
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v3
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmul_fadd_contract_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_clause 0x2
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmul_fadd_contract_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_clause 0x2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmul_fadd_contract_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_clause 0x2
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[6:7]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmul_fadd_contract_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_clause 0x2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2, ptr addrspace(1) %in3) #0 {
%r0 = load half, ptr addrspace(1) %in1
%r1 = load half, ptr addrspace(1) %in2
@@ -448,37 +618,99 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_2.0_a_b_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -556,37 +788,99 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_a_2.0_b_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -678,54 +972,103 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fadd_a_a_b_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fadd_a_a_b_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fadd_a_a_b_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fadd_a_a_b_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fadd_a_a_b_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fadd_a_a_b_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -820,54 +1163,103 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fadd_b_a_a_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fadd_b_a_a_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v2, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fadd_b_a_a_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fadd_b_a_a_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v2, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fadd_b_a_a_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fadd_b_a_a_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in1,
ptr addrspace(1) %in2) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -948,37 +1340,99 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v2, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_neg_2.0_a_b_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -1056,37 +1510,99 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v2, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, 2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -1166,37 +1682,99 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v2, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_2.0_neg_a_b_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -1276,37 +1854,99 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
-; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_fma_f16 v1, v1, 2.0, -v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fmuladd_2.0_a_neg_b_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fma_f16 v1, v1, 2.0, -v2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -1420,60 +2060,115 @@ define amdgpu_kernel void @mad_sub_f16(ptr addrspace(1) noalias nocapture %out,
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: mad_sub_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: mad_sub_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: mad_sub_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -v3
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: mad_sub_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: mad_sub_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: mad_sub_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: mad_sub_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, -v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fma_f16 v1, v1, v2, -v3
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -1589,60 +2284,115 @@ define amdgpu_kernel void @mad_sub_inv_f16(ptr addrspace(1) noalias nocapture %o
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: mad_sub_inv_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v3, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: mad_sub_inv_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v3, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: mad_sub_inv_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, v3
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: mad_sub_inv_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: mad_sub_inv_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v3, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: mad_sub_inv_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_inv_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_sub_f16_e32 v1, v3, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: mad_sub_inv_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v0.h, v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_inv_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fma_f16 v1, -v1, v2, v3
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -1758,60 +2508,115 @@ define amdgpu_kernel void @mad_sub_fabs_f16(ptr addrspace(1) noalias nocapture %
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: mad_sub_fabs_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, v1, |v3|
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, v1, |v3|
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, v2, -|v3|
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: mad_sub_fabs_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v1.l|
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: mad_sub_fabs_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e64 v1, v1, |v3|
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: mad_sub_fabs_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v1.l|
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_fabs_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_sub_f16_e64 v1, v1, |v3|
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: mad_sub_fabs_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, -|v1.l|
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_fabs_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fma_f16 v1, v1, v2, -|v3|
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -1928,60 +2733,115 @@ define amdgpu_kernel void @mad_sub_fabs_inv_f16(ptr addrspace(1) noalias nocaptu
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: mad_sub_fabs_inv_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e64 v1, |v3|, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: mad_sub_fabs_inv_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e64 v1, |v3|, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: mad_sub_fabs_inv_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, -v1, v2, |v3|
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e64 v0.l, |v1.l|, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e64 v1, |v3|, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e64 v0.l, |v1.l|, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_sub_f16_e64 v1, |v3|, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v0.h, |v1.l|
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_sub_fabs_inv_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fma_f16 v1, -v1, v2, |v3|
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -2098,60 +2958,115 @@ define amdgpu_kernel void @neg_neg_mad_f16(ptr addrspace(1) noalias nocapture %o
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: neg_neg_mad_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v3, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: neg_neg_mad_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v3, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: neg_neg_mad_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: neg_neg_mad_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: neg_neg_mad_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v3, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: neg_neg_mad_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: neg_neg_mad_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v3, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: neg_neg_mad_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: neg_neg_mad_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v3, v1, v2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -2269,60 +3184,115 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: mad_fabs_sub_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_mul_f16_e64 v1, v1, |v2|
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: mad_fabs_sub_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e64 v1, v1, |v2|
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: mad_fabs_sub_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, |v2|, -v3
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: mad_fabs_sub_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: mad_fabs_sub_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e64 v1, v1, |v2|
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: mad_fabs_sub_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: mad_fabs_sub_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_mul_f16_e64 v1, v1, |v2|
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: mad_fabs_sub_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, |v0.h|, -v1.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: mad_fabs_sub_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v3, v0, s[2:3] offset:4 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fma_f16 v1, v1, |v2|, -v3
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
%gep0 = getelementptr half, ptr addrspace(1) %ptr, i64 %tid.ext
@@ -2419,54 +3389,103 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v2, v1
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_sub_f16_e32 v1, v2, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fmac_f16_e32 v0.h, -2.0, v0.l
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_d16_hi_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fsub_c_fadd_a_a_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fmac_f16_e32 v2, -2.0, v1
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
@@ -2560,54 +3579,103 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
-; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
-; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-FLUSH-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-FLUSH-NEXT: s_endpgm
-;
-; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
-; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-STRICT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
-; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-STRICT-NEXT: s_endpgm
-;
-; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
-; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11-DENORM-CONTRACT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
-; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
+; GFX11-FLUSH-TRUE16-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-FLUSH-TRUE16: ; %bb.0:
+; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-FLUSH-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-FLUSH-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FLUSH-FAKE16-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-FLUSH-FAKE16: ; %bb.0:
+; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-FLUSH-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FLUSH-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v2
+; GFX11-FLUSH-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-TRUE16-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-DENORM-STRICT-TRUE16: ; %bb.0:
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.l
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h
+; GFX11-DENORM-STRICT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-STRICT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-STRICT-FAKE16-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-DENORM-STRICT-FAKE16: ; %bb.0:
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_add_f16_e32 v1, v1, v1
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-STRICT-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v2
+; GFX11-DENORM-STRICT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-FAKE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-TRUE16-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-DENORM-CONTRACT-TRUE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_load_d16_hi_b16 v0, v1, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, 2.0, -v0.h
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DENORM-CONTRACT-TRUE16-NEXT: s_endpgm
+;
+; GFX11-DENORM-CONTRACT-FAKE16-LABEL: fsub_fadd_a_a_c_f16:
+; GFX11-DENORM-CONTRACT-FAKE16: ; %bb.0:
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: v_fma_f16 v1, v1, 2.0, -v2
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, ptr addrspace(1) %out, i32 %tid
%gep.1 = getelementptr half, ptr addrspace(1) %gep.0, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 78afde138944e..5ea39997938ad 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -5,8 +5,10 @@
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-SAFE %s
; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope --check-prefixes=VI,VI-NSZ %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE %s
-; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-FAKE16 %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ,GFX11-NSZ-TRUE16 %s
+; RUN: llc -enable-no-signed-zeros-fp-math -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-NSZ,GFX11-NSZ-FAKE16 %s
; --------------------------------------------------------------------------------
; fadd tests
@@ -47,19 +49,33 @@ define half @v_fneg_add_f16(half %a, half %b) #0 {
; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v0, v1
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_add_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_add_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v0, v1
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%add = fadd half %a, %b
%fneg = fneg half %add
ret half %fneg
@@ -84,13 +100,37 @@ define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_add_store_use_add_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v1, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_store_use_add_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v1.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_store_use_add_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v1, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_store_use_add_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v1.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_store_use_add_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v1, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%add = fadd half %a, %b
%fneg = fneg half %add
%insert.0 = insertvalue { half, half } poison, half %fneg, 0
@@ -137,22 +177,39 @@ define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 {
; VI-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_add_f16_e32 v1, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v1
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v0, v1
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_multi_use_add_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.h, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, 4.0, v0.h
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_multi_use_add_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v1, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, 4.0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_multi_use_add_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_multi_use_add_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%add = fadd half %a, %b
%fneg = fneg half %add
%use1 = fmul half %add, 4.0
@@ -197,19 +254,33 @@ define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 {
; VI-NSZ-NEXT: v_sub_f16_e32 v0, v0, v1
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_add_fneg_x_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_sub_f16_e32 v0, v1, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_add_fneg_x_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_fneg_x_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_fneg_x_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v1, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_fneg_x_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_fneg_x_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%add = fadd half %fneg.a, %b
%fneg = fneg half %add
@@ -251,19 +322,33 @@ define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 {
; VI-NSZ-NEXT: v_sub_f16_e32 v0, v1, v0
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_add_x_fneg_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_add_x_fneg_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_sub_f16_e32 v0, v1, v0
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_x_fneg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_x_fneg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_x_fneg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_x_fneg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v1, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.b = fneg half %b
%add = fadd half %a, %fneg.b
%fneg = fneg half %add
@@ -305,19 +390,33 @@ define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 {
; VI-NSZ-NEXT: v_add_f16_e32 v0, v0, v1
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_sub_f16_e64 v0, -v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_fneg_fneg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_fneg_fneg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_fneg_fneg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_fneg_fneg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fneg.b = fneg half %b
%add = fadd half %fneg.a, %fneg.b
@@ -366,24 +465,43 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 {
; VI-NSZ-NEXT: v_mov_b32_e32 v1, v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_sub_f16_e32 v1, v1, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
-; GFX11-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_store_use_fneg_x_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_store_use_fneg_x_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_store_use_fneg_x_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_store_use_fneg_x_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v2, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%add = fadd half %fneg.a, %b
%fneg = fneg half %add
@@ -439,24 +557,43 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
; VI-NSZ-NEXT: v_mov_b32_e32 v0, v3
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_sub_f16_e32 v1, v1, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v3, 0x8000, v1
-; GFX11-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v2
-; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_sub_f16_e32 v3, v0, v1
-; GFX11-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v2
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_add_multi_use_fneg_x_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_add_multi_use_fneg_x_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_add_multi_use_fneg_x_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.h, v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_add_multi_use_fneg_x_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v3, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%add = fadd half %fneg.a, %b
%fneg = fneg half %add
@@ -551,33 +688,63 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT: ; return to shader part epilog
;
-; GFX11-SAFE-LABEL: fneg_fadd_0_f16:
-; GFX11-SAFE: ; %bb.0: ; %.entry
-; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-SAFE-NEXT: ; return to shader part epilog
-;
-; GFX11-NSZ-LABEL: fneg_fadd_0_f16:
-; GFX11-NSZ: ; %bb.0: ; %.entry
-; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-NSZ-NEXT: ; return to shader part epilog
+; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry
+; GFX11-SAFE-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-SAFE-FAKE16-LABEL: fneg_fadd_0_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0: ; %.entry
+; GFX11-SAFE-FAKE16-NEXT: v_rcp_f16_e32 v0, s1
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry
+; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-NSZ-FAKE16-LABEL: fneg_fadd_0_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0: ; %.entry
+; GFX11-NSZ-FAKE16-NEXT: v_rcp_f16_e32 v0, s1
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: ; return to shader part epilog
.entry:
%tmp7 = fdiv half 1.000000e+00, %tmp6
%tmp8 = fmul half 0.000000e+00, %tmp7
@@ -644,29 +811,52 @@ define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <
; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
; VI-NSZ-NEXT: ; return to shader part epilog
;
-; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16:
-; GFX11-SAFE: ; %bb.0: ; %.entry
-; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-SAFE-NEXT: ; return to shader part epilog
-;
-; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16:
-; GFX11-NSZ: ; %bb.0: ; %.entry
-; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-NSZ-NEXT: ; return to shader part epilog
+; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_nsz_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, s0, 0
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-SAFE-FAKE16-LABEL: fneg_fadd_0_nsz_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0: ; %.entry
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: ; return to shader part epilog
+;
+; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_nsz_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry
+; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11-NSZ-FAKE16-LABEL: fneg_fadd_0_nsz_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0: ; %.entry
+; GFX11-NSZ-FAKE16-NEXT: v_rcp_f16_e32 v0, s1
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: ; return to shader part epilog
.entry:
%tmp7 = fdiv afn half 1.000000e+00, %tmp6
%tmp8 = fmul half 0.000000e+00, %tmp7
@@ -701,11 +891,29 @@ define half @v_fneg_mul_f16(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e64 v0, v0, -v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mul = fmul half %a, %b
%fneg = fneg half %mul
ret half %fneg
@@ -730,13 +938,37 @@ define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_store_use_mul_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_store_use_mul_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_store_use_mul_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_store_use_mul_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_store_use_mul_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mul = fmul half %a, %b
%fneg = fneg half %mul
%insert.0 = insertvalue { half, half } poison, half %fneg, 0
@@ -763,13 +995,37 @@ define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_multi_use_mul_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_multi_use_mul_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_multi_use_mul_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_multi_use_mul_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_multi_use_mul_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mul = fmul half %a, %b
%fneg = fneg half %mul
%use1 = fmul half %mul, 4.0
@@ -795,11 +1051,29 @@ define half @v_fneg_mul_fneg_x_f16(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_fneg_x_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_fneg_x_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_fneg_x_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_fneg_x_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_fneg_x_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%mul = fmul half %fneg.a, %b
%fneg = fneg half %mul
@@ -823,11 +1097,29 @@ define half @v_fneg_mul_x_fneg_f16(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_x_fneg_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_x_fneg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_x_fneg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_x_fneg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_x_fneg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.b = fneg half %b
%mul = fmul half %a, %fneg.b
%fneg = fneg half %mul
@@ -851,11 +1143,29 @@ define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e64 v0, v0, -v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_fneg_fneg_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_fneg_fneg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_fneg_fneg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_fneg_fneg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_fneg_fneg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fneg.b = fneg half %b
%mul = fmul half %fneg.a, %fneg.b
@@ -883,14 +1193,41 @@ define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 {
; VI-NEXT: v_mov_b32_e32 v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_store_use_fneg_x_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v2, v0, v1
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_store_use_fneg_x_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.h, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_store_use_fneg_x_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_store_use_fneg_x_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.h, v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_store_use_fneg_x_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v2, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%mul = fmul half %fneg.a, %b
%fneg = fneg half %mul
@@ -922,14 +1259,41 @@ define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c
; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v3, v0, v1
-; GFX11-NEXT: v_mul_f16_e64 v1, -v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.h, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v3, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.h, v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v3, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%mul = fmul half %fneg.a, %b
%fneg = fneg half %mul
@@ -962,14 +1326,41 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_minnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_minnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_minnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_minnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_minnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%fneg = fneg half %min
ret half %fneg
@@ -992,11 +1383,29 @@ define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 {
; VI-NEXT: v_max_f16_e64 v0, -v0, -v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_minnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_minnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_minnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_minnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_minnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%fneg = fneg half %min
ret half %fneg
@@ -1015,11 +1424,29 @@ define half @v_fneg_self_minnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_self_minnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_self_minnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_self_minnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_self_minnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_self_minnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %a)
%min.fneg = fneg half %min
ret half %min.fneg
@@ -1038,11 +1465,29 @@ define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_self_minnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_self_minnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_self_minnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_self_minnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_self_minnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %a)
%min.fneg = fneg half %min
ret half %min.fneg
@@ -1064,13 +1509,37 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_max_f16_e32 v0, -4.0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_posk_minnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, -4.0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_posk_minnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, -4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_posk_minnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, -4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_posk_minnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_posk_minnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half 4.0, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1091,11 +1560,29 @@ define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_max_f16_e64 v0, -v0, -4.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_posk_minnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -4.0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_posk_minnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -4.0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_posk_minnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -4.0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_posk_minnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -4.0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_posk_minnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -4.0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half 4.0, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1117,13 +1604,37 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_negk_minnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_negk_minnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, 4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_negk_minnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, 4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_negk_minnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, 4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_negk_minnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, 4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half -4.0, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1144,11 +1655,29 @@ define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_max_f16_e64 v0, -v0, 4.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_negk_minnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, 4.0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_negk_minnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, 4.0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_negk_minnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, 4.0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_negk_minnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, 4.0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_negk_minnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, 4.0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half -4.0, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1171,13 +1700,37 @@ define half @v_fneg_0_minnum_f16(half %a) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_0_minnum_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_0_minnum_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_0_minnum_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_0_minnum_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_0_minnum_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call nnan half @llvm.minnum.f16(half 0.0, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1199,13 +1752,37 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_max_f16_e32 v0, 0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_neg0_minnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_neg0_minnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_neg0_minnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_neg0_minnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_neg0_minnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half -0.0, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1228,14 +1805,41 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_inv2pi_minnum_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_inv2pi_minnum_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_inv2pi_minnum_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 0.15915494, v0
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_inv2pi_minnum_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_inv2pi_minnum_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 0.15915494, v0
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half 0xH3118, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1258,14 +1862,41 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_neg_inv2pi_minnum_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_neg_inv2pi_minnum_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_neg_inv2pi_minnum_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 0.15915494, v0
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_neg_inv2pi_minnum_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_neg_inv2pi_minnum_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 0.15915494, v0
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half 0xH3118, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1286,11 +1917,29 @@ define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_max_f16_e64 v0, -v0, 0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, 0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, 0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, 0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, 0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, 0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half -0.0, half %a)
%fneg = fneg half %min
ret half %fneg
@@ -1316,14 +1965,41 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half 0.0, half %a)
%fneg = fneg half %min
%mul = fmul half %fneg, %b
@@ -1350,14 +2026,41 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, 0.15915494, v0
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 0.15915494, v0
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 0.15915494, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 0.15915494, v0
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half 0xH3118, half %a)
%fneg = fneg half %min
%mul = fmul half %fneg, %b
@@ -1383,13 +2086,37 @@ define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half 0.0, half %a)
%fneg = fneg half %min
%mul = fmul half %fneg, %b
@@ -1417,15 +2144,45 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%fneg = fneg half %min
%use1 = fmul half %min, 4.0
@@ -1455,14 +2212,41 @@ define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b)
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f16_e32 v1, 4.0, v0
-; GFX11-NEXT: v_pack_b32_f16 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.h, 4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, -v0.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, 4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.h, 4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, -v0.l, v0.h
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, 4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%min = call half @llvm.minnum.f16(half %a, half %b)
%fneg = fneg half %min
%use1 = fmul half %min, 4.0
@@ -1494,14 +2278,41 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_maxnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_maxnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_maxnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_maxnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_maxnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%fneg = fneg half %max
ret half %fneg
@@ -1524,11 +2335,29 @@ define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 {
; VI-NEXT: v_min_f16_e64 v0, -v0, -v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_maxnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e64 v0, -v0, -v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_maxnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_maxnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e64 v0, -v0, -v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_maxnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_maxnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e64 v0, -v0, -v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%fneg = fneg half %max
ret half %fneg
@@ -1547,11 +2376,29 @@ define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_self_maxnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_self_maxnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_self_maxnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_self_maxnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_self_maxnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %a)
%max.fneg = fneg half %max
ret half %max.fneg
@@ -1570,11 +2417,29 @@ define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_self_maxnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_self_maxnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_self_maxnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_self_maxnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_self_maxnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %a)
%max.fneg = fneg half %max
ret half %max.fneg
@@ -1596,13 +2461,37 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_min_f16_e32 v0, -4.0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_posk_maxnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, -4.0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_posk_maxnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, -4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_posk_maxnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, -4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_posk_maxnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_posk_maxnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half 4.0, half %a)
%fneg = fneg half %max
ret half %fneg
@@ -1623,11 +2512,29 @@ define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_min_f16_e64 v0, -v0, -4.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e64 v0, -v0, -4.0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, -4.0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e64 v0, -v0, -4.0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, -4.0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e64 v0, -v0, -4.0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half 4.0, half %a)
%fneg = fneg half %max
ret half %fneg
@@ -1649,13 +2556,37 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_negk_maxnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_negk_maxnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_negk_maxnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_negk_maxnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_negk_maxnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half -4.0, half %a)
%fneg = fneg half %max
ret half %fneg
@@ -1676,11 +2607,29 @@ define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_min_f16_e64 v0, -v0, 4.0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e64 v0, -v0, 4.0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, 4.0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e64 v0, -v0, 4.0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, 4.0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e64 v0, -v0, 4.0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half -4.0, half %a)
%fneg = fneg half %max
ret half %fneg
@@ -1703,13 +2652,37 @@ define half @v_fneg_0_maxnum_f16(half %a) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_0_maxnum_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_0_maxnum_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_0_maxnum_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_0_maxnum_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_0_maxnum_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call nnan half @llvm.maxnum.f16(half 0.0, half %a)
%fneg = fneg half %max
ret half %fneg
@@ -1731,13 +2704,37 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
; VI-NEXT: v_min_f16_e32 v0, 0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_neg0_maxnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, 0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_neg0_maxnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_neg0_maxnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_neg0_maxnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_neg0_maxnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half -0.0, half %a)
%fneg = fneg half %max
ret half %fneg
@@ -1758,11 +2755,29 @@ define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 {
; VI-NEXT: v_min_f16_e64 v0, -v0, 0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_min_f16_e64 v0, -v0, 0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, 0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e64 v0, -v0, 0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e64 v0.l, -v0.l, 0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e64 v0, -v0, 0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half -0.0, half %a)
%fneg = fneg half %max
ret half %fneg
@@ -1788,14 +2803,41 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half 0.0, half %a)
%fneg = fneg half %max
%mul = fmul half %fneg, %b
@@ -1821,13 +2863,37 @@ define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, 0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, 0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half 0.0, half %a)
%fneg = fneg half %max
%mul = fmul half %fneg, %b
@@ -1855,15 +2921,45 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
; VI-NEXT: v_mul_f16_e32 v1, -4.0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.h, -v1.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%fneg = fneg half %max
%use1 = fmul half %max, 4.0
@@ -1893,14 +2989,41 @@ define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b)
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f16_e32 v1, 4.0, v0
-; GFX11-NEXT: v_pack_b32_f16 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.h, 4.0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, -v0.l, v0.h
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, 4.0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.h, 4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, -v0.l, v0.h
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, 4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%max = call half @llvm.maxnum.f16(half %a, half %b)
%fneg = fneg half %max
%use1 = fmul half %max, 4.0
@@ -1952,19 +3075,33 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 {
; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fmac_f16_e32 v2, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %a, half %b, half %c)
%fneg = fneg half %fma
ret half %fneg
@@ -1991,13 +3128,37 @@ define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) #
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_fma_store_use_fma_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v1, v0, v1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_store_use_fma_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v1.l, v0.l, v1.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_store_use_fma_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v1, v0, v1, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_store_use_fma_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v1.l, v0.l, v1.l, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_store_use_fma_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v1, v0, v1, v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %a, half %b, half %c)
%fneg = fneg half %fma
%insert.0 = insertvalue { half, half } poison, half %fneg, 0
@@ -2048,22 +3209,39 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #
; VI-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
-; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_multi_use_fma_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, 4.0, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_multi_use_fma_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fmac_f16_e32 v2, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, 4.0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_multi_use_fma_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_multi_use_fma_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fma.f16(half %a, half %b, half %c)
%fneg = fneg half %fma
%use1 = fmul half %fma, 4.0
@@ -2111,19 +3289,33 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 {
; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fma_f16 v0, -v0, v1, v2
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_fneg_x_y_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_fneg_x_y_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_fneg_x_y_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_fneg_x_y_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
%fneg = fneg half %fma
@@ -2169,19 +3361,33 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 {
; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, -v1, v2
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, -v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_x_fneg_y_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_x_fneg_y_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_x_fneg_y_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_x_fneg_y_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.b = fneg half %b
%fma = call half @llvm.fma.f16(half %a, half %fneg.b, half %c)
%fneg = fneg half %fma
@@ -2227,19 +3433,33 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 {
; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_fneg_fneg_y_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_fneg_fneg_y_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fmac_f16_e32 v2, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_fneg_fneg_y_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_fneg_fneg_y_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fneg.b = fneg half %b
%fma = call half @llvm.fma.f16(half %fneg.a, half %fneg.b, half %c)
@@ -2286,19 +3506,33 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 {
; VI-NSZ-NEXT: v_fma_f16 v0, v0, v1, v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fma_f16 v0, -v0, v1, -v2
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, v1, v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_fneg_x_fneg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, -v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_fneg_x_fneg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, -v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_fneg_x_fneg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_fneg_x_fneg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fneg.c = fneg half %c
%fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %fneg.c)
@@ -2345,19 +3579,33 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 {
; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fma_f16 v0, v0, v1, -v2
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_x_y_fneg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_x_y_fneg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_x_y_fneg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_x_y_fneg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.c = fneg half %c
%fma = call half @llvm.fma.f16(half %a, half %b, half %fneg.c)
%fneg = fneg half %fma
@@ -2409,24 +3657,43 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half
; VI-NSZ-NEXT: v_mov_b32_e32 v1, v3
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2
-; GFX11-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v1, -v0, v1, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.h, v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.h
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v2, v0, v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
%fneg = fneg half %fma
@@ -2485,24 +3752,43 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half
; VI-NSZ-NEXT: v_mov_b32_e32 v0, v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fma_f16 v1, -v0, v1, v2
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v1
-; GFX11-SAFE-NEXT: v_mul_f16_e64 v1, -v0, v3
-; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v2, v0, v1, -v2
-; GFX11-NSZ-NEXT: v_mul_f16_e64 v1, -v0, v3
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NSZ-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v3.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v1, -v0, v1, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v3
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.h, v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v3.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v2, v0, v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v3
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fma = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
%fneg = fneg half %fma
@@ -2555,19 +3841,33 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 {
; VI-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fmad_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fmad_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fmad_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fmad_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fmac_f16_e32 v2, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fmad_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fmad_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
%fneg = fneg half %fma
ret half %fneg
@@ -2748,22 +4048,39 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c)
; VI-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v2
-; GFX11-SAFE-NEXT: v_mul_f16_e32 v1, 4.0, v2
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v0, v0, -v1, -v2
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, -4.0, v0
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fmad_multi_use_fmad_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, 4.0, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fmad_multi_use_fmad_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fmac_f16_e32 v2, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v2
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, 4.0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fmad_multi_use_fmad_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, -v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, -4.0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fmad_multi_use_fmad_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, -v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fma = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
%fneg = fneg half %fma
%use1 = fmul half %fma, 4.0
@@ -2791,14 +4108,41 @@ define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 {
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_fp_extend_f16_to_f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fp_extend_f16_to_f64:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fp_extend_f16_to_f64:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fp_extend_f16_to_f64:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fp_extend_f16_to_f64:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fpext = fpext half %a to double
%fneg = fneg double %fpext
ret double %fneg
@@ -2818,13 +4162,37 @@ define double @v_fneg_fp_extend_fneg_f16_to_f64(half %a) #0 {
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fpext = fpext half %fneg.a to double
%fneg = fneg double %fpext
@@ -2849,15 +4217,45 @@ define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v2.l, 0x8000, v2.l
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v2
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fpext = fpext half %fneg.a to double
%fneg = fneg double %fpext
@@ -2884,16 +4282,49 @@ define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) #
; VI-NEXT: v_mov_b32_e32 v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fpext = fpext half %a to double
%fneg = fneg double %fpext
%insert.0 = insertvalue { double, double } poison, double %fneg, 0
@@ -2920,15 +4351,45 @@ define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(h
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fpext = fpext half %a to double
%fneg = fneg double %fpext
%mul = fmul double %fpext, 4.0
@@ -2952,13 +4413,37 @@ define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0
; VI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fpext = fpext half %a to float
%fneg = fneg float %fpext
%insert.0 = insertvalue { float, float } poison, float %fneg, 0
@@ -3378,56 +4863,211 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
; VI-NEXT: v_mov_b32_e32 v1, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_or_b32 v2, 0x1ff, v1, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1
-; GFX11-NEXT: v_bfe_u32 v4, v1, 20, 11
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v4
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v2, 0xffe, v3, v2
-; GFX11-NEXT: v_med3_i32 v3, v5, 0, 13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v5, 0x1000, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v3, v6
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v3, v6, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4
-; GFX11-NEXT: v_lshl_or_b32 v5, v4, 12, v2
-; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v5, 7, v3
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
-; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
-; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v1, v0
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-SAFE-TRUE16-NEXT: v_bfe_u32 v4, v1, 20, 11
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SAFE-TRUE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v3, v2
+; GFX11-SAFE-TRUE16-NEXT: v_med3_i32 v3, v5, 0, 13
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v5, 0x1000, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v3, v5
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX11-SAFE-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4
+; GFX11-SAFE-TRUE16-NEXT: v_lshl_or_b32 v5, v4, 12, v2
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v5, 7, v3
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v1, v0
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-SAFE-FAKE16-NEXT: v_bfe_u32 v4, v1, 20, 11
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v3, v2
+; GFX11-SAFE-FAKE16-NEXT: v_med3_i32 v3, v5, 0, 13
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v5, 0x1000, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v3, v5
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX11-SAFE-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4
+; GFX11-SAFE-FAKE16-NEXT: v_lshl_or_b32 v5, v4, 12, v2
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v5, 7, v3
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX11-SAFE-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v1, v0
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-NSZ-TRUE16-NEXT: v_bfe_u32 v4, v1, 20, 11
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NSZ-TRUE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v4
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NSZ-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v3, v2
+; GFX11-NSZ-TRUE16-NEXT: v_med3_i32 v3, v5, 0, 13
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v5, 0x1000, v2
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v3, v5
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX11-NSZ-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4
+; GFX11-NSZ-TRUE16-NEXT: v_lshl_or_b32 v5, v4, 12, v2
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v5, 7, v3
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NSZ-TRUE16-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_fp_round_store_use_fneg_f64_to_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v1, v0
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
+; GFX11-NSZ-FAKE16-NEXT: v_bfe_u32 v4, v1, 20, 11
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NSZ-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v4
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NSZ-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v3, v2
+; GFX11-NSZ-FAKE16-NEXT: v_med3_i32 v3, v5, 0, 13
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v5, 0x1000, v2
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, v3, v5
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v3, v6
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v5
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v3, v6, v3
+; GFX11-NSZ-FAKE16-NEXT: v_add_nc_u32_e32 v4, 0xfffffc10, v4
+; GFX11-NSZ-FAKE16-NEXT: v_lshl_or_b32 v5, v4, 12, v2
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v4
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v5, 7, v3
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v5
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v5
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v5, v5, v6
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_dual_mov_b32 v6, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v5
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v6, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v4
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_and_or_b32 v3, 0x8000, v5, v2
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; GFX11-NSZ-FAKE16-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg double %a
%fpround = fptrunc double %fneg.a to half
%fneg = fneg half %fpround
@@ -3696,55 +5336,205 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v1
-; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
-; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
-; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0
-; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX11-NEXT: v_and_b32_e32 v4, 7, v2
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
-; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
-; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
-; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
-; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX11-SAFE-TRUE16-NEXT: v_bfe_u32 v3, v1, 20, 11
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SAFE-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
+; GFX11-SAFE-TRUE16-NEXT: v_med3_i32 v2, v4, 0, 13
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v4, 0x1000, v0
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v2, v4
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX11-SAFE-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-SAFE-TRUE16-NEXT: v_lshl_or_b32 v4, v3, 12, v0
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v4, 7, v2
+; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX11-SAFE-FAKE16-NEXT: v_bfe_u32 v3, v1, 20, 11
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SAFE-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
+; GFX11-SAFE-FAKE16-NEXT: v_med3_i32 v2, v4, 0, 13
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v4, 0x1000, v0
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v2, v4
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX11-SAFE-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-SAFE-FAKE16-NEXT: v_lshl_or_b32 v4, v3, 12, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v4, 7, v2
+; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX11-NSZ-TRUE16-NEXT: v_bfe_u32 v3, v1, 20, 11
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NSZ-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
+; GFX11-NSZ-TRUE16-NEXT: v_med3_i32 v2, v4, 0, 13
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v4, 0x1000, v0
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v2, v4
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX11-NSZ-TRUE16-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-NSZ-TRUE16-NEXT: v_lshl_or_b32 v4, v3, 12, v0
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v4, 7, v2
+; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; GFX11-NSZ-FAKE16-NEXT: v_bfe_u32 v3, v1, 20, 11
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NSZ-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v2, v0
+; GFX11-NSZ-FAKE16-NEXT: v_med3_i32 v2, v4, 0, 13
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v4, 0x1000, v0
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v2, v4
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX11-NSZ-FAKE16-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-NSZ-FAKE16-NEXT: v_lshl_or_b32 v4, v3, 12, v0
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v4, 7, v2
+; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v1, v0
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fpround = fptrunc double %a to half
%fneg = fneg half %fpround
%insert.0 = insertvalue { half, half } poison, half %fneg, 0
@@ -3771,11 +5561,29 @@ define half @v_fneg_trunc_f16(half %a) #0 {
; VI-NEXT: v_trunc_f16_e64 v0, -v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_trunc_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_trunc_f16_e64 v0, -v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_trunc_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_trunc_f16_e64 v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_trunc_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_trunc_f16_e64 v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_trunc_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_trunc_f16_e64 v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_trunc_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_trunc_f16_e64 v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%trunc = call half @llvm.trunc.f16(half %a)
%fneg = fneg half %trunc
ret half %fneg
@@ -3842,34 +5650,69 @@ define half @v_fneg_round_f16(half %a) #0 {
; VI-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: v_fneg_round_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_trunc_f16_e32 v1, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_sub_f16_e32 v2, v0, v1
-; GFX11-SAFE-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-SAFE-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: v_fneg_round_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_trunc_f16_e32 v1, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_sub_f16_e32 v2, v0, v1
-; GFX11-NSZ-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
-; GFX11-NSZ-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_sub_f16_e64 v0, -v1, v0
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_round_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_trunc_f16_e32 v1.h, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v1.l|, 0.5
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v1.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_round_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_trunc_f16_e32 v1, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v2, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-SAFE-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_round_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_trunc_f16_e32 v1.h, v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_ge_f16_e64 s0, |v1.l|, 0.5
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x3c00, s0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e64 v0.l, -v1.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_round_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_trunc_f16_e32 v1, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v2, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
+; GFX11-NSZ-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v2, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e64 v0, -v1, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%round = call half @llvm.round.f16(half %a)
%fneg = fneg half %round
ret half %fneg
@@ -3894,11 +5737,29 @@ define half @v_fneg_rint_f16(half %a) #0 {
; VI-NEXT: v_rndne_f16_e64 v0, -v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_rint_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f16_e64 v0, -v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_rint_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_rndne_f16_e64 v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_rint_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_rndne_f16_e64 v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_rint_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_rndne_f16_e64 v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_rint_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_rndne_f16_e64 v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%rint = call half @llvm.rint.f16(half %a)
%fneg = fneg half %rint
ret half %fneg
@@ -3923,11 +5784,29 @@ define half @v_fneg_nearbyint_f16(half %a) #0 {
; VI-NEXT: v_rndne_f16_e64 v0, -v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_nearbyint_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f16_e64 v0, -v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_nearbyint_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_rndne_f16_e64 v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_nearbyint_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_rndne_f16_e64 v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_nearbyint_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_rndne_f16_e64 v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_nearbyint_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_rndne_f16_e64 v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%nearbyint = call half @llvm.nearbyint.f16(half %a)
%fneg = fneg half %nearbyint
ret half %fneg
@@ -3956,13 +5835,37 @@ define half @v_fneg_sin_f16(half %a) #0 {
; VI-NEXT: v_sin_f16_e32 v0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_sin_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, 0xb118, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sin_f16_e32 v0, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_sin_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0xb118, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_sin_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, 0xb118, v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_sin_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0xb118, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_sin_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, 0xb118, v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_sin_f16_e32 v0, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sin = call half @llvm.sin.f16(half %a)
%fneg = fneg half %sin
ret half %fneg
@@ -3986,11 +5889,29 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_canonicalize_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_canonicalize_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_canonicalize_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_canonicalize_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_canonicalize_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%trunc = call half @llvm.canonicalize.f16(half %a)
%fneg = fneg half %trunc
ret half %fneg
@@ -4053,27 +5974,93 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_copytoreg_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v6, 0x3ff, v31
-; GFX11-NEXT: v_mul_f16_e32 v2, v2, v3
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 1, v6
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v5
-; GFX11-NEXT: s_cbranch_execz .LBB81_2
-; GFX11-NEXT: ; %bb.1: ; %if
-; GFX11-NEXT: v_mul_f16_e64 v3, -v2, v4
-; GFX11-NEXT: global_store_b16 v[0:1], v3, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: .LBB81_2: ; %endif
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_copytoreg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v6, 0x3ff, v31
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v2.l, v2.l, v3.l
+; GFX11-SAFE-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 1, v6
+; GFX11-SAFE-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
+; GFX11-SAFE-TRUE16-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v5
+; GFX11-SAFE-TRUE16-NEXT: s_cbranch_execz .LBB81_2
+; GFX11-SAFE-TRUE16-NEXT: ; %bb.1: ; %if
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v2.h, -v2.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v2, off dlc
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-TRUE16-NEXT: .LBB81_2: ; %endif
+; GFX11-SAFE-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SAFE-TRUE16-NEXT: global_store_b16 v[0:1], v2, off dlc
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_copytoreg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v6, 0x3ff, v31
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX11-SAFE-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 1, v6
+; GFX11-SAFE-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
+; GFX11-SAFE-FAKE16-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v5
+; GFX11-SAFE-FAKE16-NEXT: s_cbranch_execz .LBB81_2
+; GFX11-SAFE-FAKE16-NEXT: ; %bb.1: ; %if
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v3, -v2, v4
+; GFX11-SAFE-FAKE16-NEXT: global_store_b16 v[0:1], v3, off dlc
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-FAKE16-NEXT: .LBB81_2: ; %endif
+; GFX11-SAFE-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SAFE-FAKE16-NEXT: global_store_b16 v[0:1], v2, off dlc
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_copytoreg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v6, 0x3ff, v31
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v2.l, v2.l, v3.l
+; GFX11-NSZ-TRUE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 1, v6
+; GFX11-NSZ-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
+; GFX11-NSZ-TRUE16-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v5
+; GFX11-NSZ-TRUE16-NEXT: s_cbranch_execz .LBB81_2
+; GFX11-NSZ-TRUE16-NEXT: ; %bb.1: ; %if
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v2.h, -v2.l, v4.l
+; GFX11-NSZ-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v2, off dlc
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-TRUE16-NEXT: .LBB81_2: ; %endif
+; GFX11-NSZ-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NSZ-TRUE16-NEXT: global_store_b16 v[0:1], v2, off dlc
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_copytoreg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v6, 0x3ff, v31
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v2, v2, v3
+; GFX11-NSZ-FAKE16-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 1, v6
+; GFX11-NSZ-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6
+; GFX11-NSZ-FAKE16-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v5
+; GFX11-NSZ-FAKE16-NEXT: s_cbranch_execz .LBB81_2
+; GFX11-NSZ-FAKE16-NEXT: ; %bb.1: ; %if
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v3, -v2, v4
+; GFX11-NSZ-FAKE16-NEXT: global_store_b16 v[0:1], v3, off dlc
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-FAKE16-NEXT: .LBB81_2: ; %endif
+; GFX11-NSZ-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NSZ-FAKE16-NEXT: global_store_b16 v[0:1], v2, off dlc
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
@@ -4121,14 +6108,41 @@ define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 {
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_inlineasm_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v1
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v0
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_inlineasm_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-SAFE-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-SAFE-TRUE16-NEXT: ; use v0
+; GFX11-SAFE-TRUE16-NEXT: ;;#ASMEND
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_inlineasm_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-SAFE-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-SAFE-FAKE16-NEXT: ; use v0
+; GFX11-SAFE-FAKE16-NEXT: ;;#ASMEND
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_inlineasm_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v1.l
+; GFX11-NSZ-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-NSZ-TRUE16-NEXT: ; use v0
+; GFX11-NSZ-TRUE16-NEXT: ;;#ASMEND
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_inlineasm_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v1
+; GFX11-NSZ-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-NSZ-FAKE16-NEXT: ; use v0
+; GFX11-NSZ-FAKE16-NEXT: ;;#ASMEND
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mul = fmul half %a, %b
%fneg = fneg half %mul
call void asm sideeffect "; use $0", "v"(half %fneg) #0
@@ -4165,16 +6179,49 @@ define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a,
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fneg_inlineasm_multi_use_src_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e32 v0, v2, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use v1
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: v_fneg_inlineasm_multi_use_src_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-SAFE-TRUE16-NEXT: ; use v1
+; GFX11-SAFE-TRUE16-NEXT: ;;#ASMEND
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: v_fneg_inlineasm_multi_use_src_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-SAFE-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-SAFE-FAKE16-NEXT: ; use v1
+; GFX11-SAFE-FAKE16-NEXT: ;;#ASMEND
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: v_fneg_inlineasm_multi_use_src_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v2.l, v3.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.l
+; GFX11-NSZ-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-NSZ-TRUE16-NEXT: ; use v1
+; GFX11-NSZ-TRUE16-NEXT: ;;#ASMEND
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: v_fneg_inlineasm_multi_use_src_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v2, v3
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-NSZ-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-NSZ-FAKE16-NEXT: ; use v1
+; GFX11-NSZ-FAKE16-NEXT: ;;#ASMEND
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
@@ -4212,14 +6259,41 @@ define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c)
; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: multiuse_fneg_2_vop3_users_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v3, -v0, v1, v2
-; GFX11-NEXT: v_fma_f16 v1, -v0, v2, 2.0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: multiuse_fneg_2_vop3_users_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v1.l, -v0.h, v2.l, 2.0
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: multiuse_fneg_2_vop3_users_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v3, -v0, v1, v2
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v1, -v0, v2, 2.0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: multiuse_fneg_2_vop3_users_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v1.l, v2.l
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v1.l, -v0.h, v2.l, 2.0
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: multiuse_fneg_2_vop3_users_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v3, -v0, v1, v2
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v1, -v0, v2, 2.0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%fma0 = call half @llvm.fma.f16(half %fneg.a, half %b, half %c)
%fma1 = call half @llvm.fma.f16(half %fneg.a, half %c, half 2.0)
@@ -4252,14 +6326,41 @@ define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c)
; VI-NEXT: v_mov_b32_e32 v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: multiuse_fneg_2_vop2_users_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v3, -v0, v1
-; GFX11-NEXT: v_mul_f16_e64 v1, -v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: multiuse_fneg_2_vop2_users_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.h, v1.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: multiuse_fneg_2_vop2_users_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v3, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: multiuse_fneg_2_vop2_users_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.h, v1.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: multiuse_fneg_2_vop2_users_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v3, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v1, -v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg half %a
%mul0 = fmul half %fneg.a, %b
%mul1 = fmul half %fneg.a, %c
@@ -4290,12 +6391,33 @@ define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out,
; VI-NEXT: v_mul_f16_e64 v1, -v2, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: multiuse_fneg_vop2_vop3_users_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_fma_f16 v0, -v2, v3, 2.0
-; GFX11-NEXT: v_mul_f16_e64 v1, -v2, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: multiuse_fneg_vop2_vop3_users_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v2.l, v3.l, 2.0
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v2.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: multiuse_fneg_vop2_vop3_users_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v0, -v2, v3, 2.0
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v1, -v2, v4
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: multiuse_fneg_vop2_vop3_users_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, -v2.l, v3.l, 2.0
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v2.l, v4.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: multiuse_fneg_vop2_vop3_users_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, -v2, v3, 2.0
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v1, -v2, v4
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
@@ -4360,23 +6482,41 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou
; VI-NSZ-NEXT: v_mul_f16_e32 v1, v1, v5
; VI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
-; GFX11-SAFE: ; %bb.0:
-; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SAFE-NEXT: v_fma_f16 v1, v2, v3, 2.0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_mul_f16_e64 v0, -v1, v4
-; GFX11-SAFE-NEXT: v_mul_f16_e64 v1, -v1, v5
-; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
-; GFX11-NSZ: ; %bb.0:
-; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NSZ-NEXT: v_fma_f16 v1, v2, -v3, -2.0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, v1, v4
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v1, v1, v5
-; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: free_fold_src_code_size_cost_use_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.h, v2.l, v3.l, 2.0
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.h, v4.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v1.l, -v0.h, v5.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: free_fold_src_code_size_cost_use_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v1, v2, v3, 2.0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v1, v4
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v1, -v1, v5
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: free_fold_src_code_size_cost_use_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.h, v2.l, -v3.l, -2.0
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.h, v4.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.h, v5.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: free_fold_src_code_size_cost_use_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v1, v2, -v3, -2.0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v1, v4
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v5
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
@@ -4414,13 +6554,37 @@ define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, h
; VI-NEXT: v_fma_f16 v0, -v0, v3, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: one_use_cost_to_fold_into_src_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_trunc_f16_e32 v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_fma_f16 v0, -v0, v3, v4
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: one_use_cost_to_fold_into_src_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: one_use_cost_to_fold_into_src_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_trunc_f16_e32 v0, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v0, -v0, v3, v4
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: one_use_cost_to_fold_into_src_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v3.l, v4.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: one_use_cost_to_fold_into_src_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_trunc_f16_e32 v0, v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, -v0, v3, v4
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
@@ -4456,14 +6620,41 @@ define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ha
; VI-NEXT: v_mul_f16_e32 v1, v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: multi_use_cost_to_fold_into_src:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_trunc_f16_e32 v1, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_fma_f16 v0, -v1, v3, v4
-; GFX11-NEXT: v_mul_f16_e32 v1, v1, v5
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: multi_use_cost_to_fold_into_src:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v2.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v3.l, v4.l
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.h, v5.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: multi_use_cost_to_fold_into_src:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_trunc_f16_e32 v1, v2
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_fma_f16 v0, -v1, v3, v4
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v5
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: multi_use_cost_to_fold_into_src:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v2.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, -v0.h, v3.l, v4.l
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.h, v5.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: multi_use_cost_to_fold_into_src:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_trunc_f16_e32 v1, v2
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v0, -v1, v3, v4
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v5
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%out.gep = getelementptr inbounds half, ptr addrspace(1) %out, i64 %tid.ext
@@ -4561,11 +6752,29 @@ define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 {
; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: nnan_fmul_neg1_to_fneg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: nnan_fmul_neg1_to_fneg:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: nnan_fmul_neg1_to_fneg:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: nnan_fmul_neg1_to_fneg:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: nnan_fmul_neg1_to_fneg:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mul = fmul half %x, -1.0
%add = fmul nnan half %mul, %y
ret half %add
@@ -4590,11 +6799,29 @@ define half @denormal_fmul_neg1_to_fneg(half %x, half %y) {
; VI-NEXT: v_mul_f16_e64 v0, -v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: denormal_fmul_neg1_to_fneg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: denormal_fmul_neg1_to_fneg:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: denormal_fmul_neg1_to_fneg:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: denormal_fmul_neg1_to_fneg:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: denormal_fmul_neg1_to_fneg:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mul = fmul nnan half %x, -1.0
%add = fmul half %mul, %y
ret half %add
@@ -4621,13 +6848,37 @@ define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) {
; VI-NEXT: v_mul_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: denorm_snan_fmul_neg1_to_fneg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v0, v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: denorm_snan_fmul_neg1_to_fneg:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: denorm_snan_fmul_neg1_to_fneg:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: denorm_snan_fmul_neg1_to_fneg:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: denorm_snan_fmul_neg1_to_fneg:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e64 v0, v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%canonical = fmul half %x, %x
%mul = fmul half %canonical, -1.0
%add = fmul half %mul, %y
@@ -4652,13 +6903,37 @@ define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 {
; VI-NEXT: v_mul_f16_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flush_snan_fmul_neg1_to_fneg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v0, -v0, -v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: flush_snan_fmul_neg1_to_fneg:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: flush_snan_fmul_neg1_to_fneg:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: flush_snan_fmul_neg1_to_fneg:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: flush_snan_fmul_neg1_to_fneg:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_max_f16_e64 v0, -v0, -v0
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%quiet = call half @llvm.canonicalize.f16(half %x)
%mul = fmul half %quiet, -1.0
%add = fmul half %mul, %y
@@ -4688,14 +6963,41 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
; VI-NEXT: v_sub_f16_e32 v0, v3, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: fadd_select_fneg_fneg_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SAFE-TRUE16-LABEL: fadd_select_fneg_fneg_f16:
+; GFX11-SAFE-TRUE16: ; %bb.0:
+; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-FAKE16-LABEL: fadd_select_fneg_fneg_f16:
+; GFX11-SAFE-FAKE16: ; %bb.0:
+; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-TRUE16-LABEL: fadd_select_fneg_fneg_f16:
+; GFX11-NSZ-TRUE16: ; %bb.0:
+; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
+; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l
+; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-NSZ-FAKE16-LABEL: fadd_select_fneg_fneg_f16:
+; GFX11-NSZ-FAKE16: ; %bb.0:
+; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0
+; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %arg0, 0
%neg.x = fneg half %x
%neg.y = fneg half %y
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d6f6d440f9a83..9642b36ecb7e8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,CI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefixes=CIVI,VI %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, half %y) {
; CI-LABEL: fneg_fabs_fadd_f16:
@@ -46,18 +47,33 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_fabs_fadd_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_sub_f16_e64 v1, s3, |s2|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_fabs_fadd_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_sub_f16_e64 v0.l, v0.l, |v0.h|
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_fabs_fadd_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_sub_f16_e64 v1, s3, |s2|
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %x)
%fsub = fsub half -0.0, %fabs
%fadd = fadd half %y, %fsub
@@ -108,18 +124,33 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_fabs_fmul_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mul_f16_e64 v1, s3, -|s2|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_fabs_fmul_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, v0.l, -|v0.h|
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_fabs_fmul_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v1, s3, -|s2|
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %x)
%fsub = fsub half -0.0, %fabs
%fmul = fmul half %y, %fsub
@@ -166,17 +197,30 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_fabs_free_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_fabs_free_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_fabs_free_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%bc = bitcast i16 %in to half
%fabs = call half @llvm.fabs.f16(half %bc)
%fsub = fsub half -0.0, %fabs
@@ -220,17 +264,30 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: fneg_fabs_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fneg_fabs_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fneg_fabs_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_bitset1_b32 s2, 15
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fabs = call half @llvm.fabs.f16(half %in)
%fsub = fsub half -0.0, %fabs
store half %fsub, ptr addrspace(1) %out, align 2
@@ -263,16 +320,27 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fneg_fabs_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fneg_fabs_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fneg_fabs_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %in, align 2
%fabs = call half @llvm.fabs.f16(half %val)
%fsub = fsub half -0.0, %fabs
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index f904d6be80568..23e4ba9fd4ed7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,CI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,GFX8 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=+real-true16 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mcpu=gfx1100 -mattr=-real-true16 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; FIXME: Should be able to do scalar op
define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
@@ -41,17 +42,30 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fneg_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%fneg = fsub half -0.0, %in
store half %fneg, ptr addrspace(1) %out
ret void
@@ -99,18 +113,31 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fneg_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fneg_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fneg_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.in = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr inbounds half, ptr addrspace(1) %in, i32 %tid
@@ -156,17 +183,30 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: s_fneg_free_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x8000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_fneg_free_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_fneg_free_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_xor_b32 s2, s2, 0x8000
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%bc = bitcast i16 %in to half
%fsub = fsub half -0.0, %bc
store half %fsub, ptr addrspace(1) %out
@@ -216,16 +256,27 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_fneg_fold_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mul_f16_e64 v1, -v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_fneg_fold_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, -v0.l, v0.l
+; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_fneg_fold_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_mul_f16_e64 v1, -v1, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load half, ptr addrspace(1) %in
%fsub = fsub half -0.0, %val
%fmul = fmul half %fsub, %val
@@ -572,22 +623,39 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
-; GFX11-LABEL: v_extract_fneg_fold_v2f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_mul_f16_e32 v0, -4.0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_sub_f16_e32 v1, 2.0, v1
-; GFX11-NEXT: global_store_b16 v[0:1], v0, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v[0:1], v1, off dlc
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_extract_fneg_fold_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, -4.0, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v0.h, 2.0, v1.l
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc
+; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_extract_fneg_fold_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, -4.0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v1, 2.0, v1
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v1, off dlc
+; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-FAKE16-NEXT: s_endpgm
%val = load <2 x half>, ptr addrspace(1) %in
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
%elt0 = extractelement <2 x half> %fneg, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index 123b43cf76143..a764681645c42 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -2,7 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @fsub_f16(
; SI-LABEL: fsub_f16:
@@ -54,29 +55,53 @@ define amdgpu_kernel void @fsub_f16(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fsub_f16:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s6, s10
-; GFX11-NEXT: s_mov_b32 s7, s11
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
-; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fsub_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fsub_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b) {
@@ -127,23 +152,41 @@ define amdgpu_kernel void @fsub_f16_imm_a(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fsub_f16_imm_a:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fsub_f16_imm_a:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: v_sub_f16_e32 v0.l, 1.0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fsub_f16_imm_a:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: v_sub_f16_e32 v0, 1.0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b) {
entry:
@@ -192,23 +235,41 @@ define amdgpu_kernel void @fsub_f16_imm_b(
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
-; GFX11-LABEL: fsub_f16_imm_b:
-; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: fsub_f16_imm_b:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -2.0, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: fsub_f16_imm_b:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, -2.0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index a46d629c02b85..96533bda8d07e 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX9
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=GFX11
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-TRUE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX12,GFX12-FAKE16
define bfloat @v_uitofp_i1_to_bf16(i1 %num) {
; GFX7-LABEL: v_uitofp_i1_to_bf16:
@@ -185,65 +187,129 @@ define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_uitofp_v2i1_to_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_uitofp_v2i1_to_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v4, v1, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <2 x i1> %num to <2 x bfloat>
ret <2 x bfloat> %op
}
@@ -423,89 +489,176 @@ define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
; GFX9-NEXT: v_alignbit_b32 v1, s0, v2, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_uitofp_v3i1_to_v3bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11-NEXT: v_alignbit_b32 v1, s0, v2, 16
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_v3i1_to_v3bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_uitofp_v3i1_to_v3bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_alignbit_b32 v1, s0, v2, 16
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_uitofp_v3i1_to_v3bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_uitofp_v3i1_to_v3bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i1> %num to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -741,109 +894,214 @@ define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) {
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_uitofp_v4i1_to_v4bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_uitofp_v4i1_to_v4bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_uitofp_v4i1_to_v4bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
-; GFX12-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_uitofp_v4i1_to_v4bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_uitofp_v4i1_to_v4bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_uitofp_v4i1_to_v4bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i1> %num to <4 x bfloat>
ret <4 x bfloat> %op
}
@@ -1238,65 +1496,129 @@ define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) {
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_sitofp_v2i1_to_v2bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_sitofp_v2i1_to_v2bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_sitofp_v2i1_to_v2bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_sitofp_v2i1_to_v2bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: v_add3_u32 v2, v4, v1, 0x7fff
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_sitofp_v2i1_to_v2bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <2 x i1> %num to <2 x bfloat>
ret <2 x bfloat> %op
}
@@ -1476,89 +1798,176 @@ define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) {
; GFX9-NEXT: v_alignbit_b32 v1, s0, v2, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_sitofp_v3i1_to_v3bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11-NEXT: v_alignbit_b32 v1, s0, v2, 16
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_sitofp_v3i1_to_v3bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_sitofp_v3i1_to_v3bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX12-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_alignbit_b32 v1, s0, v2, 16
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX11-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_sitofp_v3i1_to_v3bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_sitofp_v3i1_to_v3bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v1, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i1> %num to <3 x bfloat>
ret <3 x bfloat> %op
}
@@ -1791,109 +2200,214 @@ define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) {
; GFX9-NEXT: v_perm_b32 v1, v3, v2, s0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_sitofp_v4i1_to_v4bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
-; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: v_sitofp_v4i1_to_v4bf16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX12-LABEL: v_sitofp_v4i1_to_v4bf16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX12-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX12-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
-; GFX12-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
-; GFX12-NEXT: v_or_b32_e32 v4, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX12-NEXT: s_wait_alu 0xfffd
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
-; GFX12-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX11-FAKE16-LABEL: v_sitofp_v4i1_to_v4bf16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX11-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
+; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
+; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-TRUE16-LABEL: v_sitofp_v4i1_to_v4bf16:
+; GFX12-TRUE16: ; %bb.0:
+; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v9, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
+; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc_lo
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd
+; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1
+; GFX12-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v2, v3
+; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-FAKE16-LABEL: v_sitofp_v4i1_to_v4bf16:
+; GFX12-FAKE16: ; %bb.0:
+; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX12-FAKE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX12-FAKE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-FAKE16-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX12-FAKE16-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v2, v4, v7, vcc_lo
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-FAKE16-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
+; GFX12-FAKE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
+; GFX12-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
+; GFX12-FAKE16-NEXT: s_wait_alu 0xfffd
+; GFX12-FAKE16-NEXT: v_cndmask_b32_e32 v3, v5, v9, vcc_lo
+; GFX12-FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
+; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i1> %num to <4 x bfloat>
ret <4 x bfloat> %op
}
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index a2cc427bf6e54..bc4a8634dbe50 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
@@ -639,17 +640,31 @@ define amdgpu_kernel void @add_inline_imm_0.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_0.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_0.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l ; encoding: [0x80,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_0.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x00,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_0.0_f16:
; VI: ; %bb.0:
@@ -693,17 +708,31 @@ define amdgpu_kernel void @add_inline_imm_0.5_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_0.5_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_0.5_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_0.5_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe0,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_0.5_f16:
; VI: ; %bb.0:
@@ -747,17 +776,31 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_neg_0.5_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_0.5_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -0.5, v0.l ; encoding: [0xf1,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_0.5_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -0.5 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe2,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_0.5_f16:
; VI: ; %bb.0:
@@ -801,17 +844,31 @@ define amdgpu_kernel void @add_inline_imm_1.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_1.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_1.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l ; encoding: [0xf2,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_1.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe4,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_1.0_f16:
; VI: ; %bb.0:
@@ -855,17 +912,31 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_neg_1.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_1.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -1.0, v0.l ; encoding: [0xf3,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_1.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -1.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe6,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_1.0_f16:
; VI: ; %bb.0:
@@ -909,17 +980,31 @@ define amdgpu_kernel void @add_inline_imm_2.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_2.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_2.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2.0, v0.l ; encoding: [0xf4,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_2.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xe8,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_2.0_f16:
; VI: ; %bb.0:
@@ -963,17 +1048,31 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_neg_2.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_2.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -2.0, v0.l ; encoding: [0xf5,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_2.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -2.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xea,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_2.0_f16:
; VI: ; %bb.0:
@@ -1017,17 +1116,31 @@ define amdgpu_kernel void @add_inline_imm_4.0_f16(ptr addrspace(1) %out, half %x
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_4.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_4.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 4.0, v0.l ; encoding: [0xf6,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_4.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xec,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_4.0_f16:
; VI: ; %bb.0:
@@ -1071,17 +1184,31 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(ptr addrspace(1) %out, hal
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_neg_4.0_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_4.0_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, -4.0, v0.l ; encoding: [0xf7,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_4.0_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, -4.0 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0xee,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_neg_4.0_f16:
; VI: ; %bb.0:
@@ -1131,23 +1258,41 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out,
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: commute_add_inline_imm_0.5_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
-; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
-; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
-; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
-; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
-; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: commute_add_inline_imm_0.5_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: commute_add_inline_imm_0.5_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x64]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: commute_add_inline_imm_0.5_f16:
; VI: ; %bb.0:
@@ -1211,23 +1356,41 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad
; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: commute_add_literal_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
-; GFX11-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
-; GFX11-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
-; GFX11-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
-; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
-; GFX11-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
-; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: commute_add_literal_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0x6400, v0.l ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: commute_add_literal_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: commute_add_literal_f16:
; VI: ; %bb.0:
@@ -1285,17 +1448,31 @@ define amdgpu_kernel void @add_inline_imm_1_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_1_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_1_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1, v0.l ; encoding: [0x81,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_1_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 1 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x02,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_1_f16:
; VI: ; %bb.0:
@@ -1339,17 +1516,31 @@ define amdgpu_kernel void @add_inline_imm_2_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_2_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_2_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 2, v0.l ; encoding: [0x82,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_2_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 2 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x04,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_2_f16:
; VI: ; %bb.0:
@@ -1393,17 +1584,31 @@ define amdgpu_kernel void @add_inline_imm_16_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_16_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_16_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 16, v0.l ; encoding: [0x90,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_16_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 16 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x20,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_16_f16:
; VI: ; %bb.0:
@@ -1684,17 +1889,31 @@ define amdgpu_kernel void @add_inline_imm_63_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_63_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_63_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 63, v0.l ; encoding: [0xbf,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_63_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 63 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x7e,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_63_f16:
; VI: ; %bb.0:
@@ -1738,17 +1957,31 @@ define amdgpu_kernel void @add_inline_imm_64_f16(ptr addrspace(1) %out, half %x)
; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
-; GFX11-LABEL: add_inline_imm_64_f16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00]
-; GFX11-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: add_inline_imm_64_f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 ; encoding: [0x02,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 64, v0.l ; encoding: [0xc0,0x00,0x00,0x64]
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_64_f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, 64 ; encoding: [0x00,0x00,0x32,0xd5,0x02,0x80,0x01,0x00]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
;
; VI-LABEL: add_inline_imm_64_f16:
; VI: ; %bb.0:
@@ -1789,12 +2022,19 @@ define void @mul_inline_imm_0.5_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: mul_inline_imm_0.5_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: mul_inline_imm_0.5_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x3800, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_0.5_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x3800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x38,0x00,0x00]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_0.5_i16:
; VI: ; %bb.0:
@@ -1829,12 +2069,19 @@ define void @mul_inline_imm_neg_0.5_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: mul_inline_imm_neg_0.5_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: mul_inline_imm_neg_0.5_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0xb800, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_neg_0.5_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0xb800, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xb8,0xff,0xff]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_neg_0.5_i16:
; VI: ; %bb.0:
@@ -1869,12 +2116,19 @@ define void @mul_inline_imm_1.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: mul_inline_imm_1.0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: mul_inline_imm_1.0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x3c00, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_1.0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x3c00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x3c,0x00,0x00]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_1.0_i16:
; VI: ; %bb.0:
@@ -1909,12 +2163,19 @@ define void @mul_inline_imm_neg_1.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: mul_inline_imm_neg_1.0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: mul_inline_imm_neg_1.0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0xbc00, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_neg_1.0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0xbc00, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xbc,0xff,0xff]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_neg_1.0_i16:
; VI: ; %bb.0:
@@ -1949,12 +2210,19 @@ define void @shl_inline_imm_2.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: shl_inline_imm_2.0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: shl_inline_imm_2.0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, v2.l, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: shl_inline_imm_2.0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v2, v2, 0x4000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0x40,0x00,0x00]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: shl_inline_imm_2.0_i16:
; VI: ; %bb.0:
@@ -1989,12 +2257,19 @@ define void @shl_inline_imm_neg_2.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: shl_inline_imm_neg_2.0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: shl_inline_imm_neg_2.0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, v2.l, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: shl_inline_imm_neg_2.0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_lshlrev_b16 v2, v2, 0xc000 ; encoding: [0x02,0x00,0x38,0xd7,0x02,0xff,0x01,0x00,0x00,0xc0,0xff,0xff]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: shl_inline_imm_neg_2.0_i16:
; VI: ; %bb.0:
@@ -2029,12 +2304,19 @@ define void @mul_inline_imm_4.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: mul_inline_imm_4.0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: mul_inline_imm_4.0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x4400, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_4.0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x4400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0x44,0x00,0x00]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_4.0_i16:
; VI: ; %bb.0:
@@ -2069,12 +2351,19 @@ define void @mul_inline_imm_neg_4.0_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: mul_inline_imm_neg_4.0_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: mul_inline_imm_neg_4.0_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0xc400, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_neg_4.0_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0xc400, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x00,0xc4,0xff,0xff]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_neg_4.0_i16:
; VI: ; %bb.0:
@@ -2109,12 +2398,19 @@ define void @mul_inline_imm_inv2pi_i16(ptr addrspace(1) %out, i16 %x) {
; GFX10-NEXT: global_store_short v[0:1], v2, off ; encoding: [0x00,0x80,0x68,0xdc,0x00,0x02,0x7d,0x00]
; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
;
-; GFX11-LABEL: mul_inline_imm_inv2pi_i16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
-; GFX11-NEXT: v_mul_lo_u16 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
-; GFX11-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
-; GFX11-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+; GFX11-TRUE16-LABEL: mul_inline_imm_inv2pi_i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_mul_lo_u16 v2.l, 0x3118, v2.l ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
+; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_inv2pi_i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_mul_lo_u16 v2, 0x3118, v2 ; encoding: [0x02,0x00,0x05,0xd7,0xff,0x04,0x02,0x00,0x18,0x31,0x00,0x00]
+; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v2, off ; encoding: [0x00,0x00,0x66,0xdc,0x00,0x02,0x7c,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
;
; VI-LABEL: mul_inline_imm_inv2pi_i16:
; VI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index 342d7b0237118..4848e087467f2 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -1,4 +1,6 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global,-xnack -verify-machineinstrs -show-mc-encoding < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
@@ -9,6 +11,55 @@
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_neg_0.0_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_neg_0.0_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_neg_0.0_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_neg_0.0_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_neg_0.0_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x i16> <i16 -32768, i16 -32768>, ptr addrspace(1) %out
ret void
}
@@ -17,6 +68,55 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(ptr addrspace(1) %out)
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_0.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_0.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_0.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_0.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_0.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_0.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 0.0, half 0.0>, ptr addrspace(1) %out
ret void
}
@@ -25,6 +125,55 @@ define amdgpu_kernel void @store_inline_imm_0.0_v2f16(ptr addrspace(1) %out) #0
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_imm_neg_0.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_imm_neg_0.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_imm_neg_0.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_imm_neg_0.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_imm_neg_0.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_imm_neg_0.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x80008000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0x00,0x80]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half -0.0, half -0.0>, ptr addrspace(1) %out
ret void
}
@@ -33,6 +182,55 @@ define amdgpu_kernel void @store_imm_neg_0.0_v2f16(ptr addrspace(1) %out) #0 {
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_0.5_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_0.5_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x38003800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x38]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_0.5_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x38003800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x38]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_0.5_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x38003800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x38]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_0.5_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x38003800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x38]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_0.5_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x38003800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x38]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 0.5, half 0.5>, ptr addrspace(1) %out
ret void
}
@@ -41,6 +239,55 @@ define amdgpu_kernel void @store_inline_imm_0.5_v2f16(ptr addrspace(1) %out) #0
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_m_0.5_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0xb800b800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0x00,0xb8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_0.5_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xb800b800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0x00,0xb8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_m_0.5_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb800b800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0x00,0xb8]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_m_0.5_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xb800b800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0x00,0xb8]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_m_0.5_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0xb800b800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0x00,0xb8]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half -0.5, half -0.5>, ptr addrspace(1) %out
ret void
}
@@ -49,6 +296,55 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(ptr addrspace(1) %out) #
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_1.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_1.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x3c]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_1.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x3c]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_1.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x3c]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_1.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x3c]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_1.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x3c003c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x3c]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 1.0, half 1.0>, ptr addrspace(1) %out
ret void
}
@@ -57,6 +353,55 @@ define amdgpu_kernel void @store_inline_imm_1.0_v2f16(ptr addrspace(1) %out) #0
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_m_1.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0x00,0xbc]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_1.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0x00,0xbc]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_m_1.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0x00,0xbc]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_m_1.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0x00,0xbc]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_m_1.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0x00,0xbc]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half -1.0, half -1.0>, ptr addrspace(1) %out
ret void
}
@@ -65,6 +410,55 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(ptr addrspace(1) %out) #
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_2.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_2.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x40004000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x40]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_2.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x40004000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x40]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_2.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x40004000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x40]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_2.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x40004000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x40]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_2.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x40004000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x40]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 2.0, half 2.0>, ptr addrspace(1) %out
ret void
}
@@ -73,6 +467,55 @@ define amdgpu_kernel void @store_inline_imm_2.0_v2f16(ptr addrspace(1) %out) #0
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_m_2.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0xc000c000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0x00,0xc0]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_2.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xc000c000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0x00,0xc0]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_m_2.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xc000c000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0x00,0xc0]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_m_2.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xc000c000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0x00,0xc0]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_m_2.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0xc000c000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0x00,0xc0]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half -2.0, half -2.0>, ptr addrspace(1) %out
ret void
}
@@ -81,6 +524,55 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(ptr addrspace(1) %out) #
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_4.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_4.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x44004400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x44]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_4.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x44004400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x44]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_4.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x44004400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x44]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_4.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x44004400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x44]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_4.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x44004400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x44]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 4.0, half 4.0>, ptr addrspace(1) %out
ret void
}
@@ -89,6 +581,55 @@ define amdgpu_kernel void @store_inline_imm_4.0_v2f16(ptr addrspace(1) %out) #0
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_m_4.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0xc400c400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0x00,0xc4]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_4.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xc400c400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0x00,0xc4]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_m_4.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xc400c400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0x00,0xc4]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_m_4.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xc400c400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0x00,0xc4]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_m_4.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0xc400c400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0x00,0xc4]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half -4.0, half -4.0>, ptr addrspace(1) %out
ret void
}
@@ -97,6 +638,55 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(ptr addrspace(1) %out) #
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_inv_2pi_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x31183118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x18,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_inv_2pi_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x31183118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x18,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_inv_2pi_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x31183118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x18,0x31]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_inv_2pi_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x31183118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x18,0x31]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_inv_2pi_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x31183118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x18,0x31]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 0xH3118, half 0xH3118>, ptr addrspace(1) %out
ret void
}
@@ -105,6 +695,55 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(ptr addrspace(1) %out)
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118 ; encoding
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_inline_imm_m_inv_2pi_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0xb118b118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0x18,0xb1]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_inv_2pi_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0xb118b118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0x18,0xb1]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_inline_imm_m_inv_2pi_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb118b118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0x18,0xb1]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_inline_imm_m_inv_2pi_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0xb118b118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0x18,0xb1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_inline_imm_m_inv_2pi_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0xb118b118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0x18,0xb1]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 0xHB118, half 0xHB118>, ptr addrspace(1) %out
ret void
}
@@ -113,6 +752,55 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(ptr addrspace(1) %ou
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
; GCN: buffer_store_{{dword|b32}} [[REG]]
define amdgpu_kernel void @store_literal_imm_v2f16(ptr addrspace(1) %out) #0 {
+; GFX11-TRUE16-LABEL: store_literal_imm_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0x6c006c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x6c]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_literal_imm_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0x6c006c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x6c]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: store_literal_imm_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x6c006c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x6c]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: store_literal_imm_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x6c006c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x6c]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: store_literal_imm_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: v_mov_b32_e32 v0, 0x6c006c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x6c]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
store <2 x half> <half 4096.0, half 4096.0>, ptr addrspace(1) %out
ret void
}
@@ -133,6 +821,68 @@ define amdgpu_kernel void @store_literal_imm_v2f16(ptr addrspace(1) %out) #0 {
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_0.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 0 ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x00,0x01,0x18]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_0.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 0 ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x00,0x01,0x18]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_0.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 0 ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x00,0x01,0x18]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_0.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 0 ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0x00,0x01,0x18]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_0.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0x00,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 0.0, half 0.0>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -158,6 +908,68 @@ define amdgpu_kernel void @add_inline_imm_0.0_v2f16(ptr addrspace(1) %out, <2 x
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_0.5_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_0.5_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe0,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_0.5_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe0,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_0.5_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe0,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_0.5_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xe0,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_0.5_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 0.5 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xe0,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 0.5, half 0.5>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -183,6 +995,68 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(ptr addrspace(1) %out, <2 x
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_0.5_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe2,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_0.5_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe2,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_neg_0.5_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe2,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_neg_0.5_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, -0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xe2,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_neg_0.5_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0xb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, -0.5 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xe2,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half -0.5, half -0.5>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -208,6 +1082,68 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(ptr addrspace(1) %out, <
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_1.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe4,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_1.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe4,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_1.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe4,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_1.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xe4,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_1.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 1.0 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xe4,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 1.0, half 1.0>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -234,6 +1170,68 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(ptr addrspace(1) %out, <2 x
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_1.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, -1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe6,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_1.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, -1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe6,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_neg_1.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, -1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe6,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_neg_1.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, -1.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xe6,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_neg_1.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0xbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, -1.0 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xe6,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half -1.0, half -1.0>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -259,6 +1257,68 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(ptr addrspace(1) %out, <
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_2.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe8,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_2.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe8,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_2.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xe8,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_2.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xe8,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_2.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 2.0 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xe8,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 2.0, half 2.0>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -284,6 +1344,68 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(ptr addrspace(1) %out, <2 x
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_2.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, -2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xea,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_2.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, -2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xea,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_neg_2.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, -2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xea,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_neg_2.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, -2.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xea,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_neg_2.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0xc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, -2.0 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xea,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half -2.0, half -2.0>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -309,6 +1431,68 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(ptr addrspace(1) %out, <
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_4.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_4.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xec,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_4.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xec,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_4.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xec,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_4.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xec,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_4.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 4.0 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xec,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 4.0, half 4.0>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -334,6 +1518,68 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(ptr addrspace(1) %out, <2 x
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_4.0_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, -4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xee,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_4.0_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, -4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xee,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_neg_4.0_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, -4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0xee,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_neg_4.0_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, -4.0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0xee,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_neg_4.0_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 0xc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, -4.0 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0xee,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half -4.0, half -4.0>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -356,6 +1602,98 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(ptr addrspace(1) %out, <
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GFX11-TRUE16-LABEL: commute_add_inline_imm_0.5_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v0, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x00,0xe1,0x01,0x08]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: commute_add_inline_imm_0.5_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, v0, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x00,0xe1,0x01,0x08]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: commute_add_inline_imm_0.5_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe]
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe]
+; GFX10-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x03,0x8b,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x03,0x88,0xbe]
+; GFX10-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x03,0x89,0xbe]
+; GFX10-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x03,0x84,0xbe]
+; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x00,0x02,0x80]
+; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
+; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, v0, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x00,0xe1,0x01,0x08]
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x01,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: commute_add_inline_imm_0.5_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX9-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX9-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX9-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; GFX9-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX9-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, 0.5 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0xe1,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x01,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: commute_add_inline_imm_0.5_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; VI-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; VI-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; VI-NEXT: v_mov_b32_e32 v1, 0x3800 ; encoding: [0xff,0x02,0x02,0x7e,0x00,0x38,0x00,0x00]
+; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x3e,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_add_f16_e32 v0, 0.5, v0 ; encoding: [0xf0,0x00,0x00,0x3e]
+; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x01,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%x = load <2 x half>, ptr addrspace(1) %in
%y = fadd <2 x half> %x, <half 0.5, half 0.5>
store <2 x half> %y, ptr addrspace(1) %out
@@ -379,6 +1717,99 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(ptr addrspace(1) %ou
; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: buffer_store_dword
define amdgpu_kernel void @commute_add_literal_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; GFX11-TRUE16-LABEL: commute_add_literal_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, 0x6400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x0f,0xcc,0xff,0x00,0x02,0x10,0x00,0x64,0x00,0x00]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: commute_add_literal_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; encoding: [0x00,0x00,0x08,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, 0x6400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x0f,0xcc,0xff,0x00,0x02,0x10,0x00,0x64,0x00,0x00]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x01,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: commute_add_literal_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x08,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x03,0x86,0xbe]
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000 ; encoding: [0xff,0x03,0x87,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x03,0x8a,0xbe]
+; GFX10-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x03,0x8b,0xbe]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x03,0x88,0xbe]
+; GFX10-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x03,0x89,0xbe]
+; GFX10-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x03,0x84,0xbe]
+; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x30,0xe0,0x00,0x00,0x02,0x80]
+; GFX10-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x03,0x85,0xbe]
+; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, 0x6400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x0f,0xcc,0xff,0x00,0x02,0x10,0x00,0x64,0x00,0x00]
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x01,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: commute_add_literal_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; GFX9-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; GFX9-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; GFX9-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; GFX9-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; GFX9-NEXT: s_movk_i32 s0, 0x6400 ; encoding: [0x00,0x64,0x00,0xb0]
+; GFX9-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; GFX9-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x00,0x01,0x00,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x01,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: commute_add_literal_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; encoding: [0x02,0x00,0x0a,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; encoding: [0xff,0x00,0x87,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s6, -1 ; encoding: [0xc1,0x00,0x86,0xbe]
+; VI-NEXT: s_mov_b32 s10, s6 ; encoding: [0x06,0x00,0x8a,0xbe]
+; VI-NEXT: s_mov_b32 s11, s7 ; encoding: [0x07,0x00,0x8b,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
+; VI-NEXT: s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
+; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x50,0xe0,0x00,0x00,0x02,0x80]
+; VI-NEXT: v_mov_b32_e32 v1, 0x6400 ; encoding: [0xff,0x02,0x02,0x7e,0x00,0x64,0x00,0x00]
+; VI-NEXT: s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
+; VI-NEXT: s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
+; VI-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x0f,0x8c,0xbf]
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x02,0x02,0x3e,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_add_f16_e32 v0, 0x6400, v0 ; encoding: [0xff,0x00,0x00,0x3e,0x00,0x64,0x00,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v0, v1 ; encoding: [0x00,0x03,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x01,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%x = load <2 x half>, ptr addrspace(1) %in
%y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
store <2 x half> %y, ptr addrspace(1) %out
@@ -405,6 +1836,68 @@ define amdgpu_kernel void @commute_add_literal_v2f16(ptr addrspace(1) %out, ptr
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_1_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_1_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x02,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_1_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x02,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_1_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x02,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_1_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 1 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0x02,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_1_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 1 ; encoding: [0x81,0x02,0x00,0x7e]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 1 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0x02,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -431,6 +1924,68 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(ptr addrspace(1) %out, <2 x ha
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_2_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_2_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 2 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x04,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_2_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 2 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x04,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_2_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 2 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x04,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_2_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 2 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0x04,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_2_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 2 ; encoding: [0x82,0x02,0x00,0x7e]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 2 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0x04,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -457,6 +2012,68 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(ptr addrspace(1) %out, <2 x ha
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_16_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_16_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 16 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x20,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_16_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 16 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x20,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_16_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 16 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x20,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_16_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 16 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0x20,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_16_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 16 ; encoding: [0x90,0x02,0x00,0x7e]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 16 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0x20,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -476,6 +2093,70 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(ptr addrspace(1) %out, <2 x h
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_1_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, -1 ; encoding: [0x02,0xc1,0x02,0x81]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_1_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, -1 ; encoding: [0x02,0xc1,0x02,0x81]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_neg_1_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: s_add_i32 s2, s2, -1 ; encoding: [0x02,0xc1,0x02,0x81]
+; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_neg_1_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: s_add_i32 s4, s6, -1 ; encoding: [0x06,0xc1,0x04,0x81]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_neg_1_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_add_i32 s4, s6, -1 ; encoding: [0x06,0xc1,0x04,0x81]
+; VI-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%xbc = bitcast <2 x half> %x to i32
%y = add i32 %xbc, -1
%ybc = bitcast i32 %y to <2 x half>
@@ -497,6 +2178,70 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(ptr addrspace(1) %out, <2
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_2_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 0xfffefffe ; encoding: [0x02,0xff,0x02,0x81,0xfe,0xff,0xfe,0xff]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_2_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 0xfffefffe ; encoding: [0x02,0xff,0x02,0x81,0xfe,0xff,0xfe,0xff]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_neg_2_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: s_add_i32 s2, s2, 0xfffefffe ; encoding: [0x02,0xff,0x02,0x81,0xfe,0xff,0xfe,0xff]
+; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_neg_2_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: s_add_i32 s4, s6, 0xfffefffe ; encoding: [0x06,0xff,0x04,0x81,0xfe,0xff,0xfe,0xff]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_neg_2_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_add_i32 s4, s6, 0xfffefffe ; encoding: [0x06,0xff,0x04,0x81,0xfe,0xff,0xfe,0xff]
+; VI-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%xbc = bitcast <2 x half> %x to i32
%y = add i32 %xbc, 4294901758 ; 0xfffefffe
%ybc = bitcast i32 %y to <2 x half>
@@ -519,6 +2264,70 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(ptr addrspace(1) %out, <2
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], [[ADD]]
; VI: buffer_store_dword [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_neg_16_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 0xfff0fff0 ; encoding: [0x02,0xff,0x02,0x81,0xf0,0xff,0xf0,0xff]
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf]
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_neg_16_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 0xfff0fff0 ; encoding: [0x02,0xff,0x02,0x81,0xf0,0xff,0xf0,0xff]
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; encoding: [0x09,0x00,0x87,0xbf]
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_neg_16_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: s_add_i32 s2, s2, 0xfff0fff0 ; encoding: [0x02,0xff,0x02,0x81,0xf0,0xff,0xf0,0xff]
+; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_neg_16_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: s_add_i32 s4, s6, 0xfff0fff0 ; encoding: [0x06,0xff,0x04,0x81,0xf0,0xff,0xf0,0xff]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_neg_16_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_add_i32 s4, s6, 0xfff0fff0 ; encoding: [0x06,0xff,0x04,0x81,0xf0,0xff,0xf0,0xff]
+; VI-NEXT: v_mov_b32_e32 v0, s4 ; encoding: [0x04,0x02,0x00,0x7e]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%xbc = bitcast <2 x half> %x to i32
%y = add i32 %xbc, 4293984240 ; 0xfff0fff0
%ybc = bitcast i32 %y to <2 x half>
@@ -546,6 +2355,68 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(ptr addrspace(1) %out, <2
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_63_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_63_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 63 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x7e,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_63_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 63 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x7e,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_63_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 63 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x7e,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_63_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 63 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0x7e,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_63_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 63 ; encoding: [0xbf,0x02,0x00,0x7e]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 63 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0x7e,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -571,6 +2442,68 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(ptr addrspace(1) %out, <2 x h
; VI: v_or_b32
; VI: buffer_store_dword
define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
+; GFX11-TRUE16-LABEL: add_inline_imm_64_v2f16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, s2, 64 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x80,0x01,0x08]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: add_inline_imm_64_v2f16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[0:1], 0x8 ; encoding: [0x80,0x00,0x00,0xf4,0x08,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; encoding: [0x00,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, s2, 64 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x80,0x01,0x08]
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX10-LABEL: add_inline_imm_64_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0xa1,0xbf]
+; GFX10-NEXT: s_load_dword s2, s[4:5], 0x8 ; encoding: [0x82,0x00,0x00,0xf4,0x08,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xfa]
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x03,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX10-NEXT: v_pk_add_f16 v0, s2, 64 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x0f,0xcc,0x02,0x80,0x01,0x08]
+; GFX10-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x03,0x82,0xbe]
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; GFX9-LABEL: add_inline_imm_64_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x00]
+; GFX9-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
+; GFX9-NEXT: v_pk_add_f16 v0, s6, 64 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x8f,0xd3,0x06,0x80,0x01,0x08]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; GFX9-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
+;
+; VI-LABEL: add_inline_imm_64_v2f16:
+; VI: ; %bb.0:
+; VI-NEXT: s_load_dword s6, s[4:5], 0x8 ; encoding: [0x82,0x01,0x02,0xc0,0x08,0x00,0x00,0x00]
+; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x06,0xc0,0x00,0x00,0x00,0x00]
+; VI-NEXT: v_mov_b32_e32 v0, 64 ; encoding: [0xc0,0x02,0x00,0x7e]
+; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0xf0,0x00,0x11]
+; VI-NEXT: s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0x00,0x8c,0xbf]
+; VI-NEXT: s_lshr_b32 s4, s6, 16 ; encoding: [0x06,0x90,0x04,0x8f]
+; VI-NEXT: v_mov_b32_e32 v1, s4 ; encoding: [0x04,0x02,0x02,0x7e]
+; VI-NEXT: v_add_f16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; encoding: [0xf9,0x00,0x00,0x3e,0x01,0x05,0x06,0x06]
+; VI-NEXT: v_add_f16_e64 v1, s6, 64 ; encoding: [0x01,0x00,0x1f,0xd1,0x06,0x80,0x01,0x00]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x80]
+; VI-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
%y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
store <2 x half> %y, ptr addrspace(1) %out
ret void
@@ -582,6 +2515,39 @@ define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x h
; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00]
define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: mul_inline_imm_0.5_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x38,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_0.5_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x38,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: mul_inline_imm_0.5_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x38,0x00,0x00]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: mul_inline_imm_0.5_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: s_movk_i32 s4, 0x3800 ; encoding: [0x00,0x38,0x04,0xb0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x81,0xd3,0x00,0x09,0x00,0x08]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: mul_inline_imm_0.5_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: v_mov_b32_e32 v2, 0x3800 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0x38,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_e32 v1, 0x3800, v0 ; encoding: [0xff,0x00,0x02,0x52,0x00,0x38,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x52,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = mul <2 x i16> %x, bitcast (<2 x half> <half 0.5, half 0.5> to <2 x i16>)
ret <2 x i16> %y
}
@@ -592,6 +2558,39 @@ define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) {
; GFX10: v_pk_mul_lo_u16 v0, 0xffffb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0xff,0xff]
define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: mul_inline_imm_neg_0.5_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_mul_lo_u16 v0, 0xffffb800, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xb8,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_neg_0.5_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_mul_lo_u16 v0, 0xffffb800, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xb8,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: mul_inline_imm_neg_0.5_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0xffffb800, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xb8,0xff,0xff]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: mul_inline_imm_neg_0.5_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: s_movk_i32 s4, 0xb800 ; encoding: [0x00,0xb8,0x04,0xb0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x81,0xd3,0x00,0x09,0x00,0x08]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: mul_inline_imm_neg_0.5_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: v_mov_b32_e32 v2, 0xffffb800 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0xb8,0xff,0xff]
+; VI-NEXT: v_mul_lo_u16_e32 v1, 0xb800, v0 ; encoding: [0xff,0x00,0x02,0x52,0x00,0xb8,0xff,0xff]
+; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x52,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = mul <2 x i16> %x, bitcast (<2 x half> <half -0.5, half -0.5> to <2 x i16>)
ret <2 x i16> %y
}
@@ -602,6 +2601,39 @@ define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) {
; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00]
define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: mul_inline_imm_1.0_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x3c,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_1.0_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x3c,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: mul_inline_imm_1.0_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x3c,0x00,0x00]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: mul_inline_imm_1.0_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: s_movk_i32 s4, 0x3c00 ; encoding: [0x00,0x3c,0x04,0xb0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x81,0xd3,0x00,0x09,0x00,0x08]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: mul_inline_imm_1.0_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: v_mov_b32_e32 v2, 0x3c00 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0x3c,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_e32 v1, 0x3c00, v0 ; encoding: [0xff,0x00,0x02,0x52,0x00,0x3c,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x52,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = mul <2 x i16> %x, bitcast (<2 x half> <half 1.0, half 1.0> to <2 x i16>)
ret <2 x i16> %y
}
@@ -612,6 +2644,39 @@ define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) {
; GFX10: v_pk_mul_lo_u16 v0, 0xffffbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0xff,0xff]
define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: mul_inline_imm_neg_1.0_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_mul_lo_u16 v0, 0xffffbc00, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xbc,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_neg_1.0_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_mul_lo_u16 v0, 0xffffbc00, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xbc,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: mul_inline_imm_neg_1.0_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0xffffbc00, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xbc,0xff,0xff]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: mul_inline_imm_neg_1.0_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: s_movk_i32 s4, 0xbc00 ; encoding: [0x00,0xbc,0x04,0xb0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x81,0xd3,0x00,0x09,0x00,0x08]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: mul_inline_imm_neg_1.0_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0xbc,0xff,0xff]
+; VI-NEXT: v_mul_lo_u16_e32 v1, 0xbc00, v0 ; encoding: [0xff,0x00,0x02,0x52,0x00,0xbc,0xff,0xff]
+; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x52,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = mul <2 x i16> %x, bitcast (<2 x half> <half -1.0, half -1.0> to <2 x i16>)
ret <2 x i16> %y
}
@@ -621,6 +2686,39 @@ define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) {
; GFX10: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xe9,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}}]
define <2 x i16> @shl_inline_imm_2.0_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: shl_inline_imm_2.0_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x04,0xcc,0x00,0xe9,0x01,0x18]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: shl_inline_imm_2.0_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x04,0xcc,0x00,0xe9,0x01,0x18]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: shl_inline_imm_2.0_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x04,0xcc,0x00,0xe9,0x01,0x18]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: shl_inline_imm_2.0_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x84,0xd3,0x00,0xe9,0x01,0x18]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: shl_inline_imm_2.0_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: s_movk_i32 s4, 0x4000 ; encoding: [0x00,0x40,0x04,0xb0]
+; VI-NEXT: v_mov_b32_e32 v2, 0x4000 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0x40,0x00,0x00]
+; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s4 ; encoding: [0x01,0x00,0x2a,0xd1,0x00,0x09,0x00,0x00]
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x54,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = shl <2 x i16> bitcast (<2 x half> <half 2.0, half 2.0> to <2 x i16>), %x
ret <2 x i16> %y
}
@@ -630,6 +2728,39 @@ define <2 x i16> @shl_inline_imm_2.0_v2i16(<2 x i16> %x) {
; GFX10: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xeb,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}}]
define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: shl_inline_imm_neg_2.0_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x04,0xcc,0x00,0xeb,0x01,0x18]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: shl_inline_imm_neg_2.0_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x04,0xcc,0x00,0xeb,0x01,0x18]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: shl_inline_imm_neg_2.0_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x04,0xcc,0x00,0xeb,0x01,0x18]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: shl_inline_imm_neg_2.0_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, -2.0 op_sel:[0,1] ; encoding: [0x00,0x50,0x84,0xd3,0x00,0xeb,0x01,0x18]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: shl_inline_imm_neg_2.0_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: s_movk_i32 s4, 0xc000 ; encoding: [0x00,0xc0,0x04,0xb0]
+; VI-NEXT: v_mov_b32_e32 v2, 0xffffc000 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0xc0,0xff,0xff]
+; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s4 ; encoding: [0x01,0x00,0x2a,0xd1,0x00,0x09,0x00,0x00]
+; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x54,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = shl <2 x i16> bitcast (<2 x half> <half -2.0, half -2.0> to <2 x i16>), %x
ret <2 x i16> %y
}
@@ -640,6 +2771,39 @@ define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) {
; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00]
define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: mul_inline_imm_4.0_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x44,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_4.0_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x44,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: mul_inline_imm_4.0_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0x44,0x00,0x00]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: mul_inline_imm_4.0_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: s_movk_i32 s4, 0x4400 ; encoding: [0x00,0x44,0x04,0xb0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x81,0xd3,0x00,0x09,0x00,0x08]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: mul_inline_imm_4.0_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: v_mov_b32_e32 v2, 0x4400 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0x44,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_e32 v1, 0x4400, v0 ; encoding: [0xff,0x00,0x02,0x52,0x00,0x44,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x52,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = mul <2 x i16> %x, bitcast (<2 x half> <half 4.0, half 4.0> to <2 x i16>)
ret <2 x i16> %y
@@ -651,6 +2815,39 @@ define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
; GFX10: v_pk_mul_lo_u16 v0, 0xffffc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0xff,0xff]
define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: mul_inline_imm_neg_4.0_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_mul_lo_u16 v0, 0xffffc400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xc4,0xff,0xff]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_neg_4.0_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_mul_lo_u16 v0, 0xffffc400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xc4,0xff,0xff]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: mul_inline_imm_neg_4.0_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0xffffc400, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x00,0xc4,0xff,0xff]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: mul_inline_imm_neg_4.0_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: s_movk_i32 s4, 0xc400 ; encoding: [0x00,0xc4,0x04,0xb0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x81,0xd3,0x00,0x09,0x00,0x08]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: mul_inline_imm_neg_4.0_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: v_mov_b32_e32 v2, 0xffffc400 ; encoding: [0xff,0x02,0x04,0x7e,0x00,0xc4,0xff,0xff]
+; VI-NEXT: v_mul_lo_u16_e32 v1, 0xc400, v0 ; encoding: [0xff,0x00,0x02,0x52,0x00,0xc4,0xff,0xff]
+; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x52,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = mul <2 x i16> %x, bitcast (<2 x half> <half -4.0, half -4.0> to <2 x i16>)
ret <2 x i16> %y
}
@@ -661,8 +2858,43 @@ define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) {
; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00]
define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) {
+; GFX11-TRUE16-LABEL: mul_inline_imm_inv2pi_v2i16:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-TRUE16-NEXT: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x18,0x31,0x00,0x00]
+; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX11-FAKE16-LABEL: mul_inline_imm_inv2pi_v2i16:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x89,0xbf]
+; GFX11-FAKE16-NEXT: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x18,0x31,0x00,0x00]
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x48,0x80,0xbe]
+;
+; GFX10-LABEL: mul_inline_imm_inv2pi_v2i16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x00,0x40,0x01,0xcc,0xff,0x00,0x02,0x10,0x18,0x31,0x00,0x00]
+; GFX10-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x20,0x80,0xbe]
+;
+; GFX9-LABEL: mul_inline_imm_inv2pi_v2i16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; GFX9-NEXT: s_movk_i32 s4, 0x3118 ; encoding: [0x18,0x31,0x04,0xb0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s4 op_sel_hi:[1,0] ; encoding: [0x00,0x40,0x81,0xd3,0x00,0x09,0x00,0x08]
+; GFX9-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
+;
+; VI-LABEL: mul_inline_imm_inv2pi_v2i16:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+; VI-NEXT: v_mov_b32_e32 v2, 0x3118 ; encoding: [0xff,0x02,0x04,0x7e,0x18,0x31,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_e32 v1, 0x3118, v0 ; encoding: [0xff,0x00,0x02,0x52,0x18,0x31,0x00,0x00]
+; VI-NEXT: v_mul_lo_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; encoding: [0xf9,0x04,0x00,0x52,0x00,0x05,0x05,0x06]
+; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; encoding: [0x01,0x01,0x00,0x28]
+; VI-NEXT: s_setpc_b64 s[30:31] ; encoding: [0x1e,0x1d,0x80,0xbe]
%y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3118, half 0xH3118> to <2 x i16>)
ret <2 x i16> %y
}
attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 8f40ee105cfa0..49879f66dd852 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -2,7 +2,8 @@
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CIVI,VI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=CIVI,CI %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
; GFX9-LABEL: s_insertelement_v2i16_0:
@@ -95,19 +96,34 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: s_insertelement_v2i16_0_reg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, s4, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, s3, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, s4, s2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
store <2 x i16> %vecins, ptr addrspace(1) %out
@@ -172,23 +188,44 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_pack_ll_b32_b16 s3, s4, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s2
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s2
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s3, s3, s4
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use s2
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s3, s4, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use s2
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: s_endpgm
%vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
%elt1 = extractelement <2 x i16> %vec, i32 1
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
@@ -319,23 +356,42 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s3, s4, 16
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_pack_lh_b32_b16 s2, s3, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s3
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_pack_lh_b32_b16 s2, s4, s2
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use s3
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_pack_lh_b32_b16 s2, s3, s2
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use s3
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: s_endpgm
%vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
%elt.hi = lshr i32 %elt.arg, 16
%elt = trunc i32 %elt.hi to i16
@@ -412,27 +468,52 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_lshr_b32 s3, s4, 16
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_pack_ll_b32_b16 s4, s3, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s3
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s2
-; GFX11-NEXT: ;;#ASMEND
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s4, 16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s3
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s4, s4, s5
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use s3
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: ;;#ASMSTART
+; GFX11-TRUE16-NEXT: ; use s2
+; GFX11-TRUE16-NEXT: ;;#ASMEND
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s4, 16
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s4, s3, s2
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use s3
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: ;;#ASMSTART
+; GFX11-FAKE16-NEXT: ; use s2
+; GFX11-FAKE16-NEXT: ;;#ASMEND
+; GFX11-FAKE16-NEXT: s_endpgm
%vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
%elt.hi = lshr i32 %elt.arg, 16
%elt = trunc i32 %elt.hi to i16
@@ -537,19 +618,34 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: s_insertelement_v2i16_1_reg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: s_insertelement_v2i16_1_reg:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-TRUE16-NEXT: s_mov_b32 s3, s4
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: s_insertelement_v2i16_1_reg:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s4
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%vec = load <2 x i16>, ptr addrspace(4) %vec.ptr
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
store <2 x i16> %vecins, ptr addrspace(1) %out
@@ -771,19 +867,36 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v2i16_0_reghi:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, v1, s4, 0x7060302
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v2i16_0_reghi:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v2, 16, s4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v2
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v2i16_0_reghi:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, s4, 0x7060302
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -918,19 +1031,32 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v2i16_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_movk_i32 s2, 0x3e7
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v2i16_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x3e7
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v2i16_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_movk_i32 s2, 0x3e7
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -990,18 +1116,31 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v2i16_1_inlineimm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, -15, v1, 0x5040100
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v2i16_1_inlineimm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, -15
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v2i16_1_inlineimm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, -15, v1, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1206,19 +1345,33 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v2f16_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_movk_i32 s2, 0x4500
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v2f16_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0x4500
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v2f16_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_movk_i32 s2, 0x4500
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, s2, v1, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1278,18 +1431,32 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v2f16_1_inlineimm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 35, v1, 0x5040100
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v2f16_1_inlineimm:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
+; GFX11-TRUE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v2f16_1_inlineimm:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, 35, v1, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1625,19 +1792,34 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v4f16_0:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v4f16_0:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v4f16_0:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0xffff, s4, v0
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1703,19 +1885,33 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v4f16_1:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, s4, v0, 0x5040100
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v4f16_1:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v4f16_1:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, s4, v0, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1782,19 +1978,34 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v4f16_2:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x30
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v4f16_2:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v4f16_2:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x30
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1860,19 +2071,33 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v4f16_3:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v4f16_3:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s4
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v4f16_3:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1939,19 +2164,34 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v4i16_2:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v4i16_2:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, s4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1
+; GFX11-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v4i16_2:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0xffff, s4, v1
+; GFX11-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2039,26 +2279,49 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v4i16_dynamic_vgpr:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: global_load_b32 v2, v[0:1], off glc dlc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[2:3]
-; GFX11-NEXT: s_pack_ll_b32_b16 s2, s4, s4
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 4, v2
-; GFX11-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfi_b32 v1, v3, s2, v1
-; GFX11-NEXT: v_bfi_b32 v0, v2, s2, v0
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v4i16_dynamic_vgpr:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: global_load_b32 v2, v[0:1], off glc dlc
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX11-TRUE16-NEXT: s_mov_b32 s2, s4
+; GFX11-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s2
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v1, v3, s2, v1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v0, v2, s2, v0
+; GFX11-TRUE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v4i16_dynamic_vgpr:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: global_load_b32 v2, v[0:1], off glc dlc
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX11-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s4, s4
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 4, v2
+; GFX11-FAKE16-NEXT: v_lshlrev_b64 v[2:3], v2, 0xffff
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v1, v3, s2, v1
+; GFX11-FAKE16-NEXT: v_bfi_b32 v0, v2, s2, v0
+; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2226,19 +2489,33 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v8f16_3:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v8f16_3:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s4
+; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v8f16_3:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2306,19 +2583,34 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v8i16_6:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v8i16_6:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v5, v3
+; GFX11-TRUE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v8i16_6:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
+; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2501,51 +2793,93 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v8f16_dynamic:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
-; GFX11-NEXT: s_cmp_eq_u32 s5, 6
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 7
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 5
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 2
-; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3
-; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2
-; GFX11-NEXT: v_perm_b32 v3, v3, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v8f16_dynamic:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v6, s[2:3]
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, s4
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7
+; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 4
+; GFX11-TRUE16-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 5
+; GFX11-TRUE16-NEXT: s_cselect_b32 s7, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s8, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 3
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 1
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v5.l, s2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.l, v5.l, s6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v5.l, s8
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.l, s4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v5.l, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v5.l, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v5.l, s5
+; GFX11-TRUE16-NEXT: global_store_b128 v6, v[1:4], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v8f16_dynamic:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 6
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 7
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, v3, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v6, v6, s4, s3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v8, v8, s4, s2
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-FAKE16-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2629,24 +2963,42 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v16f16_3:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
-; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v16f16_3:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
+; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3]
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, s4
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v16f16_3:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, s4, v1, 0x5040100
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -2730,24 +3082,45 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v16i16_6:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
-; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v16i16_6:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, s4
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v9, v3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-TRUE16-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v16i16_6:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0xffff, s4, v3
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -3068,87 +3441,162 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
; CI-NEXT: s_endpgm
;
-; GFX11-LABEL: v_insertelement_v16f16_dynamic:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[0:3], v8, s[2:3]
-; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
-; GFX11-NEXT: s_cmp_eq_u32 s5, 6
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 7
-; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 4
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 5
-; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
-; GFX11-NEXT: s_cselect_b32 s3, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 2
-; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 3
-; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 0
-; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v0
-; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 14
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v7
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3
-; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 15
-; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 12
-; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v6
-; GFX11-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 13
-; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 10
-; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v5
-; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 11
-; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 8
-; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v4
-; GFX11-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: s_cmp_eq_u32 s5, 9
-; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2
-; GFX11-NEXT: s_cselect_b32 s2, -1, 0
-; GFX11-NEXT: v_perm_b32 v7, v10, v7, 0x5040100
-; GFX11-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2
-; GFX11-NEXT: v_perm_b32 v6, v12, v6, 0x5040100
-; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
-; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
-; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
-; GFX11-NEXT: v_perm_b32 v4, v14, v4, 0x5040100
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX11-NEXT: s_endpgm
+; GFX11-TRUE16-LABEL: v_insertelement_v16f16_dynamic:
+; GFX11-TRUE16: ; %bb.0:
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 5, v0
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_load_b128 v[0:3], v12, s[2:3]
+; GFX11-TRUE16-NEXT: global_load_b128 v[4:7], v12, s[2:3] offset:16
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 6
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, s4
+; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 7
+; GFX11-TRUE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 4
+; GFX11-TRUE16-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 5
+; GFX11-TRUE16-NEXT: s_cselect_b32 s7, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 2
+; GFX11-TRUE16-NEXT: s_cselect_b32 s8, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 3
+; GFX11-TRUE16-NEXT: s_cselect_b32 s9, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-TRUE16-NEXT: s_cselect_b32 s10, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 1
+; GFX11-TRUE16-NEXT: s_cselect_b32 s11, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 14
+; GFX11-TRUE16-NEXT: s_cselect_b32 s12, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 15
+; GFX11-TRUE16-NEXT: s_cselect_b32 s13, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 12
+; GFX11-TRUE16-NEXT: s_cselect_b32 s14, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 13
+; GFX11-TRUE16-NEXT: s_cselect_b32 s15, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 10
+; GFX11-TRUE16-NEXT: s_cselect_b32 s16, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 11
+; GFX11-TRUE16-NEXT: s_cselect_b32 s17, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 8
+; GFX11-TRUE16-NEXT: s_cselect_b32 s4, -1, 0
+; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s5, 9
+; GFX11-TRUE16-NEXT: s_cselect_b32 s5, -1, 0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v8.h, s2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v2.l, v8.h, s6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v2
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v1.l, v8.h, s8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v0.l, v8.h, s10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v8.h, s12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v8.h, s14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v8.h, s16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v8.h, s4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v7.l, v8.h, s13
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v8.h, s15
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v8.h, s17
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v13.l, v8.h, s3
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v8.h, s5
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v14.l, v8.h, s7
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v8.h, s9
+; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v16.l, v8.h, s11
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:16
+; GFX11-TRUE16-NEXT: global_store_b128 v12, v[8:11], s[0:1]
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: v_insertelement_v16f16_dynamic:
+; GFX11-FAKE16: ; %bb.0:
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX11-FAKE16-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 6
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 7
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, v3, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 4
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 5
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, v2, s4, s3
+; GFX11-FAKE16-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 2
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, v3, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 3
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, v1, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 0
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, v11, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 1
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 14
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v7
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, v10, s4, s3
+; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v9, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, v12, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 15
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v7, v7, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 12
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; GFX11-FAKE16-NEXT: v_perm_b32 v2, v10, v2, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, v13, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 13
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v6, v6, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 10
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v5
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, v14, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, v5, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 8
+; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v4
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, v15, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s5, 9
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, v4, s4, s2
+; GFX11-FAKE16-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-FAKE16-NEXT: v_perm_b32 v7, v10, v7, 0x5040100
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v14, v16, s4, s2
+; GFX11-FAKE16-NEXT: v_perm_b32 v6, v12, v6, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v0, v9, v0, 0x5040100
+; GFX11-FAKE16-NEXT: v_perm_b32 v4, v14, v4, 0x5040100
+; GFX11-FAKE16-NEXT: s_clause 0x1
+; GFX11-FAKE16-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-FAKE16-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-FAKE16-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext
More information about the llvm-commits
mailing list