[llvm] e9d77cd - [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 3.

Ivan Kosarev via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 15 01:55:44 PDT 2023


Author: Ivan Kosarev
Date: 2023-06-15T09:55:25+01:00
New Revision: e9d77cd9b267cb43bf7a968053517ca499959f2f

URL: https://github.com/llvm/llvm-project/commit/e9d77cd9b267cb43bf7a968053517ca499959f2f
DIFF: https://github.com/llvm/llvm-project/commit/e9d77cd9b267cb43bf7a968053517ca499959f2f.diff

LOG: [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 3.

Reviewed By: Joe_Nash

Differential Revision: https://reviews.llvm.org/D152716

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
    llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
    llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
    llvm/test/CodeGen/AMDGPU/fnearbyint.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 843df525b80f5..d08699248e931 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
 ; GCN-LABEL: uniform_vec_0_i16:
@@ -37,6 +38,19 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
 ; GFX906-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: uniform_vec_0_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tmp = insertelement <2 x i16> undef, i16 0, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -62,6 +76,13 @@ define i32 @divergent_vec_0_i16(i16 %a) {
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: divergent_vec_0_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 0, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -102,6 +123,19 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
 ; GFX906-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: uniform_vec_i16_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -127,6 +161,13 @@ define i32 @divergent_vec_i16_0(i16 %a) {
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: divergent_vec_i16_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 0, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -167,6 +208,19 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
 ; GFX906-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX906-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: uniform_vec_f16_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
   %val = bitcast <2 x half> %vec to float
@@ -192,6 +246,13 @@ define float @divergent_vec_f16_0(half %a) {
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX906-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: divergent_vec_f16_0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half 0.0, i32 1
   %val = bitcast <2 x half> %vec to float
@@ -239,6 +300,19 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: uniform_vec_i16_LL:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_endpgm
   %val0 = load volatile i32, ptr addrspace(4) %in0
   %val1 = load volatile i32, ptr addrspace(4) %in1
   %lo = trunc i32 %val0 to i16
@@ -272,6 +346,13 @@ define i32 @divergent_vec_i16_LL(i16 %a, i16 %b) {
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: divergent_vec_i16_LL:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
   %vec = insertelement <2 x i16> %tmp, i16 %b, i32 1
   %val = bitcast <2 x i16> %vec to i32
@@ -313,6 +394,17 @@ define amdgpu_kernel void @uniform_vec_i16_LH(ptr addrspace(1) %out, i16 %a, i32
 ; GFX906-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: uniform_vec_i16_LH:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_pack_lh_b32_b16 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %shift = lshr i32 %b, 16
   %tr = trunc i32 %shift to i16
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
@@ -343,6 +435,13 @@ define i32 @divergent_vec_i16_LH(i16 %a, i32 %b) {
 ; GFX906-NEXT:    s_mov_b32 s4, 0xffff
 ; GFX906-NEXT:    v_bfi_b32 v0, s4, v0, v1
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: divergent_vec_i16_LH:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shift = lshr i32 %b, 16
   %tr = trunc i32 %shift to i16
   %tmp = insertelement <2 x i16> undef, i16 %a, i32 0
@@ -386,6 +485,17 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
 ; GFX906-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX906-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: uniform_vec_i16_HH:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_pack_hh_b32_b16 s2, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %shift_a = lshr i32 %a, 16
   %tr_a = trunc i32 %shift_a to i16
   %shift_b = lshr i32 %b, 16
@@ -419,6 +529,13 @@ define i32 @divergent_vec_i16_HH(i32 %a, i32 %b) {
 ; GFX906-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: divergent_vec_i16_HH:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shift_a = lshr i32 %a, 16
   %tr_a = trunc i32 %shift_a to i16
   %shift_b = lshr i32 %b, 16
@@ -470,6 +587,19 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
 ; GFX906-NEXT:    ; use s0
 ; GFX906-NEXT:    ;;#ASMEND
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: uniform_vec_f16_LL:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use s0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    s_endpgm
   %val0 = load volatile i32, ptr addrspace(4) %in0
   %val1 = load volatile i32, ptr addrspace(4) %in1
   %lo.i = trunc i32 %val0 to i16
@@ -507,6 +637,13 @@ define float @divergent_vec_f16_LL(half %a, half %b) {
 ; GFX906-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX906-NEXT:    v_perm_b32 v0, v1, v0, s4
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: divergent_vec_f16_LL:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %tmp = insertelement <2 x half> undef, half %a, i32 0
   %vec = insertelement <2 x half> %tmp, half %b, i32 1
   %val = bitcast <2 x half> %vec to float
@@ -535,6 +672,14 @@ define <2 x i16> @build_vec_v2i16_undeflo_divergent(ptr addrspace(3) %in) #0 {
 ; GFX906-NEXT:    ds_read_u16 v0, v0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: build_vec_v2i16_undeflo_divergent:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_load_u16_d16 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 0
@@ -579,6 +724,19 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in,
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_store_dword v1, v0, s[2:3]
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT:    ds_load_u16_d16 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %load = load i16, ptr addrspace(3) %in
   %build = insertelement <2 x i16> undef, i16 %load, i32 0

diff  --git a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
index 0b2ec25f38c7d..2e00b4b06b1b5 100644
--- a/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs -o - %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
 
 define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1) {
 ; SI-LABEL: vec_8xi16_extract_4xi16:
@@ -114,6 +115,38 @@ define <4 x i16> @vec_8xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1)
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v2, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v3, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: vec_8xi16_extract_4xi16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB0_2
+; GFX11-NEXT:  ; %bb.1: ; %F
+; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_cbranch_execz .LBB0_3
+; GFX11-NEXT:    s_branch .LBB0_4
+; GFX11-NEXT:  .LBB0_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-NEXT:  .LBB0_3: ; %T
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:  .LBB0_4: ; %exit
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
 T:
@@ -246,6 +279,38 @@ define <4 x i16> @vec_8xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: vec_8xi16_extract_4xi16_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB1_2
+; GFX11-NEXT:  ; %bb.1: ; %F
+; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_cbranch_execz .LBB1_3
+; GFX11-NEXT:    s_branch .LBB1_4
+; GFX11-NEXT:  .LBB1_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-NEXT:  .LBB1_3: ; %T
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:  .LBB1_4: ; %exit
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
 T:
@@ -381,6 +446,39 @@ define <4 x half> @vec_8xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v5
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v4, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: vec_8xf16_extract_4xf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB2_2
+; GFX11-NEXT:  ; %bb.1: ; %F
+; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_cbranch_execz .LBB2_3
+; GFX11-NEXT:    s_branch .LBB2_4
+; GFX11-NEXT:  .LBB2_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX11-NEXT:  .LBB2_3: ; %T
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:  .LBB2_4: ; %exit
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
+; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
+; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
+; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
 T:
@@ -550,6 +648,42 @@ define <4 x i16> @vec_16xi16_extract_4xi16(ptr addrspace(1) %p0, ptr addrspace(1
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: vec_16xi16_extract_4xi16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB3_2
+; GFX11-NEXT:  ; %bb.1: ; %F
+; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_cbranch_execz .LBB3_3
+; GFX11-NEXT:    s_branch .LBB3_4
+; GFX11-NEXT:  .LBB3_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-NEXT:  .LBB3_3: ; %T
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:  .LBB3_4: ; %exit
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v2 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v3 op_sel_hi:[0,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
 T:
@@ -721,6 +855,42 @@ define <4 x i16> @vec_16xi16_extract_4xi16_2(ptr addrspace(1) %p0, ptr addrspace
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v3, s4
 ; GFX9-NEXT:    v_perm_b32 v1, v2, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: vec_16xi16_extract_4xi16_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB4_2
+; GFX11-NEXT:  ; %bb.1: ; %F
+; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_cbranch_execz .LBB4_3
+; GFX11-NEXT:    s_branch .LBB4_4
+; GFX11-NEXT:  .LBB4_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-NEXT:  .LBB4_3: ; %T
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:  .LBB4_4: ; %exit
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 15, v4 op_sel_hi:[0,1]
+; GFX11-NEXT:    v_pk_ashrrev_i16 v1, 15, v5 op_sel_hi:[0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-NEXT:    v_or_b32_e32 v1, 0xffff8000, v1
+; GFX11-NEXT:    v_or_b32_e32 v0, 0xffff8000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, 0xffff8000, v2
+; GFX11-NEXT:    v_or_b32_e32 v3, 0xffff8000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
 T:
@@ -895,6 +1065,43 @@ define <4 x half> @vec_16xf16_extract_4xf16(ptr addrspace(1) %p0, ptr addrspace(
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v0, v4
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: vec_16xf16_extract_4xf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_cbranch_scc0 .LBB5_2
+; GFX11-NEXT:  ; %bb.1: ; %F
+; GFX11-NEXT:    global_load_b128 v[4:7], v[2:3], off offset:16 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[2:5], v[2:3], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_cbranch_execz .LBB5_3
+; GFX11-NEXT:    s_branch .LBB5_4
+; GFX11-NEXT:  .LBB5_2:
+; GFX11-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
+; GFX11-NEXT:  .LBB5_3: ; %T
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off offset:16 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[2:5], v[0:1], off glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:  .LBB5_4: ; %exit
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3d00
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x3900, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v1
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0x3900 :: v_dual_cndmask_b32 v1, 0x3900, v0
+; GFX11-NEXT:    v_cmp_nge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v4, 0x3d00, v4, vcc_lo
+; GFX11-NEXT:    v_cmp_ge_f16_e32 vcc_lo, 0.5, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x3900, v0, vcc_lo
+; GFX11-NEXT:    v_pack_b32_f16 v0, v2, v1
+; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v4
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   br i1 undef, label %T, label %F
 
 T:
@@ -945,6 +1152,16 @@ define <8 x i16> @large_vector(ptr addrspace(3) %p, i32 %idxp) {
 ; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: large_vector:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_lshl_add_u32 v2, v1, 5, v0
+; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
+; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %idx = shl i32 %idxp, 4
 
   %i.0 = or i32 %idx, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 4a77cade2a27a..3504a0513b950 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -8,6 +8,9 @@
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI-SAFE %s
 ; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI-NNAN %s
 
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SAFE %s
+; RUN: llc -enable-no-nans-fp-math -enable-no-signed-zeros-fp-math -mtriple=amdgcn-- -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-NNAN %s
+
 
 define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
 ; GFX9-SAFE-LABEL: test_fmin_legacy_ule_f16:
@@ -55,6 +58,21 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
 ; SI-NNAN-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NNAN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmin_legacy_ule_f16:
+; GFX11-SAFE:       ; %bb.0:
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmin_legacy_ule_f16:
+; GFX11-NNAN:       ; %bb.0:
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT:    v_min_f16_e32 v0, v0, v1
+; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule half %a, %b
   %val = select i1 %cmp, half %a, half %b
   ret half %val
@@ -130,6 +148,27 @@ define <2 x half> @test_fmin_legacy_ule_v2f16(<2 x half> %a, <2 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_min_f32_e32 v0, v0, v2
 ; SI-NNAN-NEXT:    v_min_f32_e32 v1, v1, v3
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v2f16:
+; GFX11-SAFE:       ; %bb.0:
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v2
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v1
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v2f16:
+; GFX11-NNAN:       ; %bb.0:
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v1
+; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <2 x half> %a, %b
   %val = select <2 x i1> %cmp, <2 x half> %a, <2 x half> %b
   ret <2 x half> %val
@@ -221,6 +260,30 @@ define <3 x half> @test_fmin_legacy_ule_v3f16(<3 x half> %a, <3 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_min_f32_e32 v1, v1, v4
 ; SI-NNAN-NEXT:    v_min_f32_e32 v2, v2, v5
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v3f16:
+; GFX11-SAFE:       ; %bb.0:
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v3f16:
+; GFX11-NNAN:       ; %bb.0:
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <3 x half> %a, %b
   %val = select <3 x i1> %cmp, <3 x half> %a, <3 x half> %b
   ret <3 x half> %val
@@ -335,6 +398,36 @@ define <4 x half> @test_fmin_legacy_ule_v4f16(<4 x half> %a, <4 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_min_f32_e32 v2, v2, v6
 ; SI-NNAN-NEXT:    v_min_f32_e32 v3, v3, v7
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v4f16:
+; GFX11-SAFE:       ; %bb.0:
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v5, v4
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v7, v6
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v5, v6, v7, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v2
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v3
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
+; GFX11-SAFE-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
+; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v4f16:
+; GFX11-NNAN:       ; %bb.0:
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v2
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v1, v1, v3
+; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <4 x half> %a, %b
   %val = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b
   ret <4 x half> %val
@@ -527,6 +620,52 @@ define <8 x half> @test_fmin_legacy_ule_v8f16(<8 x half> %a, <8 x half> %b) #0 {
 ; SI-NNAN-NEXT:    v_min_f32_e32 v6, v6, v14
 ; SI-NNAN-NEXT:    v_min_f32_e32 v7, v7, v15
 ; SI-NNAN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SAFE-LABEL: test_fmin_legacy_ule_v8f16:
+; GFX11-SAFE:       ; %bb.0:
+; GFX11-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SAFE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v10, 16, v7
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v11, 16, v3
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v12, 16, v6
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v15, 16, v1
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v11, v10
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX11-SAFE-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v13, v12
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v11, v12, v13, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v15, v14
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v12, v14, v15, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v9, v8
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v2, v6
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0, v4
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_perm_b32 v2, v11, v2, 0x5040100
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v1, v5
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX11-SAFE-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v3, v7
+; GFX11-SAFE-NEXT:    v_perm_b32 v1, v12, v1, 0x5040100
+; GFX11-SAFE-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc_lo
+; GFX11-SAFE-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
+; GFX11-SAFE-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SAFE-NEXT:    v_perm_b32 v3, v10, v3, 0x5040100
+; GFX11-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-NNAN-LABEL: test_fmin_legacy_ule_v8f16:
+; GFX11-NNAN:       ; %bb.0:
+; GFX11-NNAN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NNAN-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v0, v0, v4
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v1, v1, v5
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v2, v2, v6
+; GFX11-NNAN-NEXT:    v_pk_min_f16 v3, v3, v7
+; GFX11-NNAN-NEXT:    s_setpc_b64 s[30:31]
   %cmp = fcmp ule <8 x half> %a, %b
   %val = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b
   ret <8 x half> %val

diff  --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index 48f6c26c8d633..5d27394832596 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SICI,CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
 
 declare half @llvm.nearbyint.f16(half) #0
 declare float @llvm.nearbyint.f32(float) #0
@@ -48,6 +49,18 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fnearbyint_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f16_e32 v1, s2
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %1 = call half @llvm.nearbyint.f16(half %in)
   store half %1, ptr addrspace(1) %out
   ret void
@@ -75,6 +88,18 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fnearbyint_f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f32_e32 v1, s2
+; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call float @llvm.nearbyint.f32(float %in)
   store float %0, ptr addrspace(1) %out
@@ -105,6 +130,17 @@ define amdgpu_kernel void @fnearbyint_v2f32(ptr addrspace(1) %out, <2 x float> %
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fnearbyint_v2f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f32_e32 v1, s3
+; GFX11-NEXT:    v_rndne_f32_e32 v0, s2
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
   store <2 x float> %0, ptr addrspace(1) %out
@@ -139,6 +175,21 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: fnearbyint_v4f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f32_e32 v3, s7
+; GFX11-NEXT:    v_rndne_f32_e32 v2, s6
+; GFX11-NEXT:    v_rndne_f32_e32 v1, s5
+; GFX11-NEXT:    v_rndne_f32_e32 v0, s4
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
   store <4 x float> %0, ptr addrspace(1) %out
@@ -189,6 +240,16 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: nearbyint_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f64_e32 v[0:1], s[2:3]
+; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call double @llvm.nearbyint.f64(double %in)
   store double %0, ptr addrspace(1) %out
@@ -251,6 +312,19 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: nearbyint_v2f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f64_e32 v[2:3], s[6:7]
+; GFX11-NEXT:    v_rndne_f64_e32 v[0:1], s[4:5]
+; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
   store <2 x double> %0, ptr addrspace(1) %out
@@ -341,6 +415,23 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
 ; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: nearbyint_v4f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b256 s[4:11], s[0:1], 0x44
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v8, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_rndne_f64_e32 v[6:7], s[10:11]
+; GFX11-NEXT:    v_rndne_f64_e32 v[4:5], s[8:9]
+; GFX11-NEXT:    v_rndne_f64_e32 v[2:3], s[6:7]
+; GFX11-NEXT:    v_rndne_f64_e32 v[0:1], s[4:5]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
   store <4 x double> %0, ptr addrspace(1) %out


        


More information about the llvm-commits mailing list