[llvm] 7388520 - [GISel] Add more cases to isKnownNeverNaN

Pierre van Houtryve via llvm-commits llvm-commits at lists.llvm.org
Fri Sep 30 07:11:05 PDT 2022


Author: Pierre van Houtryve
Date: 2022-09-30T14:10:56Z
New Revision: 7388520d1c1810cb06fd71ce0c82dc5bf44d052a

URL: https://github.com/llvm/llvm-project/commit/7388520d1c1810cb06fd71ce0c82dc5bf44d052a
DIFF: https://github.com/llvm/llvm-project/commit/7388520d1c1810cb06fd71ce0c82dc5bf44d052a.diff

LOG: [GISel] Add more cases to isKnownNeverNaN

Bring it to parity with the SelectionDAG implementation as of D134854.

Reviewed By: arsenm, foad

Differential Revision: https://reviews.llvm.org/D134857

Added: 
    

Modified: 
    llvm/lib/CodeGen/GlobalISel/Utils.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
    llvm/test/CodeGen/AMDGPU/v_pack.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 4e61fd98f79d..92368ab9beb7 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -658,6 +658,20 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
   switch (DefMI->getOpcode()) {
   default:
     break;
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+  case TargetOpcode::G_FREM:
+  case TargetOpcode::G_FSIN:
+  case TargetOpcode::G_FCOS:
+  case TargetOpcode::G_FMA:
+  case TargetOpcode::G_FMAD:
+    if (SNaN)
+      return true;
+
+    // TODO: Need isKnownNeverInfinity
+    return false;
   case TargetOpcode::G_FMINNUM_IEEE:
   case TargetOpcode::G_FMAXNUM_IEEE: {
     if (SNaN)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
index 4c4d6a4ea986..2857c916dd31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-fmed3-const-combine.ll
@@ -105,8 +105,7 @@ define float @test_fmed3_maybe_SNaN_input_ieee_true_dx10clamp_true(float %a) #2
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT:    v_med3_f32 v0, v0, 0, 1.0
+; GFX10-NEXT:    v_mul_f32_e64 v0, v0, 2.0 clamp
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
   %fmed = call float @llvm.amdgcn.fmed3.f32(float %fmul, float 0.0, float 1.0)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
index 566db10ae284..7f8725e7f0f0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/clamp-minmax-const-combine.ll
@@ -205,7 +205,6 @@ define float @test_min_max_maybe_NaN_input_ieee_true_dx10clamp_false(float %a) #
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX10-NEXT:    v_med3_f32 v0, v0, 0, 1.0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
   %fmul = fmul float %a, 2.0
@@ -222,7 +221,6 @@ define float @test_max_min_maybe_NaN_input_ieee_true(float %a) #0 {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 2.0, v0
-; GFX10-NEXT:    v_max_f32_e32 v0, v0, v0
 ; GFX10-NEXT:    v_min_f32_e32 v0, 1.0, v0
 ; GFX10-NEXT:    v_max_f32_e32 v0, 0, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
index 2b4ee22ad330..733be82403cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
@@ -254,8 +254,7 @@ body: |
     ; CHECK-NEXT: %one_s32:_(s32) = G_ANYEXT %one(s16)
     ; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %one_s32(s32), %undef(s32)
     ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat
-    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]]
-    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FCANONICALIZE]]
+    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FMUL]]
     ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef, [[FMAXNUM_IEEE]]
     ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
@@ -306,8 +305,7 @@ body: |
     ; CHECK-NEXT: %qnan_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %qnan_s32(s32), %undef(s32)
     ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat
     ; CHECK-NEXT: %snan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %snan_undef
-    ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(<2 x s16>) = G_FCANONICALIZE [[FMUL]]
-    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FCANONICALIZE]]
+    ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FMUL]]
     ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef, [[FMAXNUM_IEEE]]
     ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index 6520a613fae7..27a41368f1f4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -453,8 +453,8 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
 ; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -484,13 +484,11 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
 ; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -532,8 +530,8 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX9-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_sub_f16_e32 v3, v5, v1
 ; GFX9-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -563,13 +561,11 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3
 ; GFX10-NEXT:    v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT:    v_sub_f16_e32 v3, v5, v1
 ; GFX10-NEXT:    v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_e32 v3, v5, v1
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index 7a8df8f636a7..ffa396021ddc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -241,8 +241,8 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX9-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; GFX9-NEXT:    v_sub_f16_e32 v3, v1, v5
 ; GFX9-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX9-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -272,13 +272,11 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
 ; GFX10-NEXT:    v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GFX10-NEXT:    v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
 ; GFX10-NEXT:    v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT:    v_sub_f16_e32 v3, v1, v5
 ; GFX10-NEXT:    v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX10-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
+; GFX10-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 2bc553146f5d..0af76dd62422 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -6,14 +6,14 @@
 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
 ; RUN: llc -global-isel -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX8 %s
 
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-IEEE %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX89,GFX9,GFX9-FLUSH %s
 
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-IEEE %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-FLUSH %s
 
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=ieee -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-IEEE %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -denormal-fp-math=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FLUSH %s
 
 define half @v_fdiv_f16(half %a, half %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_f16:
@@ -771,42 +771,77 @@ define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fdiv_v2f16_afn:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_v2f16_afn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fdiv_v2f16_afn:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn:
+; GFX9-IEEE:       ; %bb.0:
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX9-IEEE-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX9-IEEE-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn:
+; GFX9-FLUSH:       ; %bb.0:
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX9-FLUSH-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-FLUSH-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-IEEE-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX10-IEEE-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX10-IEEE-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FLUSH-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX10-FLUSH-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX10-FLUSH-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn:
+; GFX11-IEEE:       ; %bb.0:
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-IEEE-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX11-IEEE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-IEEE-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-IEEE-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-IEEE-NEXT:    v_mul_f16_e32 v1, v3, v2
+; GFX11-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FLUSH-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX11-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FLUSH-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-FLUSH-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v3, v2
+; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FLUSH-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x half> %a, %b
   ret <2 x half> %fdiv
 }
@@ -1479,42 +1514,77 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9-IEEE:       ; %bb.0:
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX9-IEEE-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX9-IEEE-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9-FLUSH:       ; %bb.0:
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX9-FLUSH-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-FLUSH-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-IEEE-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX10-IEEE-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX10-IEEE-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FLUSH-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX10-FLUSH-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX10-FLUSH-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX11-IEEE:       ; %bb.0:
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-IEEE-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX11-IEEE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-IEEE-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-IEEE-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-IEEE-NEXT:    v_mul_f16_e32 v1, v3, v2
+; GFX11-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FLUSH-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX11-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FLUSH-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-FLUSH-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v3, v2
+; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FLUSH-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
   ret <2 x half> %fdiv
 }
@@ -1709,42 +1779,77 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX9-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    v_rcp_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_rcp_f16_e32 v1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_rcp_f16_e32 v2, v2
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX9-IEEE:       ; %bb.0:
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX9-IEEE-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX9-IEEE-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX9-FLUSH:       ; %bb.0:
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX9-FLUSH-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-FLUSH-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-IEEE-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX10-IEEE-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX10-IEEE-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-FLUSH-NEXT:    v_rcp_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT:    v_mul_f16_e32 v2, v0, v2
+; GFX10-FLUSH-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v1, 0xffff, v2
+; GFX10-FLUSH-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-IEEE-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX11-IEEE:       ; %bb.0:
+; GFX11-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-IEEE-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-IEEE-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX11-IEEE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-IEEE-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-IEEE-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-IEEE-NEXT:    v_mul_f16_e32 v1, v3, v2
+; GFX11-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FLUSH-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX11-FLUSH:       ; %bb.0:
+; GFX11-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLUSH-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-FLUSH-NEXT:    v_rcp_f16_e32 v1, v1
+; GFX11-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-FLUSH-NEXT:    v_rcp_f16_e32 v2, v2
+; GFX11-FLUSH-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX11-FLUSH-NEXT:    v_mul_f16_e32 v1, v3, v2
+; GFX11-FLUSH-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-FLUSH-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-FLUSH-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
   ret <2 x half> %fdiv
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
index d5c51eebe820..2e9a66c579cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll
@@ -146,7 +146,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
 ; SI-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 glc
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_sub_f32_e32 v2, 0x80000000, v2
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_min_f32_e32 v5, v2, v3
 ; SI-NEXT:    v_max_f32_e32 v2, v2, v3
@@ -186,7 +185,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_sub_f32_e32 v4, 0x80000000, v7
 ; VI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; VI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; VI-NEXT:    v_min_f32_e32 v5, v4, v2
 ; VI-NEXT:    v_max_f32_e32 v2, v4, v2
 ; VI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
@@ -208,7 +206,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    v_min_f32_e32 v4, v1, v2
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_max_f32_e32 v2, v3, v3
@@ -231,7 +228,6 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
 ; GFX10-NEXT:    v_sub_f32_e32 v1, 0x80000000, v1
 ; GFX10-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX10-NEXT:    v_max_f32_e32 v3, v3, v3
-; GFX10-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX10-NEXT:    v_max_f32_e32 v4, v1, v2
 ; GFX10-NEXT:    v_min_f32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_min_f32_e32 v2, v4, v3
@@ -251,11 +247,9 @@ define amdgpu_kernel void @v_test_no_global_nnans_med3_f32_pat0_srcmod0(float ad
 ; GFX11-NEXT:    global_load_b32 v3, v0, s[6:7] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_dual_sub_f32 v1, 0x80000000, v1 :: v_dual_max_f32 v2, v2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_min_f32_e32 v4, v1, v2
 ; GFX11-NEXT:    v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_minmax_f32 v1, v1, v2, v4
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
index e1f33d0563e9..8025e8fc8129 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-clamp-fmed3-const.mir
@@ -242,12 +242,8 @@ body: |
     ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 2.000000e+00
     ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
     ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY]], [[COPY1]]
-    ; CHECK-NEXT: [[C1:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 1.000000e+00
-    ; CHECK-NEXT: [[C2:%[0-9]+]]:sgpr(s32) = G_FCONSTANT float 0.000000e+00
-    ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[C2]](s32)
-    ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C1]](s32)
-    ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.fmed3), [[FMUL]](s32), [[COPY2]](s32), [[COPY3]](s32)
-    ; CHECK-NEXT: $vgpr0 = COPY [[INT]](s32)
+    ; CHECK-NEXT: [[AMDGPU_CLAMP:%[0-9]+]]:vgpr(s32) = G_AMDGPU_CLAMP [[FMUL]]
+    ; CHECK-NEXT: $vgpr0 = COPY [[AMDGPU_CLAMP]](s32)
     %0:vgpr(s32) = COPY $vgpr0
     %2:sgpr(s32) = G_FCONSTANT float 2.000000e+00
     %8:vgpr(s32) = COPY %2(s32)

diff  --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index d59ea23772b1..ac783fa72b9c 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -33,8 +33,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16(half addrspace(1)* %in0, half addrsp
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_add_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND
@@ -83,8 +82,7 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(half addrspace(1)* %in0, half ad
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    v_subrev_f16_e32 v0, 2.0, v1
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND
@@ -230,8 +228,7 @@ define amdgpu_kernel void @v_pack_b32.fneg(half addrspace(1)* %in0, half addrspa
 ; GISEL-NEXT:    v_add_f16_e32 v1, 2.0, v2
 ; GISEL-NEXT:    v_sub_f16_e32 v0, 0x8000, v0
 ; GISEL-NEXT:    v_sub_f16_e32 v1, 0x8000, v1
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GISEL-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GISEL-NEXT:    ;;#ASMSTART
 ; GISEL-NEXT:    ; use v0
 ; GISEL-NEXT:    ;;#ASMEND


        


More information about the llvm-commits mailing list